In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
from collections import defaultdict

class DrugBankProcessor:
    def __init__(self, xml_file):
        self.xml_file = xml_file
        self.ns = {'db': 'http://www.drugbank.ca'}  # DrugBank namespace

    def extract_drug_interactions(self, chunk_size=1000):
        """
        Extract drug-drug interactions with additional properties.
        Yields chunks of interactions to manage memory.
        """
        interactions = []
        current_drug = None

        for event, elem in ET.iterparse(self.xml_file, events=('start', 'end')):
            if event == 'start' and elem.tag == f"{{{self.ns['db']}}}drug":
                # Get the current drug's basic info
                drug_id = elem.find(f".//{{{self.ns['db']}}}drugbank-id")
                drug_name = elem.find(f".//{{{self.ns['db']}}}name")
                current_drug = {
                    'drug_id': drug_id.text if drug_id is not None else None,
                    'drug_name': drug_name.text if drug_name is not None else None,
                    'mechanism_of_action': self._extract_mechanism_of_action(elem),
                    'pharmacodynamics': self._extract_pharmacodynamics(elem),
                    'metabolism': self._extract_metabolism(elem),
                }

            elif event == 'end' and elem.tag == f"{{{self.ns['db']}}}drug-interaction":
                if current_drug:
                    # Extract interaction information
                    interaction = {
                        'drug_id': current_drug['drug_id'],
                        'drug_name': current_drug['drug_name'],
                        'interacting_drug_id': elem.find(f".//{{{self.ns['db']}}}drugbank-id").text,
                        'interacting_drug_name': elem.find(f".//{{{self.ns['db']}}}name").text,
                        'description': elem.find(f".//{{{self.ns['db']}}}description").text,
                        'mechanism_of_action': current_drug['mechanism_of_action'],
                        'pharmacodynamics': current_drug['pharmacodynamics'],
                        'metabolism': current_drug['metabolism'],
                    }
                    interactions.append(interaction)

                # Yield chunks when they reach the specified size
                if len(interactions) >= chunk_size:
                    yield interactions
                    interactions = []

                elem.clear()

        # Yield any remaining interactions
        if interactions:
            yield interactions

    def _extract_targets(self, drug_elem):
        """
        Extract target information for a drug.
        """
        targets = []
        for target in drug_elem.findall(f".//{{{self.ns['db']}}}targets/{{{self.ns['db']}}}target"):
            target_id = target.find(f".//{{{self.ns['db']}}}id")
            target_name = target.find(f".//{{{self.ns['db']}}}name")
            if target_id is not None and target_name is not None:
                targets.append(f"{target_id.text}:{target_name.text}")
        return "; ".join(targets)

    def _extract_enzymes(self, drug_elem):
        """
        Extract enzyme information for a drug.
        """
        enzymes = []
        for enzyme in drug_elem.findall(f".//{{{self.ns['db']}}}enzymes/{{{self.ns['db']}}}enzyme"):
            enzyme_id = enzyme.find(f".//{{{self.ns['db']}}}id")
            enzyme_name = enzyme.find(f".//{{{self.ns['db']}}}name")
            if enzyme_id is not None and enzyme_name is not None:
                enzymes.append(f"{enzyme_id.text}:{enzyme_name.text}")
        return "; ".join(enzymes)

    def _extract_transporters(self, drug_elem):
        """
        Extract transporter information for a drug.
        """
        transporters = []
        for transporter in drug_elem.findall(f".//{{{self.ns['db']}}}transporters/{{{self.ns['db']}}}transporter"):
            transporter_id = transporter.find(f".//{{{self.ns['db']}}}id")
            transporter_name = transporter.find(f".//{{{self.ns['db']}}}name")
            if transporter_id is not None and transporter_name is not None:
                transporters.append(f"{transporter_id.text}:{transporter_name.text}")
        return "; ".join(transporters)

    def _extract_mechanism_of_action(self, drug_elem):
        """
        Extract mechanism of action for a drug.
        """
        moa_elem = drug_elem.find(f".//{{{self.ns['db']}}}pharmacodynamics/{{{self.ns['db']}}}mechanism-of-action")
        return moa_elem.text if moa_elem is not None else None

    def _extract_pharmacodynamics(self, drug_elem):
        """
        Extract pharmacodynamics for a drug.
        """
        pd_elem = drug_elem.find(f".//{{{self.ns['db']}}}pharmacodynamics")
        return pd_elem.text if pd_elem is not None else None

    def _extract_metabolism(self, drug_elem):
        """
        Extract metabolism for a drug.
        """
        metabolism_elem = drug_elem.find(f".//{{{self.ns['db']}}}metabolism")
        return metabolism_elem.text if metabolism_elem is not None else None

    def save_interactions_to_csv(self, output_file, chunk_size=1000):
        """
        Process interactions and save to CSV in chunks.
        """
        first_chunk = True

        for chunk in self.extract_drug_interactions(chunk_size):
            df = pd.DataFrame(chunk)

            if first_chunk:
                df.to_csv(output_file, index=False, mode='w')
                first_chunk = False
            else:
                df.to_csv(output_file, index=False, mode='a', header=False)

    def get_interaction_statistics(self):
        """
        Get basic statistics about drug interactions.
        """
        stats = defaultdict(int)

        for chunk in self.extract_drug_interactions():
            stats['total_interactions'] += len(chunk)
            stats['unique_drugs'] += len(set(x['drug_id'] for x in chunk))

        return dict(stats)


# Initialize the processor
processor = DrugBankProcessor(r'C:\Users\Kevin Nathanael\Music\DDI Prediction\full database.xml')

# Save all interactions to CSV
processor.save_interactions_to_csv('testing2.csv')

# Print interaction statistics
print(processor.get_interaction_statistics())

{'total_interactions': 2855848, 'unique_drugs': 7418}


In [3]:
x = pd.read_csv(r'C:\Users\Kevin Nathanael\Music\DDI Prediction\testing2.csv')
x

  x = pd.read_csv(r'C:\Users\Kevin Nathanael\Music\DDI Prediction\testing2.csv')


Unnamed: 0,drug_id,drug_name,interacting_drug_id,interacting_drug_name,description,targets,enzymes,transporters,mechanism_of_action,pharmacodynamics,metabolism
0,DB00001,Lepirudin,DB06605,Apixaban,Apixaban may increase the anticoagulant activi...,,,,,Lepirudin is a recombinant hirudin that acts a...,"As a polypeptide, lepirudin is expected to be ..."
1,DB00001,Lepirudin,DB06695,Dabigatran etexilate,Dabigatran etexilate may increase the anticoag...,,,,,Lepirudin is a recombinant hirudin that acts a...,"As a polypeptide, lepirudin is expected to be ..."
2,DB00001,Lepirudin,DB01254,Dasatinib,The risk or severity of bleeding and hemorrhag...,,,,,Lepirudin is a recombinant hirudin that acts a...,"As a polypeptide, lepirudin is expected to be ..."
3,DB00001,Lepirudin,DB01609,Deferasirox,The risk or severity of gastrointestinal bleed...,,,,,Lepirudin is a recombinant hirudin that acts a...,"As a polypeptide, lepirudin is expected to be ..."
4,DB00001,Lepirudin,DB01586,Ursodeoxycholic acid,The risk or severity of bleeding and bruising ...,,,,,Lepirudin is a recombinant hirudin that acts a...,"As a polypeptide, lepirudin is expected to be ..."
...,...,...,...,...,...,...,...,...,...,...,...
2855843,DB19413,Influenza A Virus A/Thailand/8/2022 IVR-237 (H...,DB13509,Aloxiprin,The risk or severity of Reye's syndrome can be...,,,,,,
2855844,DB19413,Influenza A Virus A/Thailand/8/2022 IVR-237 (H...,DB13538,Guacetisal,The risk or severity of Reye's syndrome can be...,,,,,,
2855845,DB19413,Influenza A Virus A/Thailand/8/2022 IVR-237 (H...,DB13612,Carbaspirin calcium,The risk or severity of Reye's syndrome can be...,,,,,,
2855846,DB19413,Influenza A Virus A/Thailand/8/2022 IVR-237 (H...,DB14006,Choline salicylate,The risk or severity of Reye's syndrome can be...,,,,,,
