In [None]:
import itertools
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict

class DrugBankProcessor:
    """Stream drug properties and drug-drug interactions out of a DrugBank XML file.

    The file is read incrementally with ``ElementTree.iterparse`` and every
    outermost ``drug`` element is released as soon as it has been processed,
    so memory use stays bounded by the size of a single drug record.

    Args:
        xml_file: Path to the DrugBank XML file to read.
    """

    # Column order shared by the interaction dicts and the CSV export.
    _INTERACTION_COLUMNS = [
        'drug_id',
        'drug_name',
        'interacting_drug_id',
        'interacting_drug_name',
        'description',
    ]

    def __init__(self, xml_file):
        # Honour the caller's path; it used to be silently replaced by a
        # hard-coded, machine-specific location.
        self.xml_file = xml_file
        self.ns = {'db': 'http://www.drugbank.ca'}  # DrugBank namespace

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _tag(self, name):
        """Return *name* qualified with the DrugBank namespace (Clark notation)."""
        return f"{{{self.ns['db']}}}{name}"

    def _find(self, parent, name):
        """Return the first direct child *name* of *parent*, else the first descendant.

        Direct children are preferred so that a same-named element buried in
        a nested structure cannot shadow the record's own field.  The
        descendant fallback keeps the previous ``.//`` lookup working for
        tags that are not direct children.
        """
        tag = self._tag(name)
        found = parent.find(tag)
        if found is None:
            found = parent.find(f".//{tag}")
        return found

    @staticmethod
    def _text_of(elem):
        """Return the text of *elem*, or ``None`` when *elem* is missing.

        When the element has no text of its own but has children that do
        (a pure container), the children's stripped texts are joined with
        ``"; "`` instead of returning the insignificant whitespace.
        """
        if elem is None:
            return None
        own_text = elem.text
        if own_text is None or not own_text.strip():
            child_texts = [
                child.text.strip()
                for child in elem
                if child.text and child.text.strip()
            ]
            if child_texts:
                return "; ".join(child_texts)
        return own_text

    def _iter_top_level_drugs(self):
        """Yield each outermost, fully parsed ``drug`` element, then free it.

        ``drug`` elements nested inside another ``drug`` are skipped.  The
        element is only valid until the consumer asks for the next one: it
        is cleared afterwards to release memory.
        """
        drug_tag = self._tag('drug')
        root = None
        open_drugs = 0  # nesting depth counted in `drug` elements only

        # Opening the file ourselves guarantees it is closed even when the
        # consumer abandons the generator early.
        with open(self.xml_file, 'rb') as source:
            for event, elem in ET.iterparse(source, events=('start', 'end')):
                if event == 'start':
                    if root is None:
                        root = elem
                    if elem.tag == drug_tag:
                        open_drugs += 1
                    continue

                if elem.tag != drug_tag:
                    continue
                open_drugs -= 1
                if open_drugs:
                    continue  # nested drug: belongs to the enclosing record

                # Children are only guaranteed to be complete on 'end'.
                yield elem

                # Drop the record's content and the root's reference to it.
                elem.clear()
                root.clear()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def extract_drug_interactions(self, chunk_size=1000):
        """Extract drug-drug interactions from the DrugBank XML.

        Args:
            chunk_size (int): Maximum number of interactions per yielded list.

        Yields:
            list[dict]: Interaction records with the keys ``drug_id``,
            ``drug_name``, ``interacting_drug_id``, ``interacting_drug_name``
            and ``description``.  A value is ``None`` when the corresponding
            XML element is absent.
        """
        interaction_tag = self._tag('drug-interaction')
        batch = []

        for drug in self._iter_top_level_drugs():
            drug_id = self._text_of(self._find(drug, 'drugbank-id'))
            drug_name = self._text_of(self._find(drug, 'name'))

            for node in drug.iter(interaction_tag):
                batch.append({
                    'drug_id': drug_id,
                    'drug_name': drug_name,
                    'interacting_drug_id': self._text_of(self._find(node, 'drugbank-id')),
                    'interacting_drug_name': self._text_of(self._find(node, 'name')),
                    'description': self._text_of(self._find(node, 'description')),
                })

                # Hand over a chunk as soon as it reaches the requested size.
                if len(batch) >= chunk_size:
                    yield batch
                    batch = []

        # Yield any remaining interactions
        if batch:
            yield batch

    def extract_drug_properties(self, properties_of_interest=None):
        """Extract basic drug properties, one row per outermost ``drug`` element.

        Args:
            properties_of_interest (list): List of property tags to extract
                                        (without namespace).  Defaults to
                                        drugbank-id, name, description,
                                        groups and mechanism-of-action.

        Returns:
            pandas.DataFrame: One column per requested property; a cell is
            ``None`` when the drug has no such element.
        """
        if properties_of_interest is None:
            properties_of_interest = ['drugbank-id', 'name', 'description',
                                    'groups', 'mechanism-of-action']

        # De-duplicate while keeping order so the columns are well defined
        # even when the file contains no drug at all.
        columns = list(dict.fromkeys(properties_of_interest))

        drugs = [
            {prop: self._text_of(self._find(drug, prop)) for prop in columns}
            for drug in self._iter_top_level_drugs()
        ]

        return pd.DataFrame(drugs, columns=columns)

    def save_interactions_to_csv(self, output_file, chunk_size=1000):
        """Process interactions and save them to a CSV file in chunks.

        Args:
            output_file: Destination path; an existing file is overwritten.
            chunk_size (int): Number of interactions written per batch.
        """
        columns = self._INTERACTION_COLUMNS
        header_written = False

        for chunk in self.extract_drug_interactions(chunk_size):
            pd.DataFrame(chunk, columns=columns).to_csv(
                output_file,
                index=False,
                mode='a' if header_written else 'w',
                header=not header_written,
            )
            header_written = True

        if not header_written:
            # No interactions at all: still leave a valid, header-only CSV
            # so downstream readers do not fail on a missing file.
            pd.DataFrame(columns=columns).to_csv(output_file, index=False)

    def get_interaction_statistics(self, chunk_size=1000):
        """Get basic statistics about drug interactions.

        Args:
            chunk_size (int): Chunk size used while streaming the file; it
                does not influence the result.

        Returns:
            dict: ``total_interactions`` (number of interaction records) and
            ``unique_drugs`` (number of distinct source ``drug_id`` values).
        """
        total_interactions = 0
        # A single set across all chunks: a drug whose interactions straddle
        # a chunk boundary must be counted once only.
        seen_drug_ids = set()

        for chunk in self.extract_drug_interactions(chunk_size):
            total_interactions += len(chunk)
            seen_drug_ids.update(record['drug_id'] for record in chunk)

        return {
            'total_interactions': total_interactions,
            'unique_drugs': len(seen_drug_ids),
        }

# Build the processor for the DrugBank export.
processor = DrugBankProcessor('drugbank.xml')

# Option 1: dump every interaction straight into a CSV file.
processor.save_interactions_to_csv('drug_interactions.csv')

# Option 2: stream the interactions and handle them one by one; the chunks
# are flattened lazily, so only one chunk is held in memory at a time.
for interaction in itertools.chain.from_iterable(processor.extract_drug_interactions()):
    print(f"Drug {interaction['drug_name']} interacts with {interaction['interacting_drug_name']}")