# 1.Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import xml.etree.ElementTree as ET
from collections import defaultdict
import spacy
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 2.Data Extraction 

In [2]:
def list_files_and_count(directory_path):
    """
    Lists all files and directories in a given path and counts the files.

    Args:
        directory_path (str): Path to the directory.

    Returns:
        list: List of files and directories.
        int: Number of files in the directory.
    """
    dir_list = os.listdir(directory_path)
    file_count = len([f for f in dir_list if os.path.isfile(os.path.join(directory_path, f))])

    print(f"Files and directories in '{directory_path}':")
    print(dir_list)
    print(f"Number of files in {directory_path}: {file_count}\n")

    return dir_list, file_count


class DDIExtractor:
    """
    Extractor class to parse and process DDI XML data from DrugBank and MedLine datasets.
    """

    def __init__(self):
        self.entities = defaultdict(list)
        self.pairs = []
        self.sentences = []

    def parse_medline_file(self, file_path):
        """Parse MedLine format XML files."""
        self._parse_file(file_path, is_medline=True)

    def parse_drugbank_file(self, file_path):
        """Parse DrugBank format XML files."""
        self._parse_file(file_path, is_medline=False)

    def _parse_file(self, file_path, is_medline):
        """Generic method to parse XML files based on their type."""
        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            document = (
                root.find('.//document_content/document') if is_medline else root
            )
            if document is not None:
                self._process_document(
                    document,
                    source_type='medline' if is_medline else 'drugbank',
                    file_name=os.path.basename(file_path),
                )
        except ET.ParseError as e:
            print(f"Error parsing file {file_path}: {e}")

    def _process_document(self, document, source_type, file_name):
        """Process a document element regardless of its source."""
        doc_id = document.get('id', '')

        for sentence in document.findall('.//sentence'):
            sent_id = sentence.get('id', '')
            sent_text = sentence.get('text', '')

            self.sentences.append({
                'doc_id': doc_id,
                'sent_id': sent_id,
                'text': sent_text,
                'source_type': source_type,
                'file_name': file_name,
            })

            # Extract entities
            for entity in sentence.findall('entity'):
                self.entities[entity.get('type', '')].append({
                    'doc_id': doc_id,
                    'sent_id': sent_id,
                    'entity_id': entity.get('id', ''),
                    'type': entity.get('type', ''),
                    'text': entity.get('text', ''),
                    'charOffset': entity.get('charOffset', ''),
                    'source_type': source_type,
                    'file_name': file_name,
                })

            # Extract DDI pairs
            for pair in sentence.findall('pair'):
                self.pairs.append({
                    'doc_id': doc_id,
                    'sent_id': sent_id,
                    'pair_id': pair.get('id', ''),
                    'e1': pair.get('e1', ''),
                    'e2': pair.get('e2', ''),
                    'ddi': pair.get('ddi', ''),
                    'source_type': source_type,
                    'file_name': file_name,
                })

    def process_directory(self, directory_path, file_type):
        """Process all XML files in a directory."""
        for filename in os.listdir(directory_path):
            if filename.endswith('.xml') and not filename.startswith('.'):
                file_path = os.path.join(directory_path, filename)
                if file_type == 'medline':
                    self.parse_medline_file(file_path)
                else:
                    self.parse_drugbank_file(file_path)

    def get_statistics(self):
        """Generate statistics about the extracted data."""
        return {
            'total_sentences': len(self.sentences),
            'total_pairs': len(self.pairs),
            'entity_counts': {etype: len(entities) for etype, entities in self.entities.items()},
            'positive_interactions': len([p for p in self.pairs if p['ddi'] == 'true']),
            'negative_interactions': len([p for p in self.pairs if p['ddi'] == 'false']),
        }

    def to_dataframes(self):
        """Convert extracted data to pandas DataFrames."""
        entities_df = pd.DataFrame([
            entity for entities in self.entities.values() 
            for entity in entities
        ])
        pairs_df = pd.DataFrame(self.pairs)
        sentences_df = pd.DataFrame(self.sentences)

        return entities_df, pairs_df, sentences_df


def main():
    # Define paths
    train_drugbank_path = "DDICorpus/Train/DrugBank"
    train_medline_path = "DDICorpus/Train/MedLine"

    # List files and count
    list_files_and_count(train_drugbank_path)
    list_files_and_count(train_medline_path)

    # Initialize the extractor
    extractor = DDIExtractor()

    # Process DrugBank files
    print(f"Processing DrugBank files from {train_drugbank_path}...")
    extractor.process_directory(train_drugbank_path, 'drugbank')

    # Process MedLine files
    print(f"Processing MedLine files from {train_medline_path}...")
    extractor.process_directory(train_medline_path, 'medline')

    # Display extraction statistics
    stats = extractor.get_statistics()
    print("\nExtraction Statistics:")
    print(f"Total sentences processed: {stats['total_sentences']}")
    print(f"Total DDI pairs found: {stats['total_pairs']}")
    print("\nEntity counts by type:")
    for etype, count in stats['entity_counts'].items():
        print(f"{etype}: {count}")
    print(f"\nPositive interactions: {stats['positive_interactions']}")
    print(f"Negative interactions: {stats['negative_interactions']}")

    # Save extracted data to CSV
    entities_df, pairs_df, sentences_df = extractor.to_dataframes()
    entities_df.to_csv('entities.csv', index=False)
    pairs_df.to_csv('pairs.csv', index=False)
    sentences_df.to_csv('sentences.csv', index=False)
    print("\nData saved to CSV files.")


if __name__ == "__main__":
    main()


Files and directories in 'DDICorpus/Train/DrugBank':
['19-norandrostenedione_ddi.xml', 'Abarelix_ddi.xml', 'Abatacept_ddi.xml', 'Abciximab_ddi.xml', 'Acamprosate_ddi.xml', 'Acarbose_ddi.xml', 'Acebutolol_ddi.xml', 'Acetazolamide_ddi.xml', 'Acetohydroxamic Acid_ddi.xml', 'Aciclovir_ddi.xml', 'Acitretin_ddi.xml', 'Adalimumab_ddi.xml', 'Adapalene_ddi.xml', 'Adefovir Dipivoxil_ddi.xml', 'Adenosine_ddi.xml', 'Adinazolam_ddi.xml', 'Agalsidase beta_ddi.xml', 'Albendazole_ddi.xml', 'Alclometasone_ddi.xml', 'Aldesleukin_ddi.xml', 'Alefacept_ddi.xml', 'Alemtuzumab_ddi.xml', 'Alendronate_ddi.xml', 'Alfentanil_ddi.xml', 'Alfuzosin_ddi.xml', 'Alglucosidase alfa_ddi.xml', 'Aliskiren_ddi.xml', 'Alitretinoin_ddi.xml', 'Allopurinol_ddi.xml', 'Almotriptan_ddi.xml', 'Alosetron_ddi.xml', 'Alprazolam_ddi.xml', 'Alprostadil_ddi.xml', 'Alteplase_ddi.xml', 'Altretamine_ddi.xml', 'Amantadine_ddi.xml', 'Amifostine_ddi.xml', 'Amiloride_ddi.xml', 'Aminocaproic Acid_ddi.xml', 'Aminoglutethimide_ddi.xml', 'Aminohip

# 3.Data Loading and Data Preparation

In [3]:
def load_nlp():
    try:
        return spacy.load('en_core_web_sm')
    except OSError:
        print("Downloading spaCy model...")
        import subprocess
        subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
        return spacy.load('en_core_web_sm')


def prepare_data():
    # Load the CSV files
    entities_df = pd.read_csv('entities_df.csv')
    sentences_df = pd.read_csv('sentences_df.csv')
    pairs_df = pd.read_csv('pairs_df.csv')
    
    # Rename columns for clarity before merging
    entities_df_drug1 = entities_df.copy()
    entities_df_drug2 = entities_df.copy()
    
    entities_df_drug1.columns = [
        f'drug1_{col}' if col not in ['doc_id', 'sent_id'] else col 
        for col in entities_df_drug1.columns
    ]
    entities_df_drug2.columns = [
        f'drug2_{col}' if col not in ['doc_id', 'sent_id'] else col 
        for col in entities_df_drug2.columns
    ]
    
    # Merge all dataframes
    merged_df = pairs_df.merge(
        sentences_df[['doc_id', 'sent_id', 'text']], 
        on=['doc_id', 'sent_id']
    ).merge(
        entities_df_drug1,
        left_on=['doc_id', 'sent_id', 'e1'],
        right_on=['doc_id', 'sent_id', 'drug1_entity_id']
    ).merge(
        entities_df_drug2,
        left_on=['doc_id', 'sent_id', 'e2'],
        right_on=['doc_id', 'sent_id', 'drug2_entity_id']
    )
    
    # Create enhanced feature text using entity types
    merged_df['combined_text'] = (
        'drug1_type: ' + merged_df['drug1_type'] + 
        ' drug1: ' + merged_df['drug1_text'] + 
        ' drug2_type: ' + merged_df['drug2_type'] + 
        ' drug2: ' + merged_df['drug2_text'] + 
        ' context: ' + merged_df['text']
    )
    
    # Convert ddi column to numeric
    merged_df['ddi'] = (merged_df['ddi'].map({'TRUE': 1, 'FALSE': 0})
                       if merged_df['ddi'].dtype == object 
                       else merged_df['ddi'])
    
    # Add entity type combination feature
    merged_df['type_combination'] = merged_df['drug1_type'] + '_' + merged_df['drug2_type']
    
    return merged_df

def analyze_entity_types(data):
    print("\nEntity Type Analysis:")
    print("\nDrug 1 Types Distribution:")
    print(data['drug1_type'].value_counts())
    print("\nDrug 2 Types Distribution:")
    print(data['drug2_type'].value_counts())
    print("\nMost Common Type Combinations:")
    print(data['type_combination'].value_counts().head())

# 4. Model Training and Evaluation

In [4]:
def train_model(data):
    # Create features using bag of words with bigrams
    vectorizer = CountVectorizer(
        max_features=1500,
        ngram_range=(1, 2),
        stop_words='english'
    )
    X = vectorizer.fit_transform(data['combined_text'])
    
    # Add type combination as a feature
    type_encoder = CountVectorizer()
    type_features = type_encoder.fit_transform(data['type_combination'])
    
    # Combine text and type features
    X = np.hstack([X.toarray(), type_features.toarray()])
    
    y = data['ddi']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Train model
    model = RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    
    # Evaluate
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    print("\nTraining Performance:")
    print(classification_report(y_train, train_pred))
    print("\nTest Performance:")
    print(classification_report(y_test, test_pred))
    
    # Print confusion matrix
    print("\nConfusion Matrix (Test Set):")
    print(confusion_matrix(y_test, test_pred))
    
    return model, vectorizer, type_encoder

# 5.Interaction Prediction(Example)

In [6]:
def predict_interaction(model, vectorizer, type_encoder, drug1, drug1_type, drug2, drug2_type, context):
    # Prepare text features
    combined_text = f"drug1_type: {drug1_type} drug1: {drug1} drug2_type: {drug2_type} drug2: {drug2} context: {context}"
    text_features = vectorizer.transform([combined_text])
    
    # Prepare type combination features
    type_combination = f"{drug1_type}_{drug2_type}"
    type_features = type_encoder.transform([type_combination])
    
    # Combine features
    features = np.hstack([text_features.toarray(), type_features.toarray()])
    
    # Make prediction
    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0]
    
    return {
        'has_interaction': bool(prediction),
        'confidence': float(max(probability)),
        'drug1': drug1,
        'drug2': drug2,
        'drug1_type': drug1_type,
        'drug2_type': drug2_type
    }

# Main Funtion

In [7]:
def main():
    try:
        # Prepare data
        print("Loading and preparing data...")
        data = prepare_data()
        print(f"Processed {len(data)} drug pairs")
        
        # Analyze entity types
        analyze_entity_types(data)
        
        # Train model
        print("\nTraining model...")
        model, vectorizer, type_encoder = train_model(data)
        
        # Example prediction
        print("\nExample Predictions:")
        test_cases = [
            ("Aspirin", "drug", "Warfarin", "drug", 
             "The patient is taking both medications for blood thinning."),
            ("Amoxicillin", "drug", "Ibuprofen", "brand", 
             "The patient was prescribed both medications.")
        ]
        
        for drug1, drug1_type, drug2, drug2_type, context in test_cases:
            result = predict_interaction(
                model, vectorizer, type_encoder,
                drug1, drug1_type, drug2, drug2_type, context
            )
            print(f"\nDrug 1: {result['drug1']} (Type: {result['drug1_type']})")
            print(f"Drug 2: {result['drug2']} (Type: {result['drug2_type']})")
            print(f"Interaction: {'Yes' if result['has_interaction'] else 'No'}")
            print(f"Confidence: {result['confidence']:.2%}")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        if 'data' in locals():
            print("\nAvailable columns:")
            print(data.columns.tolist())

if __name__ == "__main__":
    main()

Loading and preparing data...
Processed 26005 drug pairs

Entity Type Analysis:

Drug 1 Types Distribution:
drug1_type
drug      16195
group      6961
brand      2744
drug_n      105
Name: count, dtype: int64

Drug 2 Types Distribution:
drug2_type
drug      17165
group      6214
brand      2436
drug_n      190
Name: count, dtype: int64

Most Common Type Combinations:
type_combination
drug_drug      12081
group_drug      3557
group_group     2899
drug_group      2823
brand_drug      1467
Name: count, dtype: int64

Training model...

Training Performance:
              precision    recall  f1-score   support

       False       1.00      0.97      0.98     17773
        True       0.83      0.98      0.90      3031

    accuracy                           0.97     20804
   macro avg       0.91      0.97      0.94     20804
weighted avg       0.97      0.97      0.97     20804


Test Performance:
              precision    recall  f1-score   support

       False       0.92      0.93      