In [9]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer


In [10]:
# Raw training split of the drug review data (tab-separated file).
train_path = r"C:\Users\likit\Desktop\drug_decision_support\data\drugLibTrain_raw.tsv"

# Load the whole file into memory; columns are separated by tabs.
df = pd.read_csv(train_path, delimiter="\t")

# Preview the first rows to confirm the file parsed as expected.
df.head()


Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [11]:
# Reduce the raw frame to what the profiling step needs and normalise it.
#
# Written as a single chain instead of `dropna(inplace=True)` on a column
# subset: the in-place call mutates a slice of the raw frame (which can trigger
# pandas' SettingWithCopyWarning), while the chain always builds a new frame.
df = (
    # Keep only required columns
    df[['urlDrugName', 'sideEffectsReview', 'sideEffects']]
    # Drop rows where any of the three kept columns is missing
    .dropna()
    # Rename drug column to a standard name
    .rename(columns={'urlDrugName': 'drug_name'})
    # Normalize drug names: lower-case, no surrounding whitespace
    .assign(drug_name=lambda frame: frame['drug_name'].str.lower().str.strip())
    # Reset index so row labels are 0..n-1 again (used for TF-IDF alignment)
    .reset_index(drop=True)
)




In [12]:
# Display the reduced frame (drug_name, sideEffectsReview, sideEffects) to
# verify the column selection and the drug-name normalisation.
df

Unnamed: 0,drug_name,sideEffectsReview,sideEffects
0,enalapril,"cough, hypotension , proteinuria, impotence , ...",Mild Side Effects
1,ortho-tri-cyclen,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...",Severe Side Effects
2,ponstel,Heavier bleeding and clotting than normal.,No Side Effects
3,prilosec,"Constipation, dry mouth and some mild dizzines...",Mild Side Effects
4,lyrica,I felt extremely drugged and dopey. Could not...,Severe Side Effects
...,...,...,...
3027,vyvanse,"Restless legs at night, insomnia, headache (so...",Mild Side Effects
3028,zoloft,"Weight gain, extreme tiredness during the day,...",Extremely Severe Side Effects
3029,climara,Constant issues with the patch not staying on....,Moderate Side Effects
3030,trileptal,"Dizziness, fatigue, nausea",Mild Side Effects


In [13]:
def clean_text(text) -> str:
    """Normalise a free-text review into lower-case, space-separated words.

    Steps:
      1. lower-case the text;
      2. delete apostrophes so contractions stay one word ("don't" -> "dont");
      3. replace every other character that is not a-z or whitespace with a
         space, so words joined only by punctuation are kept apart
         ("nausea,vomiting" -> "nausea vomiting", not "nauseavomiting");
      4. collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. a float NaN coming from
        pandas) is treated as an empty review.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing usable remains.
    """
    if not isinstance(text, str):
        # Missing values would otherwise fail on .lower()
        return ""

    text = text.lower()
    # Straight and typographic apostrophes are removed, not turned into a gap
    text = re.sub(r"['\u2019]", "", text)
    # Digits and remaining punctuation act as word separators
    text = re.sub(r"[^a-z\s]", " ", text)
    # Normalise spacing introduced above
    return re.sub(r"\s+", " ", text).strip()

# Run every side-effect review through clean_text and store the result in a
# new column next to the original text.
df['clean_text'] = [clean_text(review) for review in df['sideEffectsReview']]

# Show the frame to compare raw and cleaned text side by side.
df

Unnamed: 0,drug_name,sideEffectsReview,sideEffects,clean_text
0,enalapril,"cough, hypotension , proteinuria, impotence , ...",Mild Side Effects,cough hypotension proteinuria impotence rena...
1,ortho-tri-cyclen,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...",Severe Side Effects,heavy cycle cramps hot flashes fatigue long la...
2,ponstel,Heavier bleeding and clotting than normal.,No Side Effects,heavier bleeding and clotting than normal
3,prilosec,"Constipation, dry mouth and some mild dizzines...",Mild Side Effects,constipation dry mouth and some mild dizziness...
4,lyrica,I felt extremely drugged and dopey. Could not...,Severe Side Effects,i felt extremely drugged and dopey could not ...
...,...,...,...,...
3027,vyvanse,"Restless legs at night, insomnia, headache (so...",Mild Side Effects,restless legs at night insomnia headache somet...
3028,zoloft,"Weight gain, extreme tiredness during the day,...",Extremely Severe Side Effects,weight gain extreme tiredness during the day i...
3029,climara,Constant issues with the patch not staying on....,Moderate Side Effects,constant issues with the patch not staying on ...
3030,trileptal,"Dizziness, fatigue, nausea",Mild Side Effects,dizziness fatigue nausea


In [14]:
# TF-IDF over single words and two-word phrases, with English stop words
# removed and the vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=3000)

# Learn the vocabulary and vectorise the cleaned reviews in one step;
# X has one row per row of df.
X = vectorizer.fit_transform(df['clean_text'])

# Term for each column of X, in column order.
feature_names = vectorizer.get_feature_names_out()


In [15]:
# Inspect the vocabulary returned by vectorizer.get_feature_names_out().
feature_names

array(['abated', 'abdomen', 'abdominal', ..., 'zoloft', 'zombie',
       'zyrtec'], shape=(3000,), dtype=object)

In [14]:
# Build one profile per drug:
#   "common_effects"        - the highest-weighted TF-IDF terms over the drug's
#                             reviews, ordered from lowest to highest weight
#   "severity_distribution" - share of each `sideEffects` label for the drug
MIN_REVIEWS_PER_DRUG = 10   # drugs with fewer reviews are skipped
TOP_TERMS_PER_DRUG = 15     # upper bound on terms kept per drug

drug_profiles = {}

# Positional row numbers of every drug, computed in a single pass.
# Positions (not index labels) are what the rows of X are addressed by, so the
# alignment stays correct even if df's index is not a fresh 0..n-1 range.
rows_by_drug = df.groupby('drug_name', sort=False).indices

# Iterate in order of first appearance so the dict order matches the data.
for drug in df['drug_name'].dropna().unique():
    row_positions = rows_by_drug[drug]

    # Minimum data threshold
    if len(row_positions) < MIN_REVIEWS_PER_DRUG:
        continue

    # Mean TF-IDF weight of every term across this drug's reviews.
    # np.asarray(...).ravel() works whether the sparse mean comes back as an
    # np.matrix or as a plain ndarray.
    mean_scores = np.asarray(X[row_positions, :].mean(axis=0)).ravel()

    # Strongest terms, ascending by weight. Terms with zero weight never occur
    # in this drug's reviews and must not be reported as effects.
    top_indices = [
        i for i in mean_scores.argsort()[-TOP_TERMS_PER_DRUG:]
        if mean_scores[i] > 0
    ]

    common_effects = [feature_names[i] for i in top_indices]

    # Relative frequency of each severity label among this drug's reviews.
    severity_dist = (
        df['sideEffects']
        .iloc[row_positions]
        .value_counts(normalize=True)
        .to_dict()
    )

    drug_profiles[drug] = {
        "common_effects": common_effects,
        "severity_distribution": severity_dist
    }


In [15]:
# Summarise what was learned before writing it to disk.
print(f"Number of drugs learned: {len(drug_profiles)}")
print(f"Sample drugs: {list(drug_profiles)[:10]}")

# Persist the profile dictionary; the call's return value is shown as the
# cell output.
joblib.dump(
    value=drug_profiles,
    filename=r"C:\Users\likit\Desktop\drug_decision_support\models\drug_profiles.pkl",
)


Number of drugs learned: 99
Sample drugs: ['ortho-tri-cyclen', 'prilosec', 'lyrica', 'propecia', 'vyvanse', 'elavil', 'xanax', 'claritin', 'effexor-xr', 'neurontin']


['C:\\Users\\likit\\Desktop\\drug_decision_support\\models\\drug_profiles.pkl']

In [16]:
# Spot-check the profile stored under the first key of drug_profiles.
drug_profiles[list(drug_profiles)[0]]


{'common_effects': ['drive',
  'sex',
  'really',
  'got',
  'sex drive',
  'breast',
  'minimal',
  'bleeding',
  'control',
  'hair',
  'birth',
  'birth control',
  'pregnant',
  'month',
  'moody'],
 'severity_distribution': {'Mild Side Effects': 0.5333333333333333,
  'Severe Side Effects': 0.2,
  'No Side Effects': 0.13333333333333333,
  'Moderate Side Effects': 0.13333333333333333}}