In [1]:
import pandas as pd
import numpy as np
import re
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


In [2]:
train_path = r"C:\Users\likit\Desktop\drug_decision_support\data\drugLibTrain_raw.tsv"
df_train = pd.read_csv(train_path, sep="\t")

df_train.head()


Unnamed: 0.1,Unnamed: 0,urlDrugName,rating,effectiveness,sideEffects,condition,benefitsReview,sideEffectsReview,commentsReview
0,2202,enalapril,4,Highly Effective,Mild Side Effects,management of congestive heart failure,slowed the progression of left ventricular dys...,"cough, hypotension , proteinuria, impotence , ...","monitor blood pressure , weight and asses for ..."
1,3117,ortho-tri-cyclen,1,Highly Effective,Severe Side Effects,birth prevention,Although this type of birth control has more c...,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...","I Hate This Birth Control, I Would Not Suggest..."
2,1146,ponstel,10,Highly Effective,No Side Effects,menstrual cramps,I was used to having cramps so badly that they...,Heavier bleeding and clotting than normal.,I took 2 pills at the onset of my menstrual cr...
3,3947,prilosec,3,Marginally Effective,Mild Side Effects,acid reflux,The acid reflux went away for a few months aft...,"Constipation, dry mouth and some mild dizzines...",I was given Prilosec prescription at a dose of...
4,1951,lyrica,2,Marginally Effective,Severe Side Effects,fibromyalgia,I think that the Lyrica was starting to help w...,I felt extremely drugged and dopey. Could not...,See above


In [3]:
df_train = df_train[['sideEffectsReview', 'sideEffects']]
df_train.dropna(inplace=True)
df_train = df_train.reset_index(drop=True)


In [4]:
def map_severity_5class(label):
    label = label.lower()

    if "no" in label:
        return 0
    elif "mild" in label:
        return 1
    elif "moderate" in label:
        return 2
    elif "extreme" in label:
        return 4
    elif "severe" in label:
        return 3
    else:
        return None


In [5]:
df_train['label'] = df_train['sideEffects'].apply(map_severity_5class)
df_train = df_train[df_train['label'].notna()]
df_train['label'] = df_train['label'].astype(int)


In [6]:
severity_map = {
    0: "No Side Effects",
    1: "Mild",
    2: "Moderate",
    3: "Severe",
    4: "Extremely Severe"
}


In [7]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    return text

df_train['clean_text'] = df_train['sideEffectsReview'].apply(clean_text)


In [8]:
df_train

Unnamed: 0,sideEffectsReview,sideEffects,label,clean_text
0,"cough, hypotension , proteinuria, impotence , ...",Mild Side Effects,1,cough hypotension proteinuria impotence rena...
1,"Heavy Cycle, Cramps, Hot Flashes, Fatigue, Lon...",Severe Side Effects,3,heavy cycle cramps hot flashes fatigue long la...
2,Heavier bleeding and clotting than normal.,No Side Effects,0,heavier bleeding and clotting than normal
3,"Constipation, dry mouth and some mild dizzines...",Mild Side Effects,1,constipation dry mouth and some mild dizziness...
4,I felt extremely drugged and dopey. Could not...,Severe Side Effects,3,i felt extremely drugged and dopey could not ...
...,...,...,...,...
3027,"Restless legs at night, insomnia, headache (so...",Mild Side Effects,1,restless legs at night insomnia headache somet...
3028,"Weight gain, extreme tiredness during the day,...",Extremely Severe Side Effects,4,weight gain extreme tiredness during the day i...
3029,Constant issues with the patch not staying on....,Moderate Side Effects,2,constant issues with the patch not staying on ...
3030,"Dizziness, fatigue, nausea",Mild Side Effects,1,dizziness fatigue nausea


In [21]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    stop_words='english'
)

X_train = vectorizer.fit_transform(df_train['clean_text'])
y_train = df_train['label']


In [22]:
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    multi_class='auto'
)

model.fit(X_train, y_train)




0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [23]:
y_pred = model.predict(X_train)
print(classification_report(y_train, y_pred))


              precision    recall  f1-score   support

           0       0.88      0.89      0.89       858
           1       0.89      0.77      0.83      1016
           2       0.80      0.84      0.82       614
           3       0.82      0.95      0.88       369
           4       0.85      0.99      0.91       175

    accuracy                           0.85      3032
   macro avg       0.85      0.89      0.87      3032
weighted avg       0.86      0.85      0.85      3032



In [24]:
feature_names = vectorizer.get_feature_names_out()
coef = model.coef_

def top_keywords(class_index, n=10):
    top_idx = np.argsort(coef[class_index])[-n:]
    return [feature_names[i] for i in top_idx]


In [25]:
top_keywords(4)  # Extremely Severe signals


['swollen',
 'depression',
 'benefits',
 'suicide',
 'said',
 'shock',
 'died',
 'extreme',
 'pain',
 'severe']

In [26]:
joblib.dump(
    model,
    r"C:\Users\likit\Desktop\drug_decision_support\models\severity_model.pkl"
)
joblib.dump(
    vectorizer,
    r"C:\Users\likit\Desktop\drug_decision_support\models\tfidf_vectorizer.pkl"
)


['C:\\Users\\likit\\Desktop\\drug_decision_support\\models\\tfidf_vectorizer.pkl']