In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:

# Load the raw train/test CSVs. The dataset directory can be overridden with the
# DRS_DATA_DIR environment variable; the default is the original local path, so
# the notebook behaves as before when the variable is not set.
DATA_DIR = Path(os.environ.get("DRS_DATA_DIR", r"C:\Users\haris\major\DRS\DRS\dataset"))
train_df = pd.read_csv(DATA_DIR / "drugsComTrain_raw.csv", sep=',')
test_df = pd.read_csv(DATA_DIR / "drugsComTest_raw.csv", sep=',')

# Combine both files into a single frame, then drop rows that are missing any of
# 'review', 'rating' or 'usefulCount'. The chained (non in-place) form keeps this
# cell idempotent; row labels keep their gaps exactly as the in-place drop did.
train_test_df = pd.concat([train_df, test_df], ignore_index=True).dropna(
    subset=['review', 'rating', 'usefulCount']
)
train_test_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [3]:

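# Drop missing values
# NOTE: superseded -- missing values are already dropped from the combined
# train_test_df in the cell above; the per-frame calls below are kept disabled.
#train_df.dropna(subset=['review', 'rating', 'usefulCount'], inplace=True)
#test_df.dropna(subset=['review', 'rating', 'usefulCount'], inplace=True)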

In [4]:

# Generate sentiment labels from the numeric rating:
#   rating < 5        -> 0 (low ratings)
#   5 <= rating <= 6  -> 1 (middle ratings)
#   anything else     -> 2 (high ratings; treated as positive downstream)
def _rating_to_sentiment(rating):
    """Map a numeric rating to a sentiment class (0, 1 or 2).

    Range comparisons are used instead of membership in ``[5, 6]`` so that a
    fractional rating such as 5.5 lands in class 1 rather than falling through
    to class 2. Integer ratings are labelled exactly as before.
    """
    if rating < 5:
        return 0
    if rating <= 6:
        return 1
    return 2


train_test_df['sentiment'] = train_test_df['rating'].apply(_rating_to_sentiment)


In [5]:
# Hold out 20% of the reviews for validation; the seed is fixed so the split
# is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train_test_df['review'],
    train_test_df['sentiment'],
    test_size=0.2,
    random_state=42,
)


In [6]:
# Turn the raw review text into TF-IDF features. The vectorizer is fitted on the
# training reviews only; the validation reviews are transformed with that same
# fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [7]:
# Train the Logistic Regression model on the TF-IDF features.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases, and with the lbfgs solver and more than two classes the
# default already fits a multinomial model, so the fitted model is unchanged.
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)


In [8]:

# Score the model on the validation split: overall accuracy, then the
# per-class precision/recall/F1 report.
y_pred = log_reg.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_pred)
val_report = classification_report(y_val, y_pred)
print("Accuracy:", val_accuracy)
print("Classification Report:\n", val_report)


Accuracy: 0.7890637714179434
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.69      0.70     10722
           1       0.40      0.05      0.10      3945
           2       0.82      0.93      0.87     28346

    accuracy                           0.79     43013
   macro avg       0.64      0.56      0.55     43013
weighted avg       0.76      0.79      0.76     43013



In [29]:

# Restrict the data to reviews written for a single condition, keeping only the
# drug name, review text and useful count columns.
condition = "Birth Control"
is_condition = train_test_df['condition'] == condition
data_filtered = train_test_df.loc[is_condition, ['drugName', 'review', 'usefulCount']]



In [31]:
# Pull the three columns of the filtered frame out as parallel Python lists
# (same row order in each list).
reviews, drugs, useful_counts = (
    data_filtered[column].tolist()
    for column in ('review', 'drugName', 'usefulCount')
)

# Vectorise the reviews with the already-fitted TF-IDF vectorizer
reviews_tfidf = tfidf_vectorizer.transform(reviews)

# Predict a sentiment class for every filtered review
predictions = log_reg.predict(reviews_tfidf)


In [33]:

# Per-drug tally: number of reviews predicted positive (class 2) and the summed
# useful count of those positive reviews. Every drug gets an entry, even when
# none of its reviews are predicted positive.
drug_sentiment_data = {}

for drug, sentiment, useful_count in zip(drugs, predictions, useful_counts):
    stats = drug_sentiment_data.setdefault(
        drug, {'positive_count': 0, 'useful_count_sum': 0}
    )

    if sentiment != 2:  # only positive predictions contribute to the totals
        continue

    stats['positive_count'] += 1
    stats['useful_count_sum'] += useful_count



In [35]:
# Build the ranking table: one row per drug with its positive-review count and
# summed useful count, plus the drug name as a regular column.
# `columns=` pins the two metric columns so that an empty tally (no reviews
# matched the condition) still yields a frame with those columns; otherwise the
# sort below would raise KeyError on a column-less frame.
drug_summary_df = (
    pd.DataFrame.from_dict(
        drug_sentiment_data,
        orient='index',
        columns=['positive_count', 'useful_count_sum'],
    )
    .assign(drugName=lambda frame: frame.index)
    .reset_index(drop=True)
    # Rank by positive count first, then by useful count sum, both descending.
    .sort_values(by=['positive_count', 'useful_count_sum'], ascending=False)
)

In [37]:
# Take the first five rows of the sorted summary and print them as plain text.
top_drugs = drug_summary_df.iloc[:5]
print("Top Recommended Drugs Based on Positive Reviews and Useful Count:")
print(top_drugs)

Top Recommended Drugs Based on Positive Reviews and Useful Count:
    positive_count  useful_count_sum                           drugName
4             2400             18849                       Etonogestrel
10            2152             18716                     Levonorgestrel
21            1714             16086  Ethinyl estradiol / norethindrone
3             1461             10081                          Nexplanon
15            1212             11988   Ethinyl estradiol / norgestimate


In [39]:
# Rich display of the top-drugs table (first five rows).
top_drugs.head(n=5)

Unnamed: 0,positive_count,useful_count_sum,drugName
4,2400,18849,Etonogestrel
10,2152,18716,Levonorgestrel
21,1714,16086,Ethinyl estradiol / norethindrone
3,1461,10081,Nexplanon
15,1212,11988,Ethinyl estradiol / norgestimate
