In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load dataset
train_df = pd.read_csv(r"C:\Users\haris\major\DRS\DRS\dataset\drugsComTrain_raw.csv", sep=',')
test_df = pd.read_csv(r"C:\Users\haris\major\DRS\DRS\dataset\drugsComTest_raw.csv", sep=',')
# Drop rows with missing values in 'review' or 'rating'
train_test_df = pd.concat([train_df, test_df], ignore_index=True)
train_test_df.dropna(subset=['review', 'rating', 'usefulCount'], inplace=True)
train_test_df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [3]:
# Drop rows with missing values
train_df = train_df.dropna(axis=0)
test_df = test_df.dropna(axis=0)

In [4]:
# Tokenizer and sequence preprocessing
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_df['review'])

train_sequences = tokenizer.texts_to_sequences(train_test_df['review'])


max_len = 100
train_data = pad_sequences(train_sequences, maxlen=max_len)


In [5]:
# Convert rating labels to indices (negative, neutral, positive)
train_labels = np.zeros((len(train_test_df), 3))
train_labels[:, 0] = train_test_df['rating'] < 5
train_labels[:, 1] = (train_test_df['rating'] == 5) | (train_test_df['rating'] == 6)
train_labels[:, 2] = train_test_df['rating'] > 6

train_labels = np.argmax(train_labels, axis=1)


In [6]:
# CNN model creation
embedding_dim = 50
model = Sequential()
model.add(Embedding(10000, embedding_dim))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(3, activation='softmax'))


In [7]:
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [8]:
# Train the model
history = model.fit(train_data, train_labels, epochs=30, batch_size=128, validation_split=0.2)


Epoch 1/30
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 14ms/step - accuracy: 0.7352 - loss: 0.6827 - val_accuracy: 0.8156 - val_loss: 0.4928
Epoch 2/30
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 14ms/step - accuracy: 0.8375 - loss: 0.4358 - val_accuracy: 0.8325 - val_loss: 0.4492
Epoch 3/30
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 14ms/step - accuracy: 0.8836 - loss: 0.3178 - val_accuracy: 0.8567 - val_loss: 0.3994
Epoch 4/30
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.9266 - loss: 0.2127 - val_accuracy: 0.8743 - val_loss: 0.3991
Epoch 5/30
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 14ms/step - accuracy: 0.9602 - loss: 0.1281 - val_accuracy: 0.8906 - val_loss: 0.4170
Epoch 6/30
[1m1345/1345[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 15ms/step - accuracy: 0.9776 - loss: 0.0779 - val_accuracy: 0.8984 - val_loss: 0.4670
Epoc

In [9]:
# Function for sentiment analysis
def sentiment_analysis_direct(review):
    sequence = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=100)
    prediction = model.predict(sequence)[0]
    sentiment = np.argmax(prediction)
    if sentiment == 0:
        return "Negative"
    elif sentiment == 1:
        return "Neutral"
    elif sentiment == 2:
        return "Positive"


In [27]:
# Filter data for a specific condition
condition = "Birth Control"
data_filtered = train_test_df[train_test_df['condition'] == condition][['drugName', 'review', 'usefulCount']]


In [29]:
# Extract drug names, reviews, and useful counts
drug_list = data_filtered['drugName'].tolist()
review_list = data_filtered['review'].tolist()
useful_count_list = data_filtered['usefulCount'].tolist()


In [None]:
# Analyze sentiment and aggregate useful counts
drug_sentiment_data = {}

for i, review in enumerate(review_list):
    sentiment = sentiment_analysis_direct(review)
    drug_name = drug_list[i]
    useful_count = useful_count_list[i]
    
    if drug_name not in drug_sentiment_data:
        drug_sentiment_data[drug_name] = {'positive_count': 0, 'useful_count_sum': 0}
    
    if sentiment == "Positive":
        drug_sentiment_data[drug_name]['positive_count'] += 1
        drug_sentiment_data[drug_name]['useful_count_sum'] += useful_count

# Convert drug sentiment data to a DataFrame for sorting
drug_summary_df = pd.DataFrame.from_dict(drug_sentiment_data, orient='index')
drug_summary_df['drugName'] = drug_summary_df.index
drug_summary_df.reset_index(drop=True, inplace=True)

# Sort drugs by positive count and useful count sum
drug_summary_df.sort_values(by=['positive_count', 'useful_count_sum'], ascending=False, inplace=True)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33

In [13]:
# Display the top recommended drugs
top_drugs = drug_summary_df.head(5)
print("Top Recommended Drugs Based on Positive Reviews and Useful Count:")
top_drugs.head()


Top Recommended Drugs Based on Positive Reviews and Useful Count:


Unnamed: 0,positive_count,useful_count_sum,drugName
0,153,3875,Acetaminophen / butalbital / caffeine
4,106,3474,Fioricet
2,52,481,Acetaminophen / dichloralphenazone / isomethep...
3,23,705,Naproxen
11,23,469,Aspirin / butalbital / caffeine / codeine
