In [1]:
# importing required libraries
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# reading the drug dataset
final_df = pd.read_csv(r"C:\Users\patel\OneDrive\Desktop\AI\Project\DataSet\DiseaseDrug_dataset.csv")
final_df.head()

Unnamed: 0,Drug,Disease,Review,Rating,UsefulCount,Symptoms
0,Sulfamethoxazole / trimethoprim,urinary tract infection,"""I have bad side effects from all antibiotics,...",1,17,"[' burning_micturition', ' bladder_discomfort'..."
1,Levofloxacin,urinary tract infection,"""This last Monday I was detected by my doctor ...",3,24,"[' burning_micturition', ' bladder_discomfort'..."
2,Sulfamethoxazole / trimethoprim,urinary tract infection,"""I am currently suffering from recurring cysti...",1,7,"[' burning_micturition', ' bladder_discomfort'..."
3,Nitrofurantoin,urinary tract infection,"""I have been taking 4 a day for last 6 days, 2...",5,10,"[' burning_micturition', ' bladder_discomfort'..."
4,Macrobid,urinary tract infection,"""I wish I would have read these reviews before...",1,28,"[' burning_micturition', ' bladder_discomfort'..."


In [3]:
# initializing the instance of the sentiment analyzer
sentiment_analyser = SentimentIntensityAnalyzer()

In [4]:
# creating a list of reviews which can be an input for vader analysis.
list_of_reviews = list(final_df.Review)
sentiments = []
positive = []
negative = []
neutral = []
compound = []

In [5]:
# getting the polarity score of sentiment for the rewiews 
for rew in list_of_reviews:
  compoundS = sentiment_analyser.polarity_scores(rew)["compound"]
  positiveS = sentiment_analyser.polarity_scores(rew)["pos"]
  neutralS = sentiment_analyser.polarity_scores(rew)["neu"]
  negativeS = sentiment_analyser.polarity_scores(rew)["neg"]

  positive.append(positiveS)
  negative.append(negativeS)
  neutral.append(neutralS)
  compound.append(compoundS)
  sentiments.append({"Review":rew,
                     "Positive": positiveS,
                     "Negative": neutralS,
                     "Neutral": negativeS,
                     "Compound": compoundS})

nlp_df = pd.DataFrame.from_dict(sentiments)

In [6]:
nlp_df.head()

Unnamed: 0,Review,Positive,Negative,Neutral,Compound
0,"""I have bad side effects from all antibiotics,...",0.044,0.85,0.106,-0.8384
1,"""This last Monday I was detected by my doctor ...",0.054,0.854,0.092,-0.7825
2,"""I am currently suffering from recurring cysti...",0.075,0.694,0.231,-0.9471
3,"""I have been taking 4 a day for last 6 days, 2...",0.07,0.642,0.288,-0.9669
4,"""I wish I would have read these reviews before...",0.059,0.747,0.195,-0.9593


In [7]:
# adding the sentiment analysis columns to the final dataset
final_df["Positive"] = positive
final_df["Negative"] = negative
final_df["Neutral"] = neutral
final_df["Compound"] = compound
final_df["Sentiment"] = ''

In [8]:
final_df.head()

Unnamed: 0,Drug,Disease,Review,Rating,UsefulCount,Symptoms,Positive,Negative,Neutral,Compound,Sentiment
0,Sulfamethoxazole / trimethoprim,urinary tract infection,"""I have bad side effects from all antibiotics,...",1,17,"[' burning_micturition', ' bladder_discomfort'...",0.044,0.106,0.85,-0.8384,
1,Levofloxacin,urinary tract infection,"""This last Monday I was detected by my doctor ...",3,24,"[' burning_micturition', ' bladder_discomfort'...",0.054,0.092,0.854,-0.7825,
2,Sulfamethoxazole / trimethoprim,urinary tract infection,"""I am currently suffering from recurring cysti...",1,7,"[' burning_micturition', ' bladder_discomfort'...",0.075,0.231,0.694,-0.9471,
3,Nitrofurantoin,urinary tract infection,"""I have been taking 4 a day for last 6 days, 2...",5,10,"[' burning_micturition', ' bladder_discomfort'...",0.07,0.288,0.642,-0.9669,
4,Macrobid,urinary tract infection,"""I wish I would have read these reviews before...",1,28,"[' burning_micturition', ' bladder_discomfort'...",0.059,0.195,0.747,-0.9593,


In [9]:
final_df.loc[final_df['Compound'] >= 0.05, 'Sentiment'] = 'Positive'
final_df.loc[final_df['Compound'] <= -0.05, 'Sentiment'] = 'Negative'
final_df["Sentiment"].replace('', 'Neutral', inplace = True)

In [10]:
# deleting the positive, negative, neutral and compound columns.
final_df = final_df.drop(columns = ["Positive", "Negative", "Neutral", "Compound"])


In [11]:
final_df = final_df[["Disease", "Drug", "Symptoms", "Review", "Sentiment", "Rating", "UsefulCount"]]
final_df.head()

Unnamed: 0,Disease,Drug,Symptoms,Review,Sentiment,Rating,UsefulCount
0,urinary tract infection,Sulfamethoxazole / trimethoprim,"[' burning_micturition', ' bladder_discomfort'...","""I have bad side effects from all antibiotics,...",Negative,1,17
1,urinary tract infection,Levofloxacin,"[' burning_micturition', ' bladder_discomfort'...","""This last Monday I was detected by my doctor ...",Negative,3,24
2,urinary tract infection,Sulfamethoxazole / trimethoprim,"[' burning_micturition', ' bladder_discomfort'...","""I am currently suffering from recurring cysti...",Negative,1,7
3,urinary tract infection,Nitrofurantoin,"[' burning_micturition', ' bladder_discomfort'...","""I have been taking 4 a day for last 6 days, 2...",Negative,5,10
4,urinary tract infection,Macrobid,"[' burning_micturition', ' bladder_discomfort'...","""I wish I would have read these reviews before...",Negative,1,28


In [12]:
# storing the final dataset into a csv file.
final_df.to_csv('C:/Users/patel/OneDrive/Desktop/AI/Project/DataSet/Sentiment_Analysis.csv',index=False)

In [13]:
# reading the dataset with sentiment analysis of reviews
df = pd.read_csv(r"C:\Users\patel\OneDrive\Desktop\AI\Project\DataSet\Sentiment_Analysis.csv")
df.head()

Unnamed: 0,Disease,Drug,Symptoms,Review,Sentiment,Rating,UsefulCount
0,urinary tract infection,Sulfamethoxazole / trimethoprim,"[' burning_micturition', ' bladder_discomfort'...","""I have bad side effects from all antibiotics,...",Negative,1,17
1,urinary tract infection,Levofloxacin,"[' burning_micturition', ' bladder_discomfort'...","""This last Monday I was detected by my doctor ...",Negative,3,24
2,urinary tract infection,Sulfamethoxazole / trimethoprim,"[' burning_micturition', ' bladder_discomfort'...","""I am currently suffering from recurring cysti...",Negative,1,7
3,urinary tract infection,Nitrofurantoin,"[' burning_micturition', ' bladder_discomfort'...","""I have been taking 4 a day for last 6 days, 2...",Negative,5,10
4,urinary tract infection,Macrobid,"[' burning_micturition', ' bladder_discomfort'...","""I wish I would have read these reviews before...",Negative,1,28


In [14]:
#Sorting the data based on drug name
df = df.sort_values(['Drug'])

In [15]:
# function to calculate the weighted average
def calc_weighted_avg(group, avg_name, weight_name):
    a = group[avg_name]
    b = group[weight_name]
    try:
        return (a * b).sum() / b.sum()
    except ZeroDivisionError:
        return a.mean()

In [16]:
# creating a groups for diseases and their drugs based on the ratings and thier count 
df.groupby(["Disease", "Drug"]).apply(calc_weighted_avg, "Rating", "UsefulCount")

Disease                  Drug                           
acne                     Absorica                            6.692308
                         Acanya                              7.000000
                         Accutane                            8.798081
                         Acnex                              10.000000
                         Aczone                              8.584971
                                                              ...    
urinary tract infection  Sulfamethoxazole / trimethoprim     5.780838
                         Trimethoprim                        3.029851
                         Unasyn                              7.000000
                         Uribel                              8.967213
                         Vibramycin                         10.000000
Length: 305, dtype: float64

In [17]:
df1 = pd.DataFrame(df.groupby(["Drug"]).apply(calc_weighted_avg, "Rating", "UsefulCount").reset_index())

In [18]:
df1 = df1.rename(columns={0: "Rating_Weighted_Avg"})

In [19]:
df1.head()

Unnamed: 0,Drug,Rating_Weighted_Avg
0,Absorica,6.692308
1,Acanya,7.0
2,Accutane,8.798081
3,Acetaminophen / aspirin / caffeine,7.995652
4,Acetaminophen / aspirin / caffeine / salicylamide,10.0


In [20]:
# combining the drug dataset with the weighted drug dataset based on similar drug
new_df = pd.merge(df1, df, on='Drug')
new_df.head()

Unnamed: 0,Drug,Rating_Weighted_Avg,Disease,Symptoms,Review,Sentiment,Rating,UsefulCount
0,Absorica,6.692308,acne,"[' skin_rash', ' pus_filled_pimples', ' blackh...","""Absorica worked right away. After the second ...",Positive,4,6
1,Absorica,6.692308,acne,"[' skin_rash', ' pus_filled_pimples', ' blackh...","""My son who has tried almost every available e...",Negative,9,7
2,Acanya,7.0,acne,"[' skin_rash', ' pus_filled_pimples', ' blackh...","""I&#039;ve always been hesitant to use acne me...",Positive,9,9
3,Acanya,7.0,acne,"[' skin_rash', ' pus_filled_pimples', ' blackh...","""My dermatologist gave me Acanya to use in the...",Negative,4,15
4,Acanya,7.0,acne,"[' skin_rash', ' pus_filled_pimples', ' blackh...","""I was breaking out badly on the sides of my c...",Negative,10,5


In [21]:
print("No. of unique drugs: ", new_df['Drug'].nunique())
new_df['Drug'].unique()

No. of unique drugs:  286


array(['Absorica', 'Acanya', 'Accutane',
       'Acetaminophen / aspirin / caffeine',
       'Acetaminophen / aspirin / caffeine / salicylamide', 'Aciphex',
       'Acitretin', 'Acnex', 'Aczone', 'Adalimumab', 'Adapalene',
       'Adapalene / benzoyl peroxide', 'Adoxa', 'Aldactone',
       'Almotriptan', 'Altabax', 'Altace',
       'Aluminum hydroxide / magnesium hydroxide / simethicone', 'Amerge',
       'Ammonium lactate / halobetasol', 'Amnesteem', 'Amoxicillin',
       'Amoxicillin / clavulanate', 'Amoxil', 'Amrix', 'Aspirin',
       'Aspirin / butalbital / caffeine', 'Atovaquone',
       'Atovaquone / proguanil', 'Atralin', 'Augmentin', 'Avelox',
       'Axert', 'Axid', 'Axid AR', 'Azelaic acid', 'Azelex',
       'Azithromycin', 'Bactrim', 'Bactrim DS', 'Bayer Aspirin',
       'BenzEFoam', 'BenzEFoam Ultra', 'Benzaclin', 'Benzoyl peroxide',
       'Benzoyl peroxide / clindamycin',
       'Benzoyl peroxide / erythromycin', 'Benzoyl peroxide / sulfur',
       'Betamethasone / calcip

In [22]:
print("No. of unique diseases: ", new_df['Disease'].nunique())
new_df['Disease'].unique()

No. of unique diseases:  12


array(['acne', 'migraine', 'gerd', 'psoriasis', 'heart attack',
       'urinary tract infection', 'pneumonia', 'malaria', 'hepatitis c',
       'hyperthyroidism', 'gastroenteritis', 'hepatitis b'], dtype=object)

In [23]:
#Sorting dataset and grouping by disease
new_df = new_df.sort_values(['Disease','Rating_Weighted_Avg'],ascending=False, ignore_index=True).groupby('Disease').head()
new_df.head()

Unnamed: 0,Drug,Rating_Weighted_Avg,Disease,Symptoms,Review,Sentiment,Rating,UsefulCount
0,Cefixime,10.0,urinary tract infection,"[' burning_micturition', ' bladder_discomfort'...","""My daughter was born with urinary reflux, the...",Negative,10,7
1,Doribax,10.0,urinary tract infection,"[' burning_micturition', ' bladder_discomfort'...","""It is proved itself it is an excellent drug o...",Positive,10,1
2,Doripenem,10.0,urinary tract infection,"[' burning_micturition', ' bladder_discomfort'...","""It is proved itself it is an excellent drug o...",Positive,10,1
3,Lactobacillus acidophilus,10.0,urinary tract infection,"[' burning_micturition', ' bladder_discomfort'...","""This has been a miracle for me, and it was by...",Negative,10,189
4,Macrodantin,10.0,urinary tract infection,"[' burning_micturition', ' bladder_discomfort'...","""Started experiecing pain peeing and went for ...",Positive,10,28


In [24]:
# storing the dataset contaning the calculated weighted average into a csv file
new_df.to_csv('C:/Users/patel/OneDrive/Desktop/AI/Project/DataSet/Drug_Recommendation_data.csv',index=False)