In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions

# Notebook-wide display configuration.
# Apply matplotlib's 'ggplot' style sheet to all figures.
plt.style.use('ggplot')
# Have the inline backend emit figures in SVG format.
%config InlineBackend.figure_format = 'svg'
# Show matplotlib figures inline in the notebook output.
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

In [2]:
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, make_scorer
from sklearn.datasets import fetch_20newsgroups
import gensim

### Import cleaned datasets

In [11]:
# Load the cleaned training reviews from the local CSV export.
train_csv = '/Users/jsong/Documents/durg-recommendation/df_train.csv'
df_train = pd.read_csv(train_csv)

In [12]:
# Print column dtypes and non-null counts for the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148387 entries, 0 to 148386
Data columns (total 5 columns):
drugName       148387 non-null object
condition      148387 non-null object
rating         148387 non-null float64
usefulCount    148387 non-null int64
review         148386 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 5.7+ MB


In [13]:
# Drop rows that contain missing values and renumber the index 0..n-1.
#
# `DataFrame.dropna()` returns a new frame; without assigning it back,
# `df_train` silently keeps the row whose `review` is null.
#
# The fresh RangeIndex matters because the later `pd.concat(..., axis=1)`
# pairs rows by index label, so a gap or an extra row shifts every
# following drug/condition onto the wrong topic row.
#
# NOTE(review): df_topic has exactly as many rows as df_train has non-null
# reviews, so it is presumably one row per cleaned review in the same
# order -- confirm against the notebook that produced df_dominant_topic.csv.
df_train = df_train.dropna().reset_index(drop=True)
df_train

Unnamed: 0,drugName,condition,rating,usefulCount,review
0,Guanfacine,ADHD,8.0,192,son halfway fourth intuniv became concerned be...
1,Lybrel,Birth Control,5.0,17,used take another contraceptive pill cycle hap...
2,Ortho Evra,Birth Control,8.0,10,first time using form birth control glad went ...
3,Cialis,Benign Prostatic Hyperplasia,2.0,43,nd started work rock hard erection however exp...
4,Levonorgestrel,Emergency Contraception,1.0,5,pulled cummed bit took plan b hour later took ...
...,...,...,...,...,...
148382,Junel 1.5 / 30,Birth Control,6.0,0,would second junel birth control year changed ...
148383,Metoclopramide,Nausea/Vomiting,1.0,34,given surgey immediately became anxious could ...
148384,Orencia,Rheumatoid Arthritis,2.0,35,limited improvement month developed bad rash m...
148385,Thyroid desiccated,Underactive Thyroid,10.0,79,thyroid medication year spent first synthroid ...


In [9]:
# Load the dominant-topic table from the local CSV export.
topic_csv = '/Users/jsong/Documents/durg-recommendation/df_dominant_topic.csv'
df_topic = pd.read_csv(topic_csv)

In [10]:
# Print column dtypes and non-null counts for the topic frame.
df_topic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148386 entries, 0 to 148385
Data columns (total 4 columns):
Dominant_Topic       148386 non-null float64
Perc_Contribution    148386 non-null float64
Topic_Keywords       148386 non-null object
review               148386 non-null object
dtypes: float64(2), object(2)
memory usage: 4.5+ MB


In [14]:
# Place the review metadata and the topic columns side by side.
# Rows are matched on index label (not on review text); join='inner'
# keeps only the labels present in both frames.
frames_side_by_side = [df_train, df_topic]
df = pd.concat(frames_side_by_side, axis='columns', join='inner')

In [16]:
# Remove the raw review text; both source frames contributed a `review`
# column and every column with that label is dropped.
df = df.drop(columns='review')

In [17]:
# Display the combined frame (drug/condition/rating columns plus topic columns).
df

Unnamed: 0,drugName,condition,rating,usefulCount,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,Guanfacine,ADHD,8.0,192,2.0,0.6443,"side effect, no side, year ago, panic attack"
1,Lybrel,Birth Control,5.0,17,3.0,0.6844,"birth control, mood swing, weight gain, side e..."
2,Ortho Evra,Birth Control,8.0,10,3.0,0.8630,"birth control, mood swing, weight gain, side e..."
3,Cialis,Benign Prostatic Hyperplasia,2.0,43,0.0,0.5926,"blood pressure, started taking, lost lb, feel ..."
4,Levonorgestrel,Emergency Contraception,1.0,5,1.0,0.9370,"felt like, first time, yeast infection, took pill"
...,...,...,...,...,...,...,...
148381,Tekturna,High Blood Pressure,7.0,18,3.0,0.9193,"birth control, mood swing, weight gain, side e..."
148382,Junel 1.5 / 30,Birth Control,6.0,0,0.0,0.6094,"blood pressure, started taking, lost lb, feel ..."
148383,Metoclopramide,Nausea/Vomiting,1.0,34,2.0,0.6066,"side effect, no side, year ago, panic attack"
148384,Orencia,Rheumatoid Arthritis,2.0,35,1.0,0.4715,"felt like, first time, yeast infection, took pill"


In [53]:
# Number of distinct drugs reviewed for each condition, largest first.
drug_per_condition = (
    df.groupby('condition')['drugName']
    .nunique()
    .sort_values(ascending=False)
)
drug_per_condition

condition
Pain                                200
Birth Control                       172
High Blood Pressure                 140
Acne                                117
Depression                          105
                                   ... 
Sexual Dysfunction, SSRI Induced     11
Nausea/Vomiting, Postoperative       11
Hyperhidrosis                        11
Ovarian Cance                        11
Keratosis                            11
Name: drugName, Length: 185, dtype: int64

In [48]:
# The ten conditions with the most distinct drugs.
drug_per_condition.head(10)

condition
Pain                    200
Birth Control           172
High Blood Pressure     140
Acne                    117
Depression              105
Rheumatoid Arthritis     98
Diabetes, Type 2         89
Allergic Rhinitis        88
Osteoarthritis           80
Bipolar Disorde          80
Name: drugName, dtype: int64

In [49]:
# Names of the ten conditions with the most distinct drugs.
condition_1 = drug_per_condition.head(10).index
condition_1

Index(['Pain', 'Birth Control', 'High Blood Pressure', 'Acne', 'Depression',
       'Rheumatoid Arthritis', 'Diabetes, Type 2', 'Allergic Rhinitis',
       'Osteoarthritis', 'Bipolar Disorde'],
      dtype='object', name='condition')

In [51]:
# Keep only the rows whose condition is one of the top 10 conditions.
in_top_conditions = df['condition'].isin(condition_1)
df_top_10 = df.loc[in_top_conditions]
df_top_10.head()

Unnamed: 0,drugName,condition,rating,usefulCount,Dominant_Topic,Perc_Contribution,Topic_Keywords
1,Lybrel,Birth Control,5.0,17,3.0,0.6844,"birth control, mood swing, weight gain, side e..."
2,Ortho Evra,Birth Control,8.0,10,3.0,0.863,"birth control, mood swing, weight gain, side e..."
5,Aripiprazole,Bipolar Disorde,10.0,32,0.0,0.5667,"blood pressure, started taking, lost lb, feel ..."
7,Ethinyl estradiol / levonorgestrel,Birth Control,8.0,1,3.0,0.5531,"birth control, mood swing, weight gain, side e..."
9,L-methylfolate,Depression,10.0,54,2.0,0.8252,"side effect, no side, year ago, panic attack"


In [61]:
# Share of each dominant topic within every top-10 condition;
# normalize=True turns the per-condition counts into proportions.
topics_by_condition = df_top_10.groupby('condition')['Dominant_Topic']
top_10 = topics_by_condition.value_counts(normalize=True)

In [62]:
# Display the normalized dominant-topic counts per condition.
top_10

condition             Dominant_Topic
Acne                  3.0               0.696791
                      2.0               0.122995
                      1.0               0.099465
                      0.0               0.080749
Allergic Rhinitis     0.0               0.456158
                      2.0               0.426601
                      1.0               0.091626
                      3.0               0.025616
Bipolar Disorde       2.0               0.657858
                      0.0               0.246492
                      3.0               0.067353
                      1.0               0.028297
Birth Control         3.0               0.775175
                      1.0               0.119787
                      2.0               0.060419
                      0.0               0.044619
Depression            2.0               0.696259
                      0.0               0.224997
                      3.0               0.046897
                      1.0       

### ==> visualization in Tableau