### Import Libraries


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
import plotly.express as px
import numpy as np
import pandas as pd
from mlxtend.plotting import plot_learning_curves
from sklearn.preprocessing import LabelEncoder


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from nltk.stem import WordNetLemmatizer, SnowballStemmer

## Data Preprocessing

In [None]:
# train data
df_train = pd.read_csv("/content/drive/My Drive/data/drugsComTest_raw.csv")

# test data
df_test = pd.read_csv("/content/drive/My Drive/data/drugsComTest_raw.csv")

In [None]:
# Removing nan values
df_train = df_train.dropna()
df_test = df_test.dropna()

In [None]:
df_train.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,163740,Mirtazapine,Depression,"""I&#039;ve tried a few antidepressants over th...",10,28-Feb-12,22
1,206473,Mesalamine,"Crohn's Disease, Maintenance","""My son has Crohn&#039;s disease and has done ...",8,17-May-09,17
2,159672,Bactrim,Urinary Tract Infection,"""Quick reduction of symptoms""",9,29-Sep-17,3
3,39293,Contrave,Weight Loss,"""Contrave combines drugs that were used for al...",9,5-Mar-17,35
4,97768,Cyclafem 1 / 35,Birth Control,"""I have been on this birth control for one cyc...",9,22-Oct-15,4


#### Show Class Distibution 

In [None]:
def plot_bar_chart(df):
    # analyze the condition labels
    counts_series = df.condition.value_counts()
    counts_df = pd.DataFrame(counts_series)
    counts_df.reset_index(level=0, inplace=True)

    number_of_classes(df)

    fig = px.bar(counts_df, x="index", y="condition", orientation='v',
              height=400,
              title='xc')
    fig.show()

def number_of_classes(df):
    print("Number of classes: ", len(df["condition"].unique()))

plot_bar_chart(df_train)

Number of classes:  708


In [None]:
# Keeping classes which have more than 20 values in them
index_counts = df_train["condition"].value_counts()[df_train.condition.value_counts() >= 20].index
df_train = df_train[df_train["condition"].isin(index_counts)]
number_of_classes(df_train)

Number of classes:  214


#### Balancing Classes
Undersampling all classes with samples greater than 200 to 200

In [None]:
# undersampling all classes with samples greater than 200 to 200
condition_over200 = df_train["condition"].value_counts()[df_train.condition.value_counts() >= 200].index

for condition in condition_over200:
    # randomly shuffle the samples
    condition_samples = df_train[df_train["condition"]==condition]
    condition_samples = condition_samples.sample(frac=1).reset_index(drop=True)

    # extract only 200
    condition_samples = condition_samples[:200]

    df_train = df_train[df_train["condition"]!=condition]
    # put it back
    df_train = pd.concat([df_train, condition_samples], ignore_index=True)

### Filtering Labels and removing alpha numeric values.


In [None]:
def filter_labels(labels):
    labels = labels.tolist()
    labels_truth = []
    for label in labels:
        if label[0].isdigit():
            labels_truth.append(False)
        else:
            labels_truth.append(True)
    return labels_truth

df_train = df_train[filter_labels(df_train["condition"])]
df_test = df_test[filter_labels(df_test["condition"])]
print("Train ", number_of_classes(df_train))
print("Test ", number_of_classes(df_test))

Number of classes:  210
Train  None
Number of classes:  664
Test  None


### Only keeping the classes that are in the training set, in to the test set

In [None]:
df_test = df_test[df_test["condition"].isin(df_train["condition"])]
number_of_classes(df_test)

Number of classes:  210


### Revised Class Distribution

In [None]:
plot_bar_chart(df_train)

Number of classes:  210


### Encoding Labels

In [None]:
# combining drug name with review
df_train["combined"] = df_train["drugName"].str.lower() + ": " + df_train["review"].str.lower()
df_test["combined"] = df_test["drugName"].str.lower() + ": " + df_test["review"].str.lower()

# df_train["combined"] = df_train["review"].str.lower()
# df_test["combined"] = df_test["review"].str.lower()

In [None]:
labelencoder = LabelEncoder()
df_train["Label"] = labelencoder.fit_transform(df_train["condition"])
df_test["Label"] = labelencoder.fit_transform(df_test["condition"])
len(labelencoder.classes_)

210

### Removing Stepwords and stemming

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def filter_data(reviews):
  
  """
  Filter the synopsis of training and testing df.
  This function removes stop and stem words from the synopsis
  :param synopsis_series:
  :return:
  """
  # covert to lower case
  reviews = reviews.str.lower()
  stop = stopwords.words('english')
  stemmer = SnowballStemmer("english")
  wordnet_lemmatizer = WordNetLemmatizer()
  # remove stop words
  series = reviews.apply(
      lambda x: ' '.join([word for word in x.split() if word not in stop]))

  # remove stem words
  series = series.apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

  # lematize stem words
#         series = series.apply(
#             lambda x: ' '.join([wordnet_lemmatizer.lemmatize(word, pos="v")
#                                 for word in x.split()]))
  return series
    
df_train["combined"] = filter_data(df_train["combined"])
df_test["combined"] = filter_data(df_test["combined"])

In [None]:
# shuffle the training dataframe and first work with it
df_train = df_train.sample(frac=1)
X = df_train['combined']
Y = df_train['Label']

## Creating Model

### Defining a function for K-Fold Cross Validation for multiple classifiers

In [None]:
def cross_val_multiple_classifiers(X, Y):
  classifiers = [MultinomialNB(), SGDClassifier(loss="modified_huber"), RandomForestClassifier(n_estimators=100), KNeighborsClassifier(n_neighbors=5)]
  labels = ['Multinomial Naive Bayes', 'SGD Classifier', 'Random Forest',  'KNN']
  clf_cv_mean = []
  clf_cv_std = []
  for clf, label in zip(classifiers, labels): 
      scores = cross_val_score(clf, X, Y, cv=4, scoring='accuracy')
      print ("Accuracy: %.2f (+/- %.2f) [%s]" %(scores.mean(), scores.std(), label))



### Creating Count Vectorizer Object and transforming whole training dataset from it

In [None]:
count_vectorizer = CountVectorizer(ngram_range=(1,2))
X_count_vec = count_vectorizer.fit_transform(X)

#### Calling K Fold validation of multiple classifiers on Count Vectorized Dataset

In [None]:
# calling multiple classifiers on the vectorized features
cross_val_multiple_classifiers(X_count_vec, Y)

Accuracy: 0.47 (+/- 0.00) [Multinomial Naive Bayes]
Accuracy: 0.57 (+/- 0.01) [SGD Classifier]
Accuracy: 0.65 (+/- 0.00) [Random Forest]
Accuracy: 0.08 (+/- 0.00) [KNN]


### Creating TFIDF Vectorizer

In [None]:
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_tf_idf_vec = tf_idf_vectorizer.fit_transform(X)

#### Calling K Fold validation of multiple classifiers on TFIDF Vectorized Dataset

In [None]:
cross_val_multiple_classifiers(X_tf_idf_vec, Y)

Accuracy: 0.47 (+/- 0.00) [Multinomial Naive Bayes]
Accuracy: 0.71 (+/- 0.00) [SGD Classifier]
Accuracy: 0.66 (+/- 0.00) [Random Forest]
Accuracy: 0.53 (+/- 0.01) [KNN]


## Ensemble Model

Combining Results from SGDClassifier of Count Vectorizer Model and TFIDF Vectorizer Model using Soft Voting

### Performing train Test Split

In [None]:
# divide the dataset in to train and test set
random_seed = 63445
x1_train, x1_val, y1_train, y1_val = train_test_split(df_train['combined'], df_train['Label'], shuffle=True, test_size = 0.30, random_state=random_seed, stratify=df_train['Label'])

### Vectorizing Train and Validate Objects

In [None]:
def transform_data_in_count_vec_and_tfidf(x_data, count_vectorizer, tf_idf_vectorizer):
  return count_vectorizer.transform(x_data), tf_idf_vectorizer.transform(x_data)

In [None]:
x1_train_count_vec, x1_train_tf_idf = transform_data_in_count_vec_and_tfidf(x1_train,
                                                                            count_vectorizer, 
                                                                            tf_idf_vectorizer)

x1_val_count_vec, x1_val_tf_idf = transform_data_in_count_vec_and_tfidf(x1_val,
                                                                            count_vectorizer, 
                                                                            tf_idf_vectorizer)

In [None]:
sgd_classifier1 = SGDClassifier(loss="modified_huber")
sgd_classifier2 = SGDClassifier(loss="modified_huber")

# fitting model 1 on count vectorizer dataset
sgd_classifier1.fit(x1_train_count_vec, y1_train)

# fitting model 2 on tf idf vectorizer object
sgd_classifier2.fit(x1_train_tf_idf, y1_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def get_ensemble_preds(clfs, x1, x2, y):  
  print("Accuracy of SGD Classifier on Validation Set from Count Vectorizer Object: ", 
        accuracy_score(y, clfs[0].predict(x1)))
  print("Micro F-1 Score of SGD Classifier on Validation Set from Count Vectorizer Object: ", 
        f1_score(y, clfs[0].predict(x1), average="macro"))
  
  print("Accuracy of SGD Classifier Validation Set from TF-IDF Vectorizer Object: ", 
        accuracy_score(y, clfs[1].predict(x2)))
  print("Micro F-1 Score of SGD Classifier Validation Set from TF-IDF Vectorizer Object: ", 
        f1_score(y, clfs[1].predict(x2), average="macro"))
  

  probs_combined = (clfs[0].predict_proba(x1) + 
                    clfs[1].predict_proba(x2)) / 2
  print("Ensemble Accuracy Score: ", accuracy_score(y, 
                                                    np.argmax(probs_combined, 
                                                              axis=1)))
  print("Ensemble Micro F-1 Score: ", f1_score(y, 
                                                    np.argmax(probs_combined, 
                                                              axis=1), average="macro"))


In [None]:
# getting ensemble accuracy on validation set
get_ensemble_preds([sgd_classifier1, sgd_classifier2], x1_val_count_vec, 
                   x1_val_tf_idf, y1_val)

Accuracy of SGD Classifier on Validation Set from Count Vectorizer Object:  0.563508064516129
Micro F-1 Score of SGD Classifier on Validation Set from Count Vectorizer Object:  0.5182146003429337
Accuracy of SGD Classifier Validation Set from TF-IDF Vectorizer Object:  0.7091733870967742
Micro F-1 Score of SGD Classifier Validation Set from TF-IDF Vectorizer Object:  0.6598764082844812
Ensemble Accuracy Score:  0.7037970430107527
Ensemble Micro F-1 Score:  0.6626992789034075


In [None]:

from sklearn import metrics
auc = metrics.roc_auc_score(y1_val, sgd_classifier2.predict_proba(x1_val_tf_idf), 
                            multi_class="ovr")
print(auc)

0.9692289158293079


### Getting Results of Ensemble Model on Test Set


In [None]:
x_test_count_vec, x_test_tf_idf = transform_data_in_count_vec_and_tfidf(df_test["combined"],
                                                                            count_vectorizer, 
                                                                            tf_idf_vectorizer)
get_ensemble_preds([sgd_classifier1, sgd_classifier2], x_test_count_vec, 
                   x_test_tf_idf, df_test["Label"].tolist())

Accuracy of SGD Classifier on Validation Set from Count Vectorizer Object:  0.6270098978913188
Micro F-1 Score of SGD Classifier on Validation Set from Count Vectorizer Object:  0.6270098978913188
Accuracy of SGD Classifier Validation Set from TF-IDF Vectorizer Object:  0.7435155119126795
Micro F-1 Score of SGD Classifier Validation Set from TF-IDF Vectorizer Object:  0.7435155119126795
Ensemble Accuracy Score:  0.7369821212002661
Ensemble Micro F-1 Score:  0.736982121200266
