In [72]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.utils import shuffle
from nltk.corpus import stopwords
import nltk.stem as stm # Import stem class from nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

## Reading training and validation dataset

In [55]:
df_train = pd.read_csv(r'C:\Users\hp\Desktop\SMM4H_DATASETS\trn_cls.csv' , header = None )
df_train = df_train.rename(columns = { 0 : "ADR" ,  1 : "Tweet"} )
df_train.head()

Unnamed: 0,ADR,Tweet
0,1.0,do any of my friends who take pristiq have pro...
1,1.0,humira helped my crohn s but then respiratory...
2,1.0,I feel siiiiiiiiiiiiiiick. Damn you venlafaxine.
3,1.0,it is totally stupid withdrawing from temazep...
4,1.0,monique thinks the grapefruit i ate amp latuda...


In [56]:
df_val = pd.read_csv(r'C:\Users\hp\Desktop\SMM4H_DATASETS\val_cls.csv' , header = None )
df_val = df_val.rename(columns = { 0 : "ADR" ,  1 : "Tweet"} )
df_val.head()

Unnamed: 0,ADR,Tweet
0,1.0,did not know Lamotrigine was addictive Stoppe...
1,1.0,i used to dream every night without fail on f...
2,1.0,tamiflu has a side effect which includes vomi...
3,1.0,DOC for Panic disorders alprazolam addictive a...
4,1.0,taking entyvio now for crohn is since remicad...


## Preprocessing

In [58]:
## Removing stop words
stop = stopwords.words('english')
df_train['cleaned'] = df_train['Tweet'].apply(lambda x: ' '.join( [ word for word in x.split() if word not in (stop) ] ))
df_val['cleaned'] = df_val['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


In [59]:
## Stemming 
stemmer = stm.PorterStemmer()
df_train['cleaned'] = df_train['cleaned'].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split(" ")]))
df_val['cleaned'] = df_val['cleaned'].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split(" ")]))

## Countvectorizer

In [60]:

count_vect = CountVectorizer(ngram_range=(1,3) , stop_words='english')

count_vectorized_train = count_vect.fit_transform(df_train.cleaned)
count_vectorized_val = count_vect.transform(df_val.cleaned)

## TF-IDF Vectorizer

In [61]:
tfidf_vect = TfidfVectorizer(stop_words='english', use_idf=True)

tfidf_vectorized_train = tfidf_vect.fit_transform(df_train.cleaned)
tfidf_vectorized_val = tfidf_vect.transform(df_val.cleaned)

In [63]:
print(count_vectorized_train.shape)
print(count_vectorized_val.shape)

(5974, 75761)
(3460, 75761)


In [64]:
print(tfidf_vectorized_train.shape)
print(tfidf_vectorized_val.shape)

(5974, 8846)
(3460, 8846)


In [65]:
X_train_count = count_vectorized_train
y_train_count = df_train.ADR

X_test_count = count_vectorized_val
y_test_count = df_val.ADR

In [88]:
X_train_tfidf = tfidf_vectorized_train
y_train_tfidf = df_train.ADR

X_test_tfidf = tfidf_vectorized_val
y_test_tfidf = df_val.ADR

In [66]:
X_train_count.shape

(5974, 75761)

In [67]:
len(y_train_count)

5974

In [68]:
X_test_count.shape

(3460, 75761)

In [70]:
len(y_test_count)

3460

## Models

## 1. Multinommial NB

In [82]:
## GRID SEARCH FOR COUNT VECTORIZER
parameters = {'alpha':[0.01, 0.1, 1.0, 10.0, 100.0]}
nb_count = MultinomialNB()
nb_grid_count = GridSearchCV( nb_count , param_grid = parameters , scoring='f1' , cv = 5 )
nb_grid_count.fit(X_train_count, y_train_count)
print(nb_grid_count.best_params_)
print(nb_grid_count.cv_results_['mean_test_score'])

{'alpha': 1.0}
[0.81024334 0.81860755 0.82762594 0.80876471 0.77690775]


In [86]:
nb_count = MultinomialNB( alpha = 1.0)
nb_count.fit( X_train_count , y_train_count)
predictions_count = nb_count.predict(X_test_count)
print(accuracy_score(y_test_count, predictions_count))
print(precision_score(y_test_count ,predictions_count, average='binary'))
print(recall_score(y_test_count ,predictions_count, average='binary'))
print(f1_score(y_test_count ,predictions_count, average='binary'))

0.7063583815028902
0.23813169984686064
0.9367469879518072
0.3797313797313797


In [95]:
### GRID SEARCH FOR TFIDF
parameters = {'alpha':[0.01, 0.1, 1.0, 10.0, 100.0]}
nb_tfidf = MultinomialNB()
nb_grid_tfidf = GridSearchCV( nb_tfidf , param_grid = parameters , scoring='f1' , cv = 5)
nb_grid_tfidf.fit(X_train_tfidf , y_train_tfidf )
print(nb_grid_tfidf.best_params_)
print(nb_grid_tfidf.cv_results_['mean_test_score'])

{'alpha': 1.0}
[0.79501209 0.80651422 0.81010657 0.79022128 0.77614524]


In [90]:
nb_tfidf = MultinomialNB( alpha = 1.0)
nb_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf = nb_tfidf.predict(X_test_tfidf)
print(accuracy_score(y_test_tfidf , predictions_tfidf ))
print(precision_score(y_test_tfidf ,predictions_tfidf , average = 'binary'))
print(recall_score(y_test_tfidf ,predictions_tfidf , average = 'binary'))
print(f1_score(y_test_tfidf ,predictions_tfidf , average = 'binary'))


0.6919075144508671
0.22855029585798817
0.9307228915662651
0.3669833729216152


## 2. Logistic Regression 

In [91]:
from sklearn.linear_model import LogisticRegression

In [92]:
from sklearn.model_selection import GridSearchCV

## GRID SEARCH FOR COUNT VECTORIZER
parameters = { 'C':[0.001,0.01,0.1,1,10,100] }
lr_count = LogisticRegression( penalty = 'l2')
lr_grid_count = GridSearchCV( lr_count  ,  parameters ,  scoring = 'f1' , cv = 5)
lr_grid_count.fit(X_train_count, y_train_count)
print(lr_grid_count.best_params_)
print(lr_grid_count.cv_results_['mean_test_score'])




{'C': 1}
[0.75683966 0.76242132 0.81526726 0.83332916 0.83208507 0.82964249]


In [93]:
lr_count = LogisticRegression( C = 1 , penalty = 'l2')
lr_count.fit( X_train_count , y_train_count)
predictions_count = lr_count.predict(X_test_count)
print(accuracy_score(y_test_count, predictions_count))
print(precision_score(y_test_count ,predictions_count, average='binary'))
print(recall_score(y_test_count ,predictions_count, average='binary'))
print(f1_score(y_test_count ,predictions_count, average='binary'))

0.85
0.3748326639892905
0.8433734939759037
0.5189990732159407




In [94]:
### GRID SEARCH FOR TFIDF
parameters = { 'C':[0.001,0.01,0.1,1,10,100] }

lr_tfidf = LogisticRegression( penalty = 'l2')
lr_grid_tfidf = GridSearchCV( lr_tfidf  ,  parameters ,  scoring = 'f1' , cv = 5)
lr_grid_tfidf.fit(X_train_tfidf, y_train_tfidf)
print(lr_grid_tfidf.best_params_)
print(lr_grid_tfidf.cv_results_['mean_test_score'])




{'C': 10}
[0.77313195 0.75181834 0.76015655 0.79418491 0.82340614 0.81973023]


In [96]:
lr_tfidf = LogisticRegression( C = 10 , penalty = 'l2')
lr_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf = lr_tfidf.predict(X_test_tfidf)
print(accuracy_score(y_test_tfidf, predictions_tfidf ))
print(precision_score(y_test_tfidf ,predictions_tfidf, average='binary'))
print(recall_score(y_test_tfidf ,predictions_tfidf, average='binary'))
print(f1_score(y_test_tfidf ,predictions_tfidf, average='binary'))

0.8135838150289018
0.3231638418079096
0.8614457831325302
0.47000821692686934




## 3. SVM

In [97]:
from sklearn import svm
## GRID SEARCH FOR COUNT VECTORIZER

Cs = [0.01, 0.1, 1, 10]
gammas = [0.001, 0.1, ]
parameters = {'C': Cs, 'gamma' : gammas}

svm_count = svm.SVC(kernel = 'rbf')
grid_svm_count = GridSearchCV( svm_count , parameters,  scoring = 'f1' , cv = 5)
grid_svm_count.fit(X_train_count, y_train_count)
print(grid_svm_count.best_params_)
print(grid_svm_count.cv_results_['mean_test_score'])


{'C': 10, 'gamma': 0.1}
[0.56391153 0.66926115 0.56391153 0.67420864 0.54854047 0.79625872
 0.78094628 0.80616809]


In [98]:
svm_count = svm.SVC( C = 10 , gamma =  0.1 , kernel = 'rbf')
svm_count.fit( X_train_count , y_train_count)
predictions_count = svm_count.predict(X_test_count)
print(accuracy_score(y_test_count, predictions_count))
print(precision_score(y_test_count ,predictions_count, average='binary'))
print(recall_score(y_test_count ,predictions_count, average='binary'))
print(f1_score(y_test_count ,predictions_count, average='binary'))

0.8713872832369942
0.41294298921417566
0.8072289156626506
0.54638124362895


In [99]:
### GRID SEARCH FOR TFIDF
Cs = [0.01, 0.1, 1, 10]
gammas = [0.001, 0.1, ]
parameters = {'C': Cs, 'gamma' : gammas}

svm_tfidf = svm.SVC(kernel = 'rbf')
grid_svm_tfidf = GridSearchCV( svm_tfidf , parameters,  scoring = 'f1' , cv = 5)
grid_svm_tfidf.fit(X_train_tfidf, y_train_tfidf)
print(grid_svm_tfidf.best_params_)
print(grid_svm_tfidf.cv_results_['mean_test_score'])

{'C': 10, 'gamma': 0.1}
[0.48234078 0.50407485 0.48234078 0.50407485 0.48234078 0.75889053
 0.48234078 0.82217387]


In [100]:
svm_tfidf = svm.SVC( C = 10 , gamma =  0.1 , kernel = 'rbf')
svm_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf = svm_tfidf.predict(X_test_tfidf)
print(accuracy_score(y_test_tfidf , predictions_tfidf))
print(precision_score(y_test_tfidf ,predictions_tfidf, average='binary'))
print(recall_score(y_test_tfidf ,predictions_tfidf, average='binary'))
print(f1_score(y_test_tfidf ,predictions_tfidf, average='binary'))

0.8078034682080925
0.3160220994475138
0.8614457831325302
0.46240905416329825


## 4. Decision Trees

In [101]:
from sklearn.tree import DecisionTreeClassifier


In [104]:
min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)
print(min_samples_splits)

[0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]


In [105]:
## GRID SEARCH FOR COUNT VECTORIZER

parameters = {'min_samples_split' : [ 10 , 50 , 150],'max_depth': [1,2, 5 , 20]}

tree_count = DecisionTreeClassifier()
tree_grid_count = GridSearchCV(tree_count , parameters, scoring = 'f1' , cv = 5)
tree_grid_count.fit(X_train_count, y_train_count)
print(tree_grid_count.best_params_)
print(tree_grid_count.cv_results_['mean_test_score'])

{'max_depth': 20, 'min_samples_split': 150}
[0.16721654 0.16721654 0.16721654 0.25766085 0.25766085 0.25728127
 0.4031897  0.40209795 0.40365013 0.65979864 0.66270986 0.68063069]


In [106]:
tree_count = DecisionTreeClassifier( max_depth =  20 , min_samples_split = 150 )
tree_count.fit( X_train_count , y_train_count )
predictions_count_tree = tree_count.predict(X_test_count)

print(accuracy_score( y_test_count , predictions_count_tree ))
print(precision_score(y_test_count , predictions_count_tree, average = 'binary'))
print(recall_score(y_test_count , predictions_count_tree , average = 'binary'))
print(f1_score(y_test_count ,predictions_count_tree , average = 'binary'))

0.7341040462427746
0.21232876712328766
0.6536144578313253
0.3205317577548006


In [107]:
## GRID SEARCH FOR TF-IDF

parameters = {'min_samples_split' : [ 10 , 50 , 150],'max_depth': [1,2, 5 , 20]}

tree_tfidf = DecisionTreeClassifier()
tree_grid_tfidf = GridSearchCV(tree_tfidf , parameters , scoring = 'f1' , cv = 5)
tree_grid_tfidf.fit(X_train_tfidf , y_train_tfidf )
print(tree_grid_tfidf.best_params_)
print(tree_grid_tfidf.cv_results_['mean_test_score'])

{'max_depth': 20, 'min_samples_split': 150}
[0.14002645 0.14002645 0.14002645 0.25720186 0.25720186 0.25720186
 0.38183927 0.38160712 0.38350667 0.66098008 0.65877136 0.67037368]


In [108]:
tree_tfidf = DecisionTreeClassifier( max_depth =  20 , min_samples_split = 150 )
tree_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf_tree = tree_tfidf.predict( X_test_tfidf )

print(accuracy_score( y_test_tfidf , predictions_tfidf_tree ))
print(precision_score(y_test_tfidf , predictions_tfidf_tree, average = 'binary'))
print(recall_score(y_test_tfidf , predictions_tfidf_tree , average = 'binary'))
print(f1_score(y_test_tfidf ,predictions_tfidf_tree , average = 'binary'))

0.7604046242774567
0.2223463687150838
0.5993975903614458
0.3243683781581092
