In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk.stem as stm

import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import string

from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import confusion_matrix,roc_auc_score,log_loss
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from xgboost import XGBClassifier
seed = 4353
from sklearn.metrics import f1_score

In [2]:
import os

# Directory holding the SMM4H CSV files. Defaults to the original location;
# set the SMM4H_DATA_DIR environment variable to load the data from elsewhere
# without editing the notebook.
DATA_DIR = os.environ.get('SMM4H_DATA_DIR', r'C:\Users\hp\Desktop\SMM4H_DATASETS')

# The files are read without a header row, so pandas labels the columns 0, 1, ...
# Column 0 is used later as the target (ADR) and column 1 as the raw tweet text.
COLUMN_NAMES = {0: "ADR", 1: "Tweet"}

df_train = pd.read_csv(os.path.join(DATA_DIR, 'trn_cls.csv'), header=None)
df_train = df_train.rename(columns=COLUMN_NAMES)

df_val = pd.read_csv(os.path.join(DATA_DIR, 'val_cls.csv'), header=None)
df_val = df_val.rename(columns=COLUMN_NAMES)


In [27]:
## Removing stop words
stop = stopwords.words('english')
# Membership tests against a set are O(1); `stop` itself is kept as the
# original list so anything else that reads it still sees the same object type.
stop_lookup = set(stop)


def remove_stop_words(text):
    """Drop whitespace-separated tokens that exactly match an NLTK English stop word.

    The comparison is case-sensitive and punctuation is left attached, exactly
    as in the original per-row lambda.
    """
    return ' '.join(word for word in text.split() if word not in stop_lookup)


## Stemming
stemmer = stm.PorterStemmer()


def stem_words(text):
    """Apply the Porter stemmer to every single-space-separated token of `text`."""
    return " ".join(stemmer.stem(word) for word in text.split(" "))


# Same two-step cleaning for both splits: stop-word removal, then stemming.
df_train['cleaned'] = df_train['Tweet'].apply(remove_stop_words).apply(stem_words)
df_val['cleaned'] = df_val['Tweet'].apply(remove_stop_words).apply(stem_words)

# NOTE(review): both vectorizers apply their own 'english' stop list to text
# that has already been filtered and stemmed above — confirm this is intended.

# Bag-of-words counts over unigrams, bigrams and trigrams.
count_vect = CountVectorizer(ngram_range=(1, 3), stop_words='english')

# Vocabulary is learned from the training tweets only and reused for validation.
count_vectorized_train = count_vect.fit_transform(df_train.cleaned)
count_vectorized_val = count_vect.transform(df_val.cleaned)

# TF-IDF weights (default n-gram range).
tfidf_vect = TfidfVectorizer(stop_words='english', use_idf=True)

tfidf_vectorized_train = tfidf_vect.fit_transform(df_train.cleaned)
tfidf_vectorized_val = tfidf_vect.transform(df_val.cleaned)

In [28]:
# Show the shape of the count-vectorized train and validation matrices, in that order.
for count_matrix in (count_vectorized_train, count_vectorized_val):
    print(count_matrix.shape)

(5974, 75761)
(3460, 75761)


In [29]:
# Show the shape of the TF-IDF train and validation matrices, in that order.
for tfidf_matrix in (tfidf_vectorized_train, tfidf_vectorized_val):
    print(tfidf_matrix.shape)

(5974, 8846)
(3460, 8846)


In [30]:
# Count-vectorizer features paired with the ADR labels.
# The validation file plays the role of the held-out "test" set from here on.
X_train_count, y_train_count = count_vectorized_train, df_train.ADR
X_test_count, y_test_count = count_vectorized_val, df_val.ADR

In [31]:
# TF-IDF features paired with the ADR labels.
# The validation file plays the role of the held-out "test" set from here on.
X_train_tfidf, y_train_tfidf = tfidf_vectorized_train, df_train.ADR
X_test_tfidf, y_test_tfidf = tfidf_vectorized_val, df_val.ADR


In [33]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

## COUNT VECTORIZER

## Logistic Regression

In [44]:
## Logistic regression on the count-vectorizer features
from sklearn.linear_model import LogisticRegression

# Carve a validation split out of the training matrix (default split size, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_count,
    y_train_count,
    random_state=0,
)

lr_count = LogisticRegression(penalty='l2', C=100)
lr_count.fit(X_train, y_train)

# Accuracy on the internal validation split.
print("ON VALIDATION SET : ")
print(accuracy_score(y_val, lr_count.predict(X_val)))

# Accuracy, then precision / recall / F1 for the positive class, on the held-out file.
predictions_count = lr_count.predict(X_test_count)
print("ON TEST SET : ")
print(accuracy_score(y_test_count, predictions_count))
for binary_metric in (precision_score, recall_score, f1_score):
    print(binary_metric(y_test_count, predictions_count, average='binary'))


ON VALIDATION SET : 
0.8306559571619813
ON TEST SET : 
0.8384393063583815
0.34805890227576974
0.7831325301204819
0.48192771084337355




## SVM

In [65]:
## Linear-kernel SVM on the count-vectorizer features
from sklearn import svm

# Carve a validation split out of the training matrix (default split size, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X_train_count, y_train_count, random_state=0)

# `gamma` is the kernel coefficient for the 'rbf', 'poly' and 'sigmoid' kernels
# only; with kernel='linear' the former gamma=0.1 had no effect, so it is
# omitted to avoid suggesting it was tuned.
svm_count = svm.SVC(C=100, kernel='linear')
svm_count.fit(X_train, y_train)

## on validation set
predictions_count = svm_count.predict(X_val)
print("ON VALIDATION SET : ")
print(accuracy_score(y_val, predictions_count))

## on test set: accuracy, then precision / recall / F1 for the positive class
print("ON TEST SET :")
predictions_count = svm_count.predict(X_test_count)
print(accuracy_score(y_test_count, predictions_count))
print(precision_score(y_test_count, predictions_count, average='binary'))
print(recall_score(y_test_count, predictions_count, average='binary'))
print(f1_score(y_test_count, predictions_count, average='binary'))

ON VALIDATION SET : 
0.821285140562249
ON TEST SET :
0.8317919075144509
0.33638743455497383
0.7740963855421686
0.468978102189781


## Naive Bayes

In [63]:
## Multinomial Naive Bayes on the count-vectorizer features
from sklearn.naive_bayes import MultinomialNB

# Carve a validation split out of the training matrix (default split size, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_count,
    y_train_count,
    random_state=0,
)

nb_count = MultinomialNB(alpha=0.01)
nb_count.fit(X_train, y_train)

# Accuracy on the internal validation split.
print("ON VALIDATION SET : ")
print(accuracy_score(y_val, nb_count.predict(X_val)))

# Accuracy, then precision / recall / F1 for the positive class, on the held-out file.
predictions_count = nb_count.predict(X_test_count)
print("ON TEST SET :")
print(accuracy_score(y_test_count, predictions_count))
for binary_metric in (precision_score, recall_score, f1_score):
    print(binary_metric(y_test_count, predictions_count, average='binary'))

ON VALIDATION SET : 
0.7965194109772423
ON TEST SET :
0.7083815028901734
0.22941646682653877
0.8644578313253012
0.36260265319014534


In [67]:
## Soft-voting ensemble (LR + RBF SVM + Naive Bayes) on the count-vectorizer features
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# Build the split in this cell instead of relying on whatever X_train / X_val
# an earlier cell left behind. Same inputs and random_state as the other
# count-vectorizer cells, so the partition is the same.
X_train, X_val, y_train, y_val = train_test_split(X_train_count, y_train_count, random_state=0)

vclf = VotingClassifier(
    estimators=[
        ('clf1', LogisticRegression(C=100, dual=False)),
        # probability=True is needed for soft voting; its probability estimates
        # involve random shuffling, so seed it to make the metrics repeatable.
        ('clf2', svm.SVC(C=100, gamma=0.1, kernel='rbf', probability=True, random_state=seed)),
        ('clf3', MultinomialNB(alpha=0.01)),
    ],
    voting='soft',
)
vclf.fit(X_train, y_train)

## on validation set
predictions_count = vclf.predict(X_val)
print("ON VALIDATION SET : ")
print(accuracy_score(y_val, predictions_count))

## on test set: accuracy, then precision / recall / F1 for the positive class
print("ON TEST SET :")
predictions_count = vclf.predict(X_test_count)
print(accuracy_score(y_test_count, predictions_count))
print(precision_score(y_test_count, predictions_count, average='binary'))
print(recall_score(y_test_count, predictions_count, average='binary'))
print(f1_score(y_test_count, predictions_count, average='binary'))



ON VALIDATION SET : 
0.8273092369477911
ON TEST SET :
0.8378612716763005
0.34993446920052423
0.8042168674698795
0.4876712328767123


## TFIDF 

In [71]:
# Carve a validation split out of the TF-IDF training matrix (default split size, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tfidf,
    y_train_tfidf,
    random_state=0,
)

In [72]:
## Soft-voting ensemble (LR + RBF SVM + Naive Bayes) on the TF-IDF features
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression

# X_train / X_val / y_train / y_val are expected to hold the TF-IDF split here.
vclf = VotingClassifier(
    estimators=[
        ('clf1', LogisticRegression(C=100, dual=False)),
        # probability=True is needed for soft voting; its probability estimates
        # involve random shuffling, so seed it to make the metrics repeatable.
        ('clf2', svm.SVC(C=100, gamma=0.1, kernel='rbf', probability=True, random_state=seed)),
        ('clf3', MultinomialNB(alpha=0.01)),
    ],
    voting='soft',
)
vclf.fit(X_train, y_train)

## on validation set
# Stored under the TF-IDF name so the count-vectorizer predictions are not overwritten.
predictions_tfidf = vclf.predict(X_val)
print("ON VALIDATION SET : ")
print(accuracy_score(y_val, predictions_tfidf))

## on test set: accuracy, then precision / recall / F1 for the positive class
print("ON TEST SET :")
predictions_tfidf = vclf.predict(X_test_tfidf)
print(accuracy_score(y_test_tfidf, predictions_tfidf))
print(precision_score(y_test_tfidf, predictions_tfidf, average='binary'))
print(recall_score(y_test_tfidf, predictions_tfidf, average='binary'))
print(f1_score(y_test_tfidf, predictions_tfidf, average='binary'))



(4480, 8846)
ON VALIDATION SET : 
0.8259705488621151
ON TEST SET :
0.7893063583815029
0.28905419766206164
0.8192771084337349
0.4273369992144541


## AdaBoost

In [74]:
## AdaBoost (SAMME) over random-forest base learners on the TF-IDF features
from sklearn.ensemble import AdaBoostClassifier , RandomForestClassifier

# Both the forest and the boosting procedure are randomized; pass the
# notebook-wide `seed` so repeated runs report the same metrics.
model = AdaBoostClassifier(
    RandomForestClassifier(n_estimators=1000, random_state=seed),
    algorithm="SAMME",
    n_estimators=100,
    random_state=seed,
)
# X_train / X_val / y_train / y_val are expected to hold the TF-IDF split here.
model.fit(X_train, y_train)

## on validation set
predictions_tfidf = model.predict(X_val)
print("ON VALIDATION SET : ")
print(accuracy_score(y_val, predictions_tfidf))

## on test set: accuracy, then precision / recall / F1 for the positive class
print("ON TEST SET :")
predictions_tfidf = model.predict(X_test_tfidf)
print(accuracy_score(y_test_tfidf, predictions_tfidf))
print(precision_score(y_test_tfidf, predictions_tfidf, average='binary'))
print(recall_score(y_test_tfidf, predictions_tfidf, average='binary'))
print(f1_score(y_test_tfidf, predictions_tfidf, average='binary'))

ON VALIDATION SET : 
0.8279785809906292
ON TEST SET :
0.7985549132947977
0.2996706915477497
0.822289156626506
0.43925985518905875
