In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk.stem as stm

import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
import string

from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import confusion_matrix,roc_auc_score,log_loss
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from xgboost import XGBClassifier
seed = 4353

In [2]:
df_train = pd.read_csv(r'C:\Users\hp\Desktop\SMM4H_DATASETS\trn_cls.csv' , header = None )
df_train = df_train.rename(columns = { 0 : "ADR" ,  1 : "Tweet"} )

df_val = pd.read_csv(r'C:\Users\hp\Desktop\SMM4H_DATASETS\val_cls.csv' , header = None )
df_val = df_val.rename(columns = { 0 : "ADR" ,  1 : "Tweet"} )



## Meta features

In [57]:

def text_len(df):
    #i = ['text']
    df['num_words'] = df['Tweet'].apply(lambda x: len(str(x).split()))
    df['num_uniq_words'] = df['Tweet'].apply(lambda x: len(set(str(x).split())))
    df['num_chars'] = df['Tweet'].apply(lambda x: len(str(x)))
    df['num_stopwords'] = df['Tweet'].apply(lambda x: len([w for w in str(x).lower().split() 
                                                          if w in set(stopwords.words('english'))]))
    df['num_punctuations'] = df['Tweet'].apply(lambda x: len([w for w in str(x) if w in string.punctuation]))
    df['num_words_upper'] = df['Tweet'].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    df['num_words_title'] = df['Tweet'].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df['mean_word_len'] = df['Tweet'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

In [58]:

text_len(df_train)
text_len(df_val)


In [59]:
df_train.head(2)

Unnamed: 0,ADR,Tweet,cleaned,num_words,num_uniq_words,num_chars,num_stopwords,num_punctuations,num_words_upper,num_words_title,mean_word_len
0,1.0,do any of my friends who take pristiq have pro...,friend take pristiq problem nausea?,12,12,65,7,1,0,0,4.416667
1,1.0,humira helped my crohn s but then respiratory...,humira help crohn respiratori issu ensu amp ta...,22,20,119,9,0,0,0,4.409091


## Preprocessing for Vectorizer

In [3]:
## Removing stop words
stop = stopwords.words('english')
df_train['cleaned'] = df_train['Tweet'].apply(lambda x: ' '.join( [ word for word in x.split() if word not in (stop) ] ))
df_val['cleaned'] = df_val['Tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

## Stemming 
stemmer = stm.PorterStemmer()
df_train['cleaned'] = df_train['cleaned'].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split(" ")]))
df_val['cleaned'] = df_val['cleaned'].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split(" ")]))

In [4]:
count_vect = CountVectorizer(ngram_range=(1,3) , stop_words='english')

count_vectorized_train = count_vect.fit_transform(df_train.cleaned)
count_vectorized_val = count_vect.transform(df_val.cleaned)

tfidf_vect = TfidfVectorizer(stop_words='english', use_idf=True)

tfidf_vectorized_train = tfidf_vect.fit_transform(df_train.cleaned)
tfidf_vectorized_val = tfidf_vect.transform(df_val.cleaned)

In [5]:
print(count_vectorized_train.shape)
print(count_vectorized_val.shape)

(5974, 75761)
(3460, 75761)


In [6]:
print(tfidf_vectorized_train.shape)
print(tfidf_vectorized_val.shape)

(5974, 8846)
(3460, 8846)


In [7]:
X_train_count = count_vectorized_train
y_train_count = df_train.ADR

X_test_count = count_vectorized_val
y_test_count = df_val.ADR

In [8]:
X_train_tfidf = tfidf_vectorized_train
y_train_tfidf = df_train.ADR

X_test_tfidf = tfidf_vectorized_val
y_test_tfidf = df_val.ADR

In [9]:
count_vectorized_train.shape

(5974, 75761)

In [10]:
y_train_count.shape

(5974,)

In [11]:
tfidf_vectorized_train.shape

(5974, 8846)

## Model ( without meta features )

In [45]:
## COUNT FEATURES

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
from sklearn.metrics import f1_score

In [22]:
kf = KFold(n_splits = 5 , shuffle = True , random_state = seed)
pred_val_full = 0
cv_score = []
i = 1

for train_index , test_index in kf.split( X_train_count ):
    print("K = " , i)
    X_trn , X_val = X_train_count[train_index], X_train_count[test_index]
    y_trn , y_val = y_train_count[train_index], y_train_count[test_index]
    
    model = XGBClassifier(objective='binary:logistic', eval_metric = 'error' , learning_rate = 0.3 , max_depth = 4 ,
        subsample = 0.8,
        colsample_bytree = 0.8,
        min_child_weight =  6,
        reg_alpha = 1.5, 
        reg_lambda = 5,
        scale_pos_weight = 1,          
        n_thread = -1 ,
        gamma = 5)
    

    model.fit(X_trn, y_trn)    
   
    y_pred = model.predict(X_val)
    predictions = [round(value) for value in y_pred]
    
    f1 = f1_score( y_val , predictions , average='binary')
    print(f1)
    cv_score.append(f1)
    i+=1

K =  1
0.6932409012131715
K =  2
0.7114695340501793
K =  3
0.7198641765704583
K =  4
0.6954177897574124
K =  5
0.7291311754684837


In [23]:
print('Mean f1_score score on validation set',np.mean(cv_score))

Mean f1_score score on validation set 0.7098247154119411


In [24]:
## On validation set

y_pred = model.predict(X_test_count)
predictions = [round(value) for value in y_pred]
# evaluate predictions
f1 = f1_score(y_test_count, predictions)
print(f1)

0.3421052631578947


In [25]:
## TF IDF

kf = KFold(n_splits = 5 , shuffle = True , random_state = seed)
pred_val_full = 0
cv_score = []
i = 1

for train_index , test_index in kf.split( X_train_tfidf ):
    print("K = " , i)
    X_trn , X_val = X_train_tfidf[train_index], X_train_tfidf[test_index]
    y_trn , y_val = y_train_tfidf[train_index], y_train_tfidf[test_index]
    
    model = XGBClassifier(objective='binary:logistic', eval_metric = 'error' , learning_rate = 0.3 , max_depth = 4 ,
        subsample = 0.8,
        colsample_bytree = 0.8,
        min_child_weight =  6,
        reg_alpha = 1.5, 
        reg_lambda = 5,
        scale_pos_weight = 1,          
        n_thread = -1 ,
        gamma = 5)
    model.fit(X_trn, y_trn)    
   
    y_pred = model.predict(X_val)
    predictions = [round(value) for value in y_pred]
    
    f1 = f1_score( y_val , predictions , average='binary')
    print(f1)
    cv_score.append(f1)
    i+=1

K =  1
0.6944198405668733
K =  2
0.7133640552995391
K =  3
0.7084048027444254
K =  4
0.716549295774648
K =  5
0.7101321585903083


In [26]:
print('Mean f1_score score on validation set',np.mean(cv_score))

Mean f1_score score on validation set 0.7085740305951588


In [27]:
## On validation set

y_pred = model.predict(X_test_tfidf)
predictions = [round(value) for value in y_pred]
# evaluate predictions
f1 = f1_score(y_test_tfidf , predictions)
print(f1)

0.3553054662379421
