## ML models 

In [76]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from sklearn.utils import shuffle
from nltk.corpus import stopwords
import nltk.stem as stm # Import stem class from nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [37]:
with open(r"C:\Users\hp\Desktop\pos.txt" , encoding="utf8") as f:
    pos = f.readlines()
    

In [38]:
df1 = pd.DataFrame(pos)

In [39]:
df1.shape

(3319, 1)

In [40]:
df1['ADR'] = 1

In [41]:
with open(r"C:\Users\hp\Desktop\neg.txt" , encoding="utf8") as f:
    neg = f.readlines()
df2 = pd.DataFrame(neg) 
df2['ADR'] = 0
    

In [42]:
df = pd.concat([df1 , df2] , ignore_index = True)

In [43]:
df = df.rename(columns = { 0 : "tweet" })

In [44]:
tknzr = TweetTokenizer()
df['tweet'] = df['tweet'].map(lambda x, tknzr=tknzr: " ".join(tknzr.tokenize(x))) 

In [45]:
train = df

In [46]:
###shuffling the dataset

df_new = shuffle(df)

In [47]:
df.head()

Unnamed: 0,tweet,ADR
0,little did i know that i would go through with...,1
1,today amp the past 5 days are so have not been...,1
2,So damn sleepy . This seroquel is fucking me u...,1
3,lmao . i loved geodon until i started passing ...,1
4,Just wait for the weight gain to set in . I wa...,1


In [48]:
df_new = df_new.reset_index()

In [49]:
df_new.shape

(34592, 3)

In [50]:
train = df_new[:30000]
test = df_new[30000:]

In [51]:
y_test = test.ADR 

In [52]:
len(y_test)

4592

## Trying models


In [53]:
## Removing stop words
stop = stopwords.words('english')
train['cleaned'] = train['tweet'].apply(lambda x: ' '.join( [ word for word in x.split() if word not in (stop) ] ))
test['cleaned'] = test['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [54]:
train.head(2)

Unnamed: 0,index,tweet,ADR,cleaned
0,22691,shrink put us on yet another pill olanzapine ....,0,shrink put us yet another pill olanzapine . sa...
1,2971,i hate saphris so much it is the worst thing i...,1,hate saphris much worst thing ever prescribed ...


In [55]:
## Stemming 
stemmer = stm.PorterStemmer()

train['cleaned'] = train['cleaned'].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split(" ")]))
test['cleaned'] = test['cleaned'].apply(lambda text: " ".join([stemmer.stem(word) for word in text.split(" ")]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [56]:
"""
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
"""   

"\ndef tokenize_and_stem(text):\n    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token\n    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]\n    filtered_tokens = []\n    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)\n    for token in tokens:\n        if re.search('[a-zA-Z]', token):\n            filtered_tokens.append(token)\n    stems = [stemmer.stem(t) for t in filtered_tokens]\n    return stems\n"

## Count vectorizer

In [57]:
# Instantiate the count vectorizer with an NGram Range from 1 to 3 and english for stop words.
count_vect = CountVectorizer(ngram_range=(1,3),stop_words='english')

count_vectorized_train = count_vect.fit_transform(train.cleaned)
count_vectorized_test = count_vect.transform(test.cleaned)


## TF-IDF Vectorizer

In [58]:
tfidf_vect = TfidfVectorizer(stop_words='english', use_idf=True)

tfidf_vectorized_train = tfidf_vect.fit_transform(train.cleaned)
tfidf_vectorized_test = tfidf_vect.transform(test.cleaned)

In [59]:
print(count_vectorized_train.shape)
print(count_vectorized_test.shape)

(30000, 290148)
(4592, 290148)


In [60]:
print(tfidf_vectorized_train.shape)
print(tfidf_vectorized_test.shape)

(30000, 19747)
(4592, 19747)


In [61]:
X_train_count = count_vectorized_train
y_train_count = train.ADR

X_test_count = count_vectorized_test
y_test_count = test.ADR

In [62]:
X_train_tfidf = tfidf_vectorized_train
y_train_tfidf = train.ADR

X_test_tfidf = tfidf_vectorized_test
y_test_tfidf = test.ADR

# Models :
## 1. Multinommial NB

In [175]:


## GRID SEARCH FOR COUNT VECTORIZER
parameters = {'alpha':[0.01, 0.1, 1.0, 10.0, 100.0]}
nb_count = MultinomialNB()
nb_grid_count = GridSearchCV( nb_count , param_grid = parameters , scoring='neg_log_loss')
nb_grid_count.fit(X_train_count, y_train_count)
nb_grid_count.best_params_





{'alpha': 1.0}

In [177]:
### GRID SEARCH FOR TFIDF

nb_tfidf = MultinomialNB()
nb_grid_tfidf = GridSearchCV( nb_tfidf , param_grid = parameters , scoring='neg_log_loss')
nb_grid_tfidf.fit(X_train_tfidf , y_train_tfidf )
nb_grid_tfidf.best_params_



{'alpha': 0.1}

In [78]:
## COUNT VECTORIZER ( PREDICTIONS )
nb_count = MultinomialNB(alpha = 1.0)
nb_count.fit(X_train_count, y_train_count)
predictions_count = nb_count.predict(X_test_count)


In [79]:
## TFIDF ( PREDICTIONS )

nb_tfidf = MultinomialNB(alpha = 0.1)
nb_tfidf.fit(X_train_tfidf , y_train_tfidf)
predictions_tfidf = nb_tfidf.predict(X_test_tfidf)

In [80]:
## ACCURACY

accuracy_count = accuracy_score(y_test_count, predictions_count)
accuracy_tfidf = accuracy_score(y_test_tfidf, predictions_tfidf)

print('Count Vectorized Words Accuracy :', accuracy_count)
print('TfIdf Vectorized Words Accuracy:', accuracy_tfidf)

Count Vectorized Words Accuracy : 0.9457752613240418
TfIdf Vectorized Words Accuracy: 0.9233449477351916


In [81]:
## PRECISION

print(precision_score(y_test_count ,predictions_count, average='binary'))
print(precision_score(y_test_tfidf ,predictions_tfidf, average='binary'))

0.958904109589041
0.9454545454545454


In [82]:
from sklearn.metrics import recall_score

## RECALL
print(recall_score( y_test_count , predictions_count , average='binary'))
print(recall_score( y_test_tfidf , predictions_tfidf , average='binary'))

0.4666666666666667
0.2311111111111111


In [83]:
from sklearn.metrics import f1_score

##F1 SCORE

print(f1_score( y_test_count , predictions_count , average='binary'))
print(f1_score( y_test_tfidf , predictions_tfidf , average='binary'))

0.6278026905829597
0.3714285714285714


## 2. KNN

In [187]:
"""
## GRID SEARCH FOR COUNT VECTORIZER
parameters = {'n_neighbors':[7,8,9,10]}
knn_count = KNeighborsClassifier()
knn_grid_count = GridSearchCV( knn_count , param_grid = parameters)
knn_grid_count.fit(X_train_count, y_train_count)
knn_grid_count.best_params_
"""

"\n## GRID SEARCH FOR COUNT VECTORIZER\nparameters = {'n_neighbors':[7,8,9,10]}\nknn_count = KNeighborsClassifier()\nknn_grid_count = GridSearchCV( knn_count , param_grid = parameters)\nknn_grid_count.fit(X_train_count, y_train_count)\nknn_grid_count.best_params_\n"

In [None]:
"""
## GRID SEARCH FOR TFIDF
parameters = {'n_neighbors':[7,8,9,10]}
knn_tfidf = KNeighborsClassifier()
knn_grid_tfidf = GridSearchCV( knn_tfidf , param_grid = parameters)
knn_grid_tfidf.fit(X_train_tfidf , y_train_tfidf)
knn_grid_tfidf.best_params_
"""

In [85]:
from sklearn.neighbors import KNeighborsClassifier

model_count_kmn = KNeighborsClassifier(n_neighbors=10)
model_tfidf_kmn = KNeighborsClassifier(n_neighbors=10)

model_count_kmn.fit(X_train_count, y_train_count)
model_tfidf_kmn.fit(X_train_tfidf, y_train_tfidf)

predictions_count_kmn = model_count_kmn.predict(X_test_count)
predictions_tfidf_kmn = model_tfidf_kmn.predict(X_test_tfidf)


accuracy_count_kmn = accuracy_score( y_test_count , predictions_count_kmn)
accuracy_tfidf_kmn = accuracy_score( y_test_tfidf , predictions_tfidf_kmn)
print('Count Vectorized Words Accuracy:', accuracy_count_kmn)
print('TfIdf Vectorized Words Accuracy:', accuracy_tfidf_kmn)

Count Vectorized Words Accuracy: 0.9020034843205574
TfIdf Vectorized Words Accuracy: 0.9113675958188153


In [86]:
## PRECISION FOR COUNT
precision_score(y_test_count ,predictions_count_kmn, average='binary')

  'precision', 'predicted', average, warn_for)


0.0

In [87]:
## PRECISION FOR TFIDF
precision_score(y_test_tfidf ,predictions_tfidf_kmn, average='binary')

0.8307692307692308

In [88]:
## RECALL FOR COUNT
recall_score( y_test_count , predictions_count_kmn , average='binary')

0.0

In [89]:
## RECALL FOR TFIDF
recall_score( y_test_tfidf , predictions_tfidf_kmn , average='binary')

0.12

In [90]:
## F1 SCORE FOR COUNT
f1_score( y_test_count , predictions_count_kmn , average='binary')  

  'precision', 'predicted', average, warn_for)


0.0

In [91]:
## F1 SCORE FOR TFIDF
f1_score( y_test_tfidf , predictions_tfidf_kmn , average='binary') 

0.20970873786407765

## Logistic Regression

In [92]:
from sklearn.linear_model import LogisticRegression

In [164]:
from sklearn.model_selection import GridSearchCV

## GRID SEARCH FOR COUNT VECTORIZER
parameters = { 'C':[0.001,0.01,0.1,1,10,100] }
lr_count = LogisticRegression( penalty = 'l2')
clf_count = GridSearchCV( lr_count  ,  parameters)
clf_count.fit(X_train_count, y_train_count)
clf_count.best_params_



{'C': 10}

In [165]:
from sklearn.model_selection import GridSearchCV

## GRID SEARCH FOR TFIDF
parameters = { 'C':[0.001,0.01,0.1,1,10,100] }
lr_tfidf = LogisticRegression( penalty = 'l2')
clf_tfidf = GridSearchCV( lr_tfidf ,parameters)
clf_tfidf.fit(X_train_tfidf , y_train_tfidf )
clf_tfidf.best_params_



{'C': 100}

In [93]:
## PREDICTIONS FOR TFIDF

lr_tfidf = LogisticRegression( penalty = 'l2' ,  C = 100)
lr_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf_lr = lr_tfidf.predict(X_test_tfidf)
print(accuracy_score( y_test_tfidf , predictions_tfidf_lr ))




0.9398954703832753


In [95]:
print(precision_score( y_test_tfidf , predictions_tfidf_lr , average = 'binary'))
print(recall_score( y_test_tfidf , predictions_tfidf_lr, average = 'binary'))
print(f1_score( y_test_tfidf ,predictions_tfidf_lr , average='binary')) 

0.7265625
0.62
0.6690647482014388


In [96]:
## PREDICITONS FOR COUNT VECTORIZER

lr_count = LogisticRegression( penalty = 'l2' ,  C = 10)
lr_count.fit( X_train_count , y_train_count)
predictions_count_lr = lr_count.predict(X_test_count)
print(accuracy_score( y_test_count , predictions_count_lr ))




0.955574912891986


In [97]:
precision_score(y_test_count ,predictions_count_lr , average = 'binary')

0.9270833333333334

In [98]:
recall_score( y_test_count ,predictions_count_lr, average = 'binary')

0.5933333333333334

In [99]:
f1_score(y_test_count ,predictions_count_lr , average='binary') 

0.7235772357723578

## 4. Decision Trees



In [101]:
from sklearn.tree import DecisionTreeClassifier


In [189]:

## GRID SEARCH FOR COUNT VECTORIZER
parameters = {'min_samples_split' : range(10,500,20),'max_depth': range(1,2,20)}
tree_count = DecisionTreeClassifier()
tree_grid_count = GridSearchCV(tree_count , parameters)
tree_grid_count.fit(X_train_count, y_train_count)
tree_grid_count.best_params_



{'max_depth': 1, 'min_samples_split': 10}

In [190]:
## GRID SEARCH FOR TFIDF

parameters = {'min_samples_split' : range(10,500,20),'max_depth': range(1,2,20)}
tree_tfidf = DecisionTreeClassifier()
tree_grid_tfidf = GridSearchCV(tree_tfidf , parameters)
tree_grid_tfidf.fit(X_train_tfidf, y_train_tfidf)
tree_grid_tfidf.best_params_



{'max_depth': 1, 'min_samples_split': 10}

In [102]:
## PREDICTIONS FOR COUNT

tree_count = DecisionTreeClassifier( max_depth =  1, min_samples_split = 10 )
tree_count.fit( X_train_count , y_train_count )
predictions_count_tree = tree_count.predict(X_test_count)
print(accuracy_score( y_test_count , predictions_count_tree ))


0.9063588850174216


In [103]:
tree_count.feature_importances_

array([0., 0., 0., ..., 0., 0., 0.])

In [104]:
sum(tree_count.feature_importances_)

1.0

In [105]:
print(precision_score( y_test_count , predictions_count_tree , average = 'binary'))
print(recall_score( y_test_count , predictions_count_tree , average = 'binary'))
print(f1_score( y_test_count ,predictions_count_tree , average='binary')) 

0.7941176470588235
0.06
0.11157024793388431


In [106]:
## PREDICTIONS FOR TFIDF

tree_tfidf = DecisionTreeClassifier( max_depth =  1, min_samples_split = 10 )
tree_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf_tree = tree_tfidf.predict(X_test_tfidf)
print(accuracy_score( y_test_tfidf , predictions_tfidf_tree ))


0.9059233449477352


In [107]:
print(precision_score( y_test_tfidf , predictions_tfidf_tree , average = 'binary'))
print(recall_score( y_test_tfidf , predictions_tfidf_tree , average = 'binary'))
print(f1_score( y_test_tfidf ,predictions_tfidf_tree , average='binary')) 

0.78125
0.05555555555555555
0.10373443983402489


## 5. SVM

In [206]:
from sklearn import svm
## GRID SEARCH FOR COUNT VECTORIZER
Cs = [0.01, 0.1, 1, 10]
gammas = [0.001, 0.1, ]
parameters = {'C': Cs, 'gamma' : gammas}

svm_count = svm.SVC(kernel = 'rbf')
grid_svm_count = GridSearchCV( svm_count , parameters)
grid_svm_count.fit(X_train_count, y_train_count)
grid_svm_count.best_params_



{'C': 10, 'gamma': 0.1}

In [31]:
from sklearn.model_selection import GridSearchCV

from sklearn import svm
## GRID SEARCH FOR COUNT VECTORIZER
Cs = [0.01, 0.1, 1, 10]
gammas = [0.001, 0.1, ]
parameters = {'C': Cs, 'gamma' : gammas}

svm_tfidf = svm.SVC(kernel = 'rbf')
grid_svm_tfidf = GridSearchCV( svm_tfidf , parameters)
grid_svm_tfidf.fit(X_train_tfidf, y_train_tfidf)
grid_svm_tfidf.best_params_




{'C': 10, 'gamma': 0.1}

In [108]:
## PREDICTIONS FOR COUNT

svm_count = svm.SVC( C = 10 , gamma = 0.1 )
svm_count.fit( X_train_count , y_train_count )
predictions_count_svm = svm_count.predict(X_test_count)
print(accuracy_score( y_test_count , predictions_count_svm ))


0.9501306620209059


In [109]:
## PREDICTIONS FOR TFIDF

svm_tfidf = svm.SVC( C = 10 , gamma = 0.1 )
svm_tfidf.fit( X_train_tfidf , y_train_tfidf )
predictions_tfidf_svm = svm_tfidf.predict(X_test_tfidf)
print(accuracy_score( y_test_tfidf , predictions_tfidf_svm ))

0.9366289198606271


In [110]:
print(precision_score( y_test_count , predictions_count_svm , average = 'binary'))
print(recall_score( y_test_count , predictions_count_svm , average = 'binary'))
print(f1_score( y_test_count ,predictions_count_svm , average='binary')) 

0.9169811320754717
0.54
0.6797202797202798


In [111]:
print(precision_score( y_test_count , predictions_tfidf_svm , average = 'macro'))
print(recall_score( y_test_count , predictions_tfidf_svm , average = 'macro'))
print(f1_score( y_test_count ,predictions_tfidf_svm , average='macro')) 

0.872498266697481
0.7291576801330544
0.779323191863043
