In [1]:
import pandas as pd

In [2]:
# 1) 
#Read the data into a pandas DataFrame
import os
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        fhr = open(path+f, 'r', encoding='utf-8', errors='ignore') 
        t = fhr.read()
        text.append(t)
        fhr.close()
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

df_neg = data2df('HealthProNonPro/NonPro/', 0) # NonPro
df_pos = data2df('HealthProNonPro/Pro/', 1) # Pro

df = pd.concat([df_pos, df_neg], axis=0)
df.sample(frac=0.009)

Unnamed: 0,file,text,class
17,a114.txt,i think there are different questions for your...,0
1216,a61516.txt,"It isnt your fault, youre doing nothing wrong....",0
575,a24807.txt,If the problem with the butter is just the sod...,0
507,ans1455.txt,The previous injury that you described at the ...,1
352,ans1315.txt,The burning feeling in the buttocks might be d...,1
1581,a7369.txt,Restrain yourself to eat during the hours that...,0
1545,a7333.txt,It means << Electro Magnetic Source >> in elec...,0
1462,a69597.txt,no,0
1158,ans354.txt,"Headache that present after waking up, is asso...",1
160,a24392.txt,FIRST OFF WHO ARE YOU? YOU ARE A CHILD OF THE...,0


In [3]:
# 2) 
#Setup the data for Training/Testing. Use 20% for testing.
X, y = df['text'], df['class']

from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [4]:
#3) 
#Use Spacy to preprocess the data. Explore and pick appropriate preprocessing steps.
def custom_tokenizer(doc):

    # use spacy to filter out noise
    tokens = [token.lemma_.lower() 
                        for token in doc 
                               if (
                                    len(token) >= 2 and # only preserve tokens that are greater than 2 characters long
                                    #token.pos_ in ['PROPN', 'NOUN', 'ADJ', 'VERB', 'ADV'] and # only preserve selected pos
                                    #token.text in nlp.vocab and # check if token in vocab 
                                    token.is_alpha and # only preserve tokens that are fully alpha (not numeric or alpha-numeric)
                                    #not token.is_digit and # get rid of tokens that are fully numeric
                                    not token.is_punct and # get rid of tokens that are punctuations
                                    not token.is_space and # get rid of tokens that are spaces
                                    not token.is_stop and # get rid of tokens that are stop words
                                    not token.is_currency # get rid of tokens that denote currencies
                                )
                   ]

    # return cleaned-up text
    return ' '.join(tokens)

In [5]:
#Preprocessing Xtrain using Spacy
import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtrain))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtrain = pd.Series(clean_corpus,index=Xtrain.index)
Xtrain.head()

439     common cause itchy palm contact dermatitis exp...
720     take money support famlys need like food shelt...
307                      not sign married doctor approval
87      speed amphetamine psychostimulant commonly abu...
1066                                          tantric sex
dtype: object

In [6]:
#4)
#Setup a Pipeline with TfidfVectorizer and Naïve Bayes. 

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

nb=Pipeline(steps=[('tfidf',TfidfVectorizer()),('mnb',MultinomialNB())])

In [7]:
#5) 
#Do Grid Search with 4-fold Cross Validation to search for the best values for the following two hyper-parameters (and any additional hyper parameters you may want to tune):
# sublinear_tf in TfidfVectorizer 
# alpha in Naïve Bayes 

from sklearn.model_selection import GridSearchCV
param_grid = {
    'mnb__alpha': [0.2,0.5,0.7], # getting best alpha
    'tfidf__sublinear_tf':[True,False], # fublinear_tf from tfidf
    'tfidf__norm':['l1','l2'] #finding best norm
}
gscv = GridSearchCV(nb, param_grid, cv=4, return_train_score=False)

In [8]:
# 6) 
#Use the Best Estimator resulting from the Grid Search for Prediction/Evaluation. Print the following evaluation metrics:
# Accuracy score
# Confusion matrix
# Classification report

gscv.fit(Xtrain, ytrain)

print ("-"*50)
print(gscv.best_estimator_, "\n")
print ("-"*50)
print(gscv.best_score_, "\n")
print ("-"*50)
print(gscv.best_params_, "\n")
print ("-"*50)
print(gscv.cv_results_, "\n")
print ("-"*50)

--------------------------------------------------
Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('mnb',
                 MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True))],
         verbose=

In [9]:
# Preprocessing Xtest using Spacy

import spacy
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])
corpus = nlp.pipe(list(Xtest))
clean_corpus = [custom_tokenizer(doc) for doc in corpus]
Xtest = pd.Series(clean_corpus,index=Xtest.index)
Xtest.head()

956     procrastinating inevitable drinking problem da...
1247    understand concern pain knee surgery pain weig...
102     swallow complex act involve mouth throat area ...
994                                          course break
dtype: object

In [10]:
# Predict/Evaluate Best Estimator on Xtest

ypred = gscv.best_estimator_.predict(Xtest)

from sklearn import metrics
print (metrics.accuracy_score(ytest, ypred))
print (metrics.confusion_matrix(ytest, ypred))
print (metrics.classification_report(ytest, ypred))

0.9427012278308322
[[322  36]
 [  6 369]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       358
           1       0.91      0.98      0.95       375

    accuracy                           0.94       733
   macro avg       0.95      0.94      0.94       733
weighted avg       0.95      0.94      0.94       733



In [11]:
# 7) 
#Extract the true negatives (TN), false positives (FP), false negatives (FN), and true positives (TP)

TN, FP, FN, TP = metrics.confusion_matrix(y_true=ytest, y_pred=ypred).ravel()
print("TP:",TP,"\t"
      'FP:',FP,"\t" 
      'TN:',TN,"\t" 
      'TP:',TP)


TP: 369 	FP: 36 	TN: 322 	TP: 369


In [12]:
#Overall_Accuracy

Overall_Accuracy = (TP + TN) / (TP + TN + FP + FN)
print("Overall Accuracy:",Overall_Accuracy)

Overall Accuracy: 0.9427012278308322


In [13]:
#Precision for Class 0 and Class 1

Pre_Cl_0 = TN / (TN + FN)
Pre_Cl_1 = TP / (TP + FP)

print("Precision for Class 0:",Pre_Cl_0)
print("Precision for Class 1:",Pre_Cl_1)

Precision for Class 0: 0.9817073170731707
Precision for Class 1: 0.9111111111111111


In [14]:
#Recall for Class 0 and Class 1

Rec_Cl_0 = TN / (TN + FP)
Rec_Cl_1 = TP / (TP + FN)

print("Recall for Class 0:",Rec_Cl_0)
print("Recall for Class 1:",Rec_Cl_1)

Recall for Class 0: 0.8994413407821229
Recall for Class 1: 0.984


In [15]:
#F1-Score for Class 0 and Class 1

F1scr_Cl_0 = (2*Rec_Cl_0*Pre_Cl_0)/(Rec_Cl_0+Pre_Cl_0)
F1scr_Cl_1 = (2*Rec_Cl_1*Pre_Cl_1)/(Rec_Cl_1+Pre_Cl_1)

print("F1-score for Class 0:",F1scr_Cl_0)
print("F1-score for Class 1:",F1scr_Cl_1)

F1-score for Class 0: 0.9387755102040817
F1-score for Class 1: 0.9461538461538462
