In [3]:
import matplotlib
from textblob import TextBlob
import re
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, f1_score, roc_curve, auc
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import os
from time import strftime

In [4]:
# Set up random seed, for reproductability of randomness
np.random.seed(18)
### 1. Basic data preparation and first classifier
# Import dataset
df1 = pd.read_csv("E:/Desktop old/RM/NEW RESEARCH/spam_ham_dataset.csv")

df1 = df1[['text', 'label_num']]
df1

Unnamed: 0,text,label_num
0,enron methanol ; meter # 988291\r\nthis is a f...,0
1,"hpl nom for january 9 , 2001\r\n( see attached...",0
2,"neon retreat\r\nho ho ho , we ' re around to t...",0
3,"photoshop , windows , office . cheap . main tr...",1
4,re indian springs\r\nthis deal is to book the ...,0
5,ehronline web address change\r\nthis message i...,0
6,spring savings certificate - take 30 % off\r\n...,0
7,looking for medication ? we ` re the best sour...,1
8,noms / actual flow for 2 / 26\r\nwe agree\r\n-...,0
9,"nominations for oct . 21 - 23 , 2000\r\n( see ...",0


In [5]:
# Getting rid of empty lines
df1 = df1[df1.text.isna() == False]
length_df1 = len(df1)

In [6]:
# Build sublist of original df1, contains # lines picked at random, out of 20671 possible
random_indexes = list(np.random.choice(length_df1 - 2, 3000, replace=False))
df1 = df1.iloc[random_indexes]

In [7]:
# Function dissects text i, attributes polarity scores, positive/negative/neutral, polarity or not, and subject
def sentiment_analyzer(dataframe):
    sid = SentimentIntensityAnalyzer()
    scores = [sid.polarity_scores(i) for i in dataframe.text]
    compounds = np.array([i['compound'] for i in scores], dtype='float32')
    abs_compounds = np.array([np.sqrt(i ** 2) for i in compounds], dtype='float32')
    negs = np.array([i['neg'] for i in scores], dtype='float32')
    poss = np.array([i['pos'] for i in scores], dtype='float32')
    neus = np.array([i['neu'] for i in scores], dtype='float32')
    sent = dataframe['text'].apply(lambda x: TextBlob(x).sentiment)
    pol = np.array([s[0] for s in sent], dtype='float32')
    abs_pol = np.array([np.sqrt(i ** 2) for i in pol], dtype='float32')
    subj = np.array([s[1] for s in sent], dtype='float32')

    return compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj


compounds, abs_compounds, negs, poss, neus, sent, pol, abs_pol, subj = sentiment_analyzer(df1)

In [8]:
# Adding columns to df1, matching them with newly created variables
df1['compounds'] = compounds
df1['abs_compounds'] = abs_compounds
df1['negs'] = negs
df1['neus'] = neus
df1['poss'] = poss
df1['pol'] = pol
df1['abs_pol'] = abs_pol
df1['subj'] = subj

X = df1[['compounds', 'negs', 'neus', 'poss', 'pol', 'subj']]
y = df1['label_num']

In [9]:
# First classifier
lrxtrain, lrxtest, lrytrain, lrytest = train_test_split(X, y)
lr = LogisticRegression()
lr.fit(lrxtrain, lrytrain)
lrpreds = lr.predict(lrxtest)
accuracy = accuracy_score(lrytest, lrpreds)
f1 = f1_score(lrytest, lrpreds)
# First attempt gives accuracy and f1 score of (0.748, 0.422)
print(accuracy, f1)



0.748 0.4220183486238532


In [32]:
x_values = df1[['text', 'compounds', 'abs_compounds', 'negs', 'neus', 'poss', 'pol', 'abs_pol', 'subj']]
y_values = df1['label_num']
xtrain, xtest, ytrain, ytest = train_test_split(x_values, y_values,test_size = 0.25, random_state = 42)

In [33]:
### 2. Improving our classifier

# Cleans article from numbers, capital letters, punctuation and spaces for better classifier results
def clean_article(article):
    art = re.sub("[^A-Za-z0-9' ]", '', str(article))
    art2 = re.sub("[( ' )(' )( ')]", ' ', str(art))
    art3 = re.sub("\s[A-Za-z]\s", ' ', str(art2))
    return art3.lower()
# Stop_words will ignore common english words which are noise (the / a / an / etc.)
# Max_df / min_df : ignore words which frequencies are above/under those thresholds
bow = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_features=998, max_df=1.0, min_df=1, binary=False)
bow

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=998, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
training_data = bow.fit_transform(xtrain.text)
test_data = bow.transform(xtest.text)

In [35]:
dftrain = pd.DataFrame(training_data.toarray())
dftrain.columns = bow.get_feature_names()
dftrain.columns

Index(['00', '00 pm', '000', '000 enron', '000 hpl', '000 mmbtu', '01', '02',
       '03', '04',
       ...
       'wynne', 'wynne hou', 'xls', 'xls hplno', 'xp', 'year', 'years',
       'young', 'zero', 'zone'],
      dtype='object', length=998)

In [36]:
dftest = pd.DataFrame(test_data.toarray())
dftest.columns = bow.get_feature_names()
dftest.columns

Index(['00', '00 pm', '000', '000 enron', '000 hpl', '000 mmbtu', '01', '02',
       '03', '04',
       ...
       'wynne', 'wynne hou', 'xls', 'xls hplno', 'xp', 'year', 'years',
       'young', 'zero', 'zone'],
      dtype='object', length=998)

In [37]:
#improvising the model
lr2 = LogisticRegression()
lr2.fit(dftrain, ytrain)
lr2_preds = lr2.predict(dftest)
accuracy = accuracy_score(ytest, lr2_preds)
f1 = f1_score(ytest, lr2_preds)
# improvised accuracy is (0.9746,0.9574)
print(accuracy, f1)

0.9706666666666667 0.9481132075471698




In [38]:
gnb = GaussianNB()
y_pred = gnb.fit(dftrain, ytrain)

In [41]:
a = y_pred.predict(dftest)

In [47]:
accuracy = accuracy_score(ytest,a)*100
f1score = f1_score(ytest,a)*100
print(accuracy,f1score)

94.0 89.79591836734694


In [80]:
from sklearn.tree import DecisionTreeClassifier
#Decision Tree
clf = DecisionTreeClassifier()
clf = clf.fit(dftrain, ytrain)
train_acc = clf.score(dftrain, ytrain)*100 # mean acc on train data
test_acc = clf.score(dftest, ytest)*100 # mean acc on test data
y_pred = clf.predict(dftest) # make prediction
print("Training accuracy is:", train_acc )
print("Testing accuracy is:", test_acc)
Tree_f1 = f1_score(ytest, y_pred, average="macro")*100
print("f1score is:", Tree_f1)

Training accuracy is: 99.86666666666667
Testing accuracy is: 93.60000000000001
f1score is: 91.86962491869626


In [81]:
from sklearn.ensemble import RandomForestClassifier
#Random Forest
clf = RandomForestClassifier()
clf.fit(dftrain,ytrain)
train_acc = clf.score(dftrain, ytrain)*100 # mean acc on train data
test_acc = clf.score(dftest, ytest)*100 # mean acc on test data
y_pred = clf.predict(dftest) # make prediction
print("Training accuracy is:", train_acc )
print("Testing accuracy is:", test_acc)
RandomForest_f1 = f1_score(ytest, y_pred, average="macro")*100
print("f1score is:",RandomForest_f1 )



Training accuracy is: 99.73333333333333
Testing accuracy is: 96.8
f1score is: 95.97225330051467


In [86]:
import xgboost as xgb
xgb_model = xgb.XGBClassifier(objective="binary:logistic", n_estimators=100, random_state=50, eval_metric=["auc", "error", "error@0.6"])
xgb_model.fit(dftrain, ytrain, eval_set=[(dftest, ytest)])

y_pred = xgb_model.predict(dftest)
accuracy = accuracy_score(ytest,y_pred)*100
f1score = f1_score(ytest,y_pred)*100
print(accuracy,f1score)



[0]	validation_0-auc:0.92829	validation_0-error:0.14400	validation_0-error@0.6:0.18800
[1]	validation_0-auc:0.94625	validation_0-error:0.10800	validation_0-error@0.6:0.18267
[2]	validation_0-auc:0.96851	validation_0-error:0.06667	validation_0-error@0.6:0.06933
[3]	validation_0-auc:0.97420	validation_0-error:0.06933	validation_0-error@0.6:0.05733
[4]	validation_0-auc:0.97531	validation_0-error:0.08800	validation_0-error@0.6:0.05467
[5]	validation_0-auc:0.98153	validation_0-error:0.04800	validation_0-error@0.6:0.05067
[6]	validation_0-auc:0.98298	validation_0-error:0.06133	validation_0-error@0.6:0.04933
[7]	validation_0-auc:0.98388	validation_0-error:0.05733	validation_0-error@0.6:0.04533
[8]	validation_0-auc:0.98440	validation_0-error:0.05067	validation_0-error@0.6:0.04800
[9]	validation_0-auc:0.98412	validation_0-error:0.04533	validation_0-error@0.6:0.04667
[10]	validation_0-auc:0.98524	validation_0-error:0.04933	validation_0-error@0.6:0.04667
[11]	validation_0-auc:0.98621	validation_0

[94]	validation_0-auc:0.99473	validation_0-error:0.03200	validation_0-error@0.6:0.02933
[95]	validation_0-auc:0.99476	validation_0-error:0.03200	validation_0-error@0.6:0.02933
[96]	validation_0-auc:0.99478	validation_0-error:0.03200	validation_0-error@0.6:0.02933
[97]	validation_0-auc:0.99474	validation_0-error:0.03200	validation_0-error@0.6:0.02933
[98]	validation_0-auc:0.99482	validation_0-error:0.03200	validation_0-error@0.6:0.02933
[99]	validation_0-auc:0.99482	validation_0-error:0.03200	validation_0-error@0.6:0.02933
96.8 94.23076923076923
