In [1]:
import pickle
import json
import random
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
with open("datasets/test.json", 'r') as f:
    tweets = json.load(f)
random.shuffle(tweets)
train = tweets[:int(round(4*len(tweets)/5))]
test = tweets[int(round(4*len(tweets)/5)):len(tweets)]
xtrain = []
ytrain = []
xtest = []
ytest = []

for tweet in train:
    xtrain.append(tweet['content'])
    ytrain.append(tweet['label'])
    
for tweet in test:
    xtest.append(tweet['content'])
    ytest.append(tweet['label'])

    
print(xtrain[:5])
print(ytrain[:5])
print(xtest[:5])
print(ytest[:5])

["Some things even spell check can't fix ", 'just woke up, what an overcast day ', 'RT @wikileaks: DoJ Assistant Attorney Peter Kadzik outed as a mole for Hillary Clinton campaign https://t.co/MNHzJ310Nl https://t.co/uTQF6b…', 'RT @true_pundit: #TruePundit is now back on #Facebook ! Go check us out &amp; give us a like for some exclusive content https://t.co/xuC0vBs3vB…', 'RT @DistantDistant: @Enjoneer01 @Dubzzzinyaface #ItsNotAWasteIf you call shredded animal bits, "a filler"']
[0, 0, 1, 1, 1]
['#TopNews Pressure on Trump likely to be intense at second debate with Clinton', '@gotobekiddingme I tried.....and failed ', "RT @TelcoJ: Black Trump Representative Brings Down the House on Chicago's WGN | RedState https://t.co/u2eEOraGFO", "@TonyWade aww thanks! I don't play harp  I play violin very badly! think I'll stick to singing!when I need a fanclub I'll be onto you ;-)", "RT @theolcaper: #ProbableTrumpsTweets I'm not only a client I'm the U.S. President"]
[1, 0, 1, 0, 1]


In [3]:
# preprocessklearn.py file (you are actually supposed to import this but for demonstration i have just put this in)
# you can do this by putting the line "from preprocesssklearn import *"
import re
import nltk

def removespchar(text):
    pattern=r'[^a-zA-Z0-9\s]'
    text=re.sub(pattern,'',text)
    return text.lower()

def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text


In [4]:
for i in range(len(xtrain)):
    xtrain[i] = removespchar(xtrain[i])
    xtrain[i] = stemmer(xtrain[i])
for i in range(len(xtest)):
    xtest[i] = removespchar(xtest[i])
    xtest[i] = stemmer(xtest[i])
    
print(xtrain[:5])
print(xtest[:5])

['some thing even spell check cant fix', 'just woke up what an overcast day', 'rt wikileak doj assist attorney peter kadzik out as a mole for hillari clinton campaign httpstcomnhzj310nl httpstcoutqf6b', 'rt truepundit truepundit is now back on facebook go check us out amp give us a like for some exclus content httpstcoxuc0vbs3vb', 'rt distantdist enjoneer01 dubzzzinyafac itsnotawasteif you call shred anim bit a filler']
['topnew pressur on trump like to be intens at second debat with clinton', 'gotobekiddingm i triedand fail', 'rt telcoj black trump repres bring down the hous on chicago wgn redstat httpstcou2eeoragfo', 'tonywad aww thank i dont play harp i play violin veri badli think ill stick to singingwhen i need a fanclub ill be onto you', 'rt theolcap probabletrumpstweet im not onli a client im the us presid']


In [5]:
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
xtrain=cv.fit_transform(xtrain)
xtest=cv.transform(xtest)

In [None]:
print(xtrain[0])

In [None]:
%matplotlib inline

In [6]:
svm=LinearSVC()
svm.fit(xtrain,ytrain)
svmpred = svm.predict(xtest)
print(svmpred[:5])
print(ytest[:5])
print("Support Vector Machine Accuracy Score -> ",accuracy_score(svmpred, ytest)*100)

[1 1 1 1 1]
[1, 0, 1, 0, 1]
Support Vector Machine Accuracy Score ->  71.23333333333333


In [None]:
array = confusion_matrix(ytest,svmpred,labels=[1,0])
df_cm = pd.DataFrame(array, range(2), range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})
plt.show()

In [7]:
nb=naive_bayes.MultinomialNB()
nb.fit(xtrain,ytrain)
nbpred = nb.predict(xtest)
print(nbpred[:5])
print(ytest[:5])
print("Naive Bayes Accuracy Score -> ",accuracy_score(nbpred, ytest)*100)

[1 0 0 0 1]
[1, 0, 1, 0, 1]
Naive Bayes Accuracy Score ->  74.6


In [None]:
array = confusion_matrix(ytest,nbpred,labels=[1,0])
df_cm = pd.DataFrame(array, range(2), range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})
plt.show()

In [8]:
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
lr.fit(xtrain,ytrain)
lrpred = lr.predict(xtest)
print(lrpred[:5])
print(ytest[:5])
print("Logistic Regression Accuracy Score -> ",accuracy_score(lrpred, ytest)*100)

[0 0 0 0 1]
[1, 0, 1, 0, 1]
Logistic Regression Accuracy Score ->  60.71666666666666


In [None]:
array = confusion_matrix(ytest,lrpred,labels=[1,0])
df_cm = pd.DataFrame(array, range(2), range(2))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16})
plt.show()

In [9]:
pickle.dump(cv, open("savedModel/sklearnpynb/BoW.sav","wb"))
pickle.dump(nb, open("savedModel/sklearnpynb/nb.sav","wb"))
pickle.dump(svm, open("savedModel/sklearnpynb/svm.sav","wb"))
pickle.dump(lr, open("savedModel/sklearnpynb/lr.sav","wb"))

In [None]:
import pickle
from preprocesssklearn import *
n = int(input("no of sentences: "))
sentences = [str(input("enter sentence:")) for _ in range(n)]

In [None]:
for i in range(len(sentences)):
    sentences[i] = removespchar(sentences[i])
    sentences[i] = stemmer(sentences[i])

In [None]:
cv = pickle.load(open("savedModel/sklearnpynb/BoW.sav","rb"))
nb = pickle.load(open("savedModel/sklearnpynb/nb.sav","rb"))
svm = pickle.load(open("savedModel/sklearnpynb/svm.sav","rb"))
lr = pickle.load(open("savedModel/sklearnpynb/lr.sav","rb"))

In [None]:
sentences = cv.transform(sentences)
print(svm.predict(sentences))
print(nb.predict(sentences))
print(lr.predict(sentences))