# Twitter Troll Detection (sklearn algos)

The full version of the project is available on https://github.com/kamperemu/twitter-troll-detection

Now we will install the modules used in the program

In [None]:
!pip install seaborn==0.11.1
!pip install matplotlib==3.3.2
!pip install nltk==3.5
!pip install sklearn
!pip install pandas

## Importing Modules

In [None]:
import pickle
import json
import random
import seaborn as sn
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

## Loading the dataset
The dataset is shuffled and then split into xtrain, ytrain, xtest and ytest (four fifths for training, one fifth for testing). The output of the cell shows the first five training texts and their corresponding labels, followed by the first five testing texts and their corresponding labels.

In [None]:
# Fixed seed for the shuffle so the train/test split is reproducible across
# kernel restarts. A private Random instance keeps the global RNG untouched.
SPLIT_SEED = 42

# Read explicitly as UTF-8 so tweet text decodes the same way on every platform.
with open("datasets/test.json", 'r', encoding="utf-8") as f:
    tweets = json.load(f)
random.Random(SPLIT_SEED).shuffle(tweets)

# The first 4/5 of the shuffled tweets train the models; the rest are held out.
split_at = int(round(4*len(tweets)/5))
train = tweets[:split_at]
test = tweets[split_at:]

# Each tweet is a mapping with a 'content' (text) and a 'label' entry.
xtrain = [tweet['content'] for tweet in train]
ytrain = [tweet['label'] for tweet in train]
xtest = [tweet['content'] for tweet in test]
ytest = [tweet['label'] for tweet in test]

# Quick sanity check: first five texts and labels of each split.
print(xtrain[:5])
print(ytrain[:5])
print(xtest[:5])
print(ytest[:5])

## Pre-processing
Special characters are removed from the text and the text is converted to lowercase. Then every word is reduced to its root form (stemming).

In [None]:
# preprocesssklearn.py file (normally you would import this module; it is inlined here for demonstration)
# you can import it with the line "from preprocesssklearn import *"
import re
import nltk

def removespchar(text):
    """Return `text` lowercased, with every character that is not an ASCII
    letter, a digit or whitespace removed."""
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned.lower()

def stemmer(text):
    """Stem every whitespace-separated word of `text` with NLTK's Porter
    stemmer and return the stems joined by single spaces."""
    porter = nltk.porter.PorterStemmer()
    stems = map(porter.stem, text.split())
    return ' '.join(stems)


The output shows the pre-processed text of the first five training samples and the first five testing samples.

In [None]:
# Clean and stem every text in place (slice assignment keeps the same list objects).
xtrain[:] = [stemmer(removespchar(text)) for text in xtrain]
xtest[:] = [stemmer(removespchar(text)) for text in xtest]

# Show a sample of the pre-processed text from each split.
print(xtrain[:5])
print(xtest[:5])

## Encoding
Created BoW and Tfidf encoded arrays from the pre-processed text data.

In [None]:
# Bag-of-words encoding over unigrams, bigrams and trigrams.
# max_df must be the float 1.0 ("at most 100% of documents"): the integer 1
# means "at most ONE document" and silently drops every n-gram that occurs in
# two or more tweets. min_df=1 keeps every n-gram seen at least once; recent
# scikit-learn releases reject the integer 0.
cv=CountVectorizer(min_df=1,max_df=1.0,binary=False,ngram_range=(1,3))
# Learn the vocabulary on the training text only, then reuse it for the test text.
cvxtrain=cv.fit_transform(xtrain)
cvxtest=cv.transform(xtest)

In [None]:
# TF-IDF encoding over unigrams, bigrams and trigrams.
# max_df must be the float 1.0 ("at most 100% of documents"): the integer 1
# means "at most ONE document" and silently drops every n-gram that occurs in
# two or more tweets. min_df=1 keeps every n-gram seen at least once; recent
# scikit-learn releases reject the integer 0.
tv=TfidfVectorizer(min_df=1,max_df=1.0,use_idf=True,ngram_range=(1,3))
# Learn vocabulary and IDF weights on the training text only, then reuse them for the test text.
tvxtrain=tv.fit_transform(xtrain)
tvxtest=tv.transform(xtest)

## Classifier - training and testing
We train and test the Support Vector Machine, Naive Bayes and Logistic Regression algorithms, first with the BoW vectorizer and then with the Tfidf vectorizer. Each cell prints the predicted labels for the first five test samples, followed by their actual labels, and finally the accuracy score.

In [None]:
# Linear SVM on the bag-of-words features (fit() returns the fitted estimator).
cvsvm = LinearSVC().fit(cvxtrain, ytrain)
cvsvmpred = cvsvm.predict(cvxtest)

# First five predictions next to the first five true labels.
print(cvsvmpred[:5])
print(ytest[:5])
# Accuracy as a percentage, passed as (y_true, y_pred); the score is symmetric.
print(
    "Support Vector Machine Accuracy Score -> ",
    accuracy_score(ytest, cvsvmpred) * 100,
)

In [None]:
# Multinomial Naive Bayes on the bag-of-words features (fit() returns the fitted estimator).
cvnb = naive_bayes.MultinomialNB().fit(cvxtrain, ytrain)
cvnbpred = cvnb.predict(cvxtest)

# First five predictions next to the first five true labels.
print(cvnbpred[:5])
print(ytest[:5])
# Accuracy as a percentage, passed as (y_true, y_pred); the score is symmetric.
print(
    "Naive Bayes Accuracy Score -> ",
    accuracy_score(ytest, cvnbpred) * 100,
)

In [None]:
# L2-regularised logistic regression on the bag-of-words features.
cvlr = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1,
    random_state=42,
).fit(cvxtrain, ytrain)
cvlrpred = cvlr.predict(cvxtest)

# First five predictions next to the first five true labels.
print(cvlrpred[:5])
print(ytest[:5])
# Accuracy as a percentage, passed as (y_true, y_pred); the score is symmetric.
print(
    "Logistic Regression Accuracy Score -> ",
    accuracy_score(ytest, cvlrpred) * 100,
)

In [None]:
# Linear SVM on the TF-IDF features (fit() returns the fitted estimator).
tvsvm = LinearSVC().fit(tvxtrain, ytrain)
tvsvmpred = tvsvm.predict(tvxtest)

# First five predictions next to the first five true labels.
print(tvsvmpred[:5])
print(ytest[:5])
# Accuracy as a percentage, passed as (y_true, y_pred); the score is symmetric.
print(
    "Support Vector Machine Accuracy Score -> ",
    accuracy_score(ytest, tvsvmpred) * 100,
)

In [None]:
# Multinomial Naive Bayes on the TF-IDF features (fit() returns the fitted estimator).
tvnb = naive_bayes.MultinomialNB().fit(tvxtrain, ytrain)
tvnbpred = tvnb.predict(tvxtest)

# First five predictions next to the first five true labels.
print(tvnbpred[:5])
print(ytest[:5])
# Accuracy as a percentage, passed as (y_true, y_pred); the score is symmetric.
print(
    "Naive Bayes Accuracy Score -> ",
    accuracy_score(ytest, tvnbpred) * 100,
)

In [None]:
# L2-regularised logistic regression on the TF-IDF features.
tvlr = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1,
    random_state=42,
).fit(tvxtrain, ytrain)
tvlrpred = tvlr.predict(tvxtest)

# First five predictions next to the first five true labels.
print(tvlrpred[:5])
print(ytest[:5])
# Accuracy as a percentage, passed as (y_true, y_pred); the score is symmetric.
print(
    "Logistic Regression Accuracy Score -> ",
    accuracy_score(ytest, tvlrpred) * 100,
)

The confusion matrix of each algorithm is output in the cells below.

In [None]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [None]:
# Confusion matrix for the SVM trained on bag-of-words features.
# The matrix is built in the order [1, 0], so the axes must carry those same
# labels; positional range(2) ticks would show class 1 under the tick "0".
class_order = [1, 0]
array = confusion_matrix(ytest, cvsvmpred, labels=class_order)
# Rows are actual classes, columns are predicted classes; seaborn uses the
# index/column names as axis labels.
df_cm = pd.DataFrame(
    array,
    index=pd.Index(class_order, name="actual label"),
    columns=pd.Index(class_order, name="predicted label"),
)
sn.set(font_scale=1.4)
# fmt="d" prints the integer counts in full instead of the default '.2g' rounding.
sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16})
plt.title("Support Vector Machine (BoW)")
plt.show()

In [None]:
# Confusion matrix for Naive Bayes trained on bag-of-words features.
# The matrix is built in the order [1, 0], so the axes must carry those same
# labels; positional range(2) ticks would show class 1 under the tick "0".
class_order = [1, 0]
array = confusion_matrix(ytest, cvnbpred, labels=class_order)
# Rows are actual classes, columns are predicted classes; seaborn uses the
# index/column names as axis labels.
df_cm = pd.DataFrame(
    array,
    index=pd.Index(class_order, name="actual label"),
    columns=pd.Index(class_order, name="predicted label"),
)
sn.set(font_scale=1.4)
# fmt="d" prints the integer counts in full instead of the default '.2g' rounding.
sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16})
plt.title("Naive Bayes (BoW)")
plt.show()

In [None]:
# Confusion matrix for Logistic Regression trained on bag-of-words features.
# The matrix is built in the order [1, 0], so the axes must carry those same
# labels; positional range(2) ticks would show class 1 under the tick "0".
class_order = [1, 0]
array = confusion_matrix(ytest, cvlrpred, labels=class_order)
# Rows are actual classes, columns are predicted classes; seaborn uses the
# index/column names as axis labels.
df_cm = pd.DataFrame(
    array,
    index=pd.Index(class_order, name="actual label"),
    columns=pd.Index(class_order, name="predicted label"),
)
sn.set(font_scale=1.4)
# fmt="d" prints the integer counts in full instead of the default '.2g' rounding.
sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16})
plt.title("Logistic Regression (BoW)")
plt.show()

In [None]:
# Confusion matrix for the SVM trained on TF-IDF features.
# The matrix is built in the order [1, 0], so the axes must carry those same
# labels; positional range(2) ticks would show class 1 under the tick "0".
class_order = [1, 0]
array = confusion_matrix(ytest, tvsvmpred, labels=class_order)
# Rows are actual classes, columns are predicted classes; seaborn uses the
# index/column names as axis labels.
df_cm = pd.DataFrame(
    array,
    index=pd.Index(class_order, name="actual label"),
    columns=pd.Index(class_order, name="predicted label"),
)
sn.set(font_scale=1.4)
# fmt="d" prints the integer counts in full instead of the default '.2g' rounding.
sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16})
plt.title("Support Vector Machine (Tfidf)")
plt.show()

In [None]:
# Confusion matrix for Naive Bayes trained on TF-IDF features.
# The matrix is built in the order [1, 0], so the axes must carry those same
# labels; positional range(2) ticks would show class 1 under the tick "0".
class_order = [1, 0]
array = confusion_matrix(ytest, tvnbpred, labels=class_order)
# Rows are actual classes, columns are predicted classes; seaborn uses the
# index/column names as axis labels.
df_cm = pd.DataFrame(
    array,
    index=pd.Index(class_order, name="actual label"),
    columns=pd.Index(class_order, name="predicted label"),
)
sn.set(font_scale=1.4)
# fmt="d" prints the integer counts in full instead of the default '.2g' rounding.
sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16})
plt.title("Naive Bayes (Tfidf)")
plt.show()

In [None]:
# Confusion matrix for Logistic Regression trained on TF-IDF features.
# The matrix is built in the order [1, 0], so the axes must carry those same
# labels; positional range(2) ticks would show class 1 under the tick "0".
class_order = [1, 0]
array = confusion_matrix(ytest, tvlrpred, labels=class_order)
# Rows are actual classes, columns are predicted classes; seaborn uses the
# index/column names as axis labels.
df_cm = pd.DataFrame(
    array,
    index=pd.Index(class_order, name="actual label"),
    columns=pd.Index(class_order, name="predicted label"),
)
sn.set(font_scale=1.4)
# fmt="d" prints the integer counts in full instead of the default '.2g' rounding.
sn.heatmap(df_cm, annot=True, fmt="d", annot_kws={"size": 16})
plt.title("Logistic Regression (Tfidf)")
plt.show()

## Saving the models
We dump all the variables used by the algorithms with pickle's dump function so that they can be reused later.

In [None]:
# Persist both fitted vectorizers and all six fitted classifiers.
# Keys are the file stems under savedModel/sklearnpynb/ (the directory must already exist).
artifacts = {
    "BoW": cv,
    "cvnb": cvnb,
    "cvsvm": cvsvm,
    "cvlr": cvlr,
    "Tfidf": tv,
    "tvnb": tvnb,
    "tvsvm": tvsvm,
    "tvlr": tvlr,
}
for file_stem, fitted in artifacts.items():
    # `with` flushes and closes each file as soon as it is written, instead of
    # leaving the handle open until the garbage collector gets to it.
    with open(f"savedModel/sklearnpynb/{file_stem}.sav", "wb") as fh:
        pickle.dump(fitted, fh)

The code below loads the previously saved variables and uses them to classify new text data.

In [None]:
import pickle
# supposed to put "from preprocesssklearn import *" from cell 4
# Ask how many sentences to classify, then prompt once per sentence.
n = int(input("no of sentences: "))
# input() already returns a str, so no extra conversion is needed.
sentences = [input("enter sentence:") for _prompt_no in range(n)]

In [None]:
# Apply the same cleaning + stemming used on the training data, in place.
sentences[:] = [stemmer(removespchar(sentence)) for sentence in sentences]

In [None]:
def _load_saved(file_stem):
    """Unpickle and return savedModel/sklearnpynb/<file_stem>.sav.

    NOTE: pickle.load can execute arbitrary code stored in the file, so only
    load .sav files that this notebook (or another trusted source) produced.
    """
    # `with` closes the file handle deterministically once the object is loaded.
    with open(f"savedModel/sklearnpynb/{file_stem}.sav", "rb") as fh:
        return pickle.load(fh)


# Fitted vectorizers and classifiers, restored under the same names used when training.
cv = _load_saved("BoW")
cvnb = _load_saved("cvnb")
cvsvm = _load_saved("cvsvm")
cvlr = _load_saved("cvlr")
tv = _load_saved("Tfidf")
tvnb = _load_saved("tvnb")
tvsvm = _load_saved("tvsvm")
tvlr = _load_saved("tvlr")

In [None]:
# Bag-of-words models: encode with the BoW vectorizer they were trained on.
cvsentences = cv.transform(sentences)
print(cvsvm.predict(cvsentences))
print(cvnb.predict(cvsentences))
print(cvlr.predict(cvsentences))

# TF-IDF models: these must receive the TF-IDF encoding. Feeding them the raw
# BoW counts (cvsentences) gives predictions on features they were never trained on.
tvsentences = tv.transform(sentences)
print(tvsvm.predict(tvsentences))
print(tvnb.predict(tvsentences))
print(tvlr.predict(tvsentences))