In [50]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import warnings
import pickle

In [51]:
warnings.filterwarnings('ignore')

In [52]:
data = pd.read_csv(r'C:\Users\DULA\PycharmProjects\Big Data Project\data\twitter_training.csv',header=None)
v_data = pd.read_csv(r'C:\Users\DULA\PycharmProjects\Big Data Project\data\twitter_validation.csv',header=None)

In [53]:
data

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [54]:
v_data

Unnamed: 0,0,1,2,3
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [55]:
# columns names
data.columns = ['#', 'refers to', 'sentiment', 'text']
v_data.columns = ['#', 'refers to', 'sentiment', 'text']


In [56]:
data = data.drop(['#', 'refers to' ], axis=1)
v_data = v_data.drop(['#', 'refers to' ], axis=1)

In [57]:
data.dropna(inplace=True, axis=0)

In [58]:
data=data.drop_duplicates()
v_data=v_data.drop_duplicates()

In [59]:
def data_processing(text):
    text = text.lower()
    text = re.sub(r"https\S+|www\S+https\S+", '',text, flags=re.MULTILINE)
    text = re.sub(r'\@w+|\#','',text)
    text = re.sub(r'[^\w\s]','',text)
    text_tokens = word_tokenize(text)
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

In [60]:
data.text = data['text'].apply(data_processing)

In [61]:
v_data.text = v_data['text'].apply(data_processing)

In [62]:
stemmer = PorterStemmer()
def stemming(data):
    text = [stemmer.stem(word) for word in data]
    return data

In [63]:
data['text'] = data['text'].apply(lambda x: stemming(x))
v_data['text'] = v_data['text'].apply(lambda x: stemming(x))

In [64]:
vect = CountVectorizer(ngram_range=(1,2)).fit(data['text'])

In [65]:
# save vectorizer model
filename_vect = r'C:\Users\DULA\PycharmProjects\Big Data Project\models\vectors_model.pkl'
pickle.dump(vect, open(filename_vect, 'wb'))

In [41]:
feature_names = vect.get_feature_names()

In [49]:
feature_names

['00',
 '00 00',
 '00 000',
 '00 11',
 '00 125',
 '00 30',
 '00 ad',
 '00 bit',
 '00 borderlands',
 '00 bst',
 '00 central',
 '00 cest',
 '00 cet',
 '00 cs',
 '00 cst',
 '00 die',
 '00 edt',
 '00 en',
 '00 enus',
 '00 est',
 '00 go',
 '00 kind',
 '00 less',
 '00 mad',
 '00 month',
 '00 per',
 '00 pm',
 '00 ps4pricecheck',
 '00 received',
 '00 show',
 '00 sure',
 '00 thank',
 '00 thestockobserver',
 '00 tickerreport',
 '00 uk',
 '00 utc',
 '00 well',
 '000',
 '000 1comchanneluc_ufy',
 '000 374g',
 '000 400',
 '000 65',
 '000 85',
 '000 amazon',
 '000 british',
 '000 bullet',
 '000 crates',
 '000 credits',
 '000 displaced',
 '000 families',
 '000 female',
 '000 followers',
 '000 game',
 '000 gb',
 '000 gold',
 '000 golden',
 '000 got',
 '000 http',
 '000 innocent',
 '000 letter',
 '000 million',
 '000 patient',
 '000 pay',
 '000 pcqc',
 '000 per',
 '000 playapex_clips',
 '000 points',
 '000 pound',
 '000 pts',
 '000 telugubullet',
 '000 till',
 '000 violent',
 '000 years',
 '00011',
 '00

In [42]:
x_train =  vect.transform(data['text'])
y_train = data['sentiment']
x_test =  vect.transform(v_data['text'])
y_test = v_data['sentiment']

In [43]:
print("Size of x_train:", (x_train.shape))
print("Size of y_train:", (y_train.shape))
print("Size of x_test:", (x_test.shape))
print("Size of y_test:", (y_test.shape))

Size of x_train: (69769, 330220)
Size of y_train: (69769,)
Size of x_test: (999, 330220)
Size of y_test: (999,)


In [44]:
x_train

<69769x330220 sparse matrix of type '<class 'numpy.int64'>'
	with 1461617 stored elements in Compressed Sparse Row format>

In [45]:
logreg = LogisticRegression()
logreg.fit(x_train, y_train)
logreg_pred = logreg.predict(x_test)
logreg_acc = accuracy_score(logreg_pred, y_test)
print("test accuracy: {:.2f}%".format(logreg_acc*100))

test accuracy: 98.40%


In [69]:
logreg_pred

array(['Irrelevant', 'Neutral', 'Negative', 'Negative', 'Neutral',
       'Negative', 'Positive', 'Positive', 'Positive', 'Negative',
       'Positive', 'Positive', 'Negative', 'Neutral', 'Negative',
       'Positive', 'Positive', 'Negative', 'Positive', 'Negative',
       'Negative', 'Neutral', 'Irrelevant', 'Negative', 'Neutral',
       'Neutral', 'Negative', 'Irrelevant', 'Irrelevant', 'Negative',
       'Positive', 'Positive', 'Negative', 'Positive', 'Negative',
       'Neutral', 'Neutral', 'Irrelevant', 'Positive', 'Neutral',
       'Positive', 'Neutral', 'Neutral', 'Neutral', 'Positive', 'Neutral',
       'Negative', 'Negative', 'Negative', 'Neutral', 'Positive',
       'Negative', 'Negative', 'Positive', 'Positive', 'Positive',
       'Positive', 'Positive', 'Negative', 'Irrelevant', 'Negative',
       'Positive', 'Positive', 'Irrelevant', 'Negative', 'Neutral',
       'Negative', 'Irrelevant', 'Neutral', 'Negative', 'Positive',
       'Negative', 'Positive', 'Positive', 'Positi

In [46]:
print(confusion_matrix(y_test, logreg_pred))
print("\n")
print(classification_report(y_test, logreg_pred))

[[168   1   1   2]
 [  0 262   1   3]
 [  0   1 280   4]
 [  1   1   1 273]]


              precision    recall  f1-score   support

  Irrelevant       0.99      0.98      0.99       172
    Negative       0.99      0.98      0.99       266
     Neutral       0.99      0.98      0.99       285
    Positive       0.97      0.99      0.98       276

    accuracy                           0.98       999
   macro avg       0.99      0.98      0.98       999
weighted avg       0.98      0.98      0.98       999



In [48]:
# save LogisticRegression model to models folder
filename = r'C:\Users\DULA\PycharmProjects\Big Data Project\models\LogisticRegression_model.sav'
pickle.dump(logreg, open(filename, 'wb'))

In [29]:
SVCmodel = LinearSVC()
SVCmodel.fit(x_train, y_train)

LinearSVC()

In [30]:
svc_pred = SVCmodel.predict(x_test)
svc_acc = accuracy_score(svc_pred, y_test)
print("test accuracy: {:.2f}%".format(svc_acc*100))

test accuracy: 98.80%


In [31]:
print(confusion_matrix(y_test, svc_pred))
print("\n")
print(classification_report(y_test, svc_pred))

[[170   1   0   1]
 [  0 262   2   2]
 [  0   0 281   4]
 [  1   0   1 274]]


              precision    recall  f1-score   support

  Irrelevant       0.99      0.99      0.99       172
    Negative       1.00      0.98      0.99       266
     Neutral       0.99      0.99      0.99       285
    Positive       0.98      0.99      0.98       276

    accuracy                           0.99       999
   macro avg       0.99      0.99      0.99       999
weighted avg       0.99      0.99      0.99       999

