<a href="https://colab.research.google.com/github/kamijoseph/Twitter-Sentiment-Analysis/blob/main/research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install kaggle



In [6]:
# configuring the path of kaggle.json file:
# create ~/.kaggle if missing, copy the token file from the current working
# directory into it, then set its mode to 600 (owner read/write only)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [7]:
# kaggle CLI call that downloads the sentiment140 dataset archive
!kaggle datasets download kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 2.76GB/s]


In [8]:
# extracting the data
from zipfile import ZipFile

dataset = "/content/sentiment140.zip"
# The handle is named `archive` so the built-in `zip` is not rebound for the
# rest of the notebook session.
with ZipFile(dataset, "r") as archive:
  # extractall() with no arguments unpacks into the current working directory
  archive.extractall()
  print("the dataset is extracted")

the dataset is extracted


In [48]:
# libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [10]:
import nltk
# fetch the NLTK "stopwords" corpus that stopwords.words(...) reads later on
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# English stop-word list from the NLTK corpus; used by the stemming helper
# to drop uninformative tokens
en_stopwords = stopwords.words("english")
print(en_stopwords)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [12]:
# number of entries in the English stop-word list
len(en_stopwords)

198

## data processing

In [13]:
# First look at the raw CSV. No `names`/`header` argument is given here, so
# pandas takes the first data row as the column labels (visible in the
# shape/columns checks that follow); the file is re-read with explicit column
# names further down.
twitter_data = pd.read_csv("/content/training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1")
twitter_data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [14]:
# (rows, columns) of the first, header-less load
twitter_data.shape

(1599999, 6)

In [15]:
# column labels pandas inferred for the first load
twitter_data.columns

Index(['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY',
       '_TheSpecialOne_',
       '@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D'],
      dtype='object')

In [16]:
# Re-read the CSV with explicit column labels so that the first record is
# kept as data instead of being used as the header row.
column_names = ["target", "id", "date", "flag", "user", "text"]
twitter_data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    encoding="ISO-8859-1",
    names=column_names,
)
twitter_data.shape

(1600000, 6)

In [17]:
# first rows of the re-loaded frame, now with the named columns
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [18]:
# column labels as a plain Python list
twitter_data.columns.tolist()

['target', 'id', 'date', 'flag', 'user', 'text']

In [19]:
# last rows of the frame
twitter_data.tail()

Unnamed: 0,target,id,date,flag,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [20]:
# missing values: number of nulls in each column
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [21]:
# checking duplicates: number of rows that repeat an earlier row in full
twitter_data.duplicated().sum()

np.int64(0)

In [22]:
# target distribution: number of tweets per label value
twitter_data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


In [23]:
# Relabel the class encoded as 4 to 1 so the target takes the values 0 and 1,
# then show the resulting class counts.
twitter_data["target"] = twitter_data["target"].replace(to_replace=4, value=1)
twitter_data["target"].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


### stemming

In [24]:
# one Porter stemmer instance, reused by the stemming helper and pickled at the end
port_stem = PorterStemmer()

In [25]:

# stemming process helper function
def stemming(content, stemmer=None, stop_words=None):
  """Normalise one piece of text into a space-joined string of stemmed tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case, split on whitespace, drop stop words, stem what remains.

  Args:
    content: raw text (str) to normalise.
    stemmer: object exposing ``stem(word)``. Defaults to the notebook-level
      ``port_stem``.
    stop_words: iterable of lower-case words to discard. Defaults to the
      notebook-level ``en_stopwords``.

  Returns:
    str: stemmed tokens joined by single spaces ("" when nothing survives).
  """
  if stemmer is None:
    stemmer = port_stem
  if stop_words is None:
    stop_words = en_stopwords
  # Hash-based membership test instead of scanning the whole stop-word list
  # once per token.
  if not isinstance(stop_words, (set, frozenset)):
    stop_words = frozenset(stop_words)
  tokens = re.sub("[^a-zA-Z]", ' ', content).lower().split()
  return ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)

In [26]:
# stemming the tweet text and storing the result in a new column "stemmed_content"
twitter_data["stemmed_content"] = twitter_data["text"].apply(stemming)

In [27]:
# first rows, now including the stemmed_content column
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [28]:
# (rows, columns) after adding stemmed_content
twitter_data.shape

(1600000, 7)

In [29]:
# null count per column after stemming
twitter_data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0
stemmed_content,0


In [30]:
# preview of the stemmed text column
print(twitter_data["stemmed_content"])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [32]:
# preview of the label column
print(twitter_data["target"])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [33]:
# distinct values of the flag column and how often each occurs
twitter_data["flag"].value_counts()

Unnamed: 0_level_0,count
flag,Unnamed: 1_level_1
NO_QUERY,1600000


In [34]:
# Pull the model inputs (stemmed text) and the labels out of the frame as
# NumPy arrays, then report their shapes.
X = twitter_data["stemmed_content"].values
y = twitter_data["target"].values
print(X.shape, y.shape)

(1600000,) (1600000,)


In [35]:
# preview of the feature array (stemmed text)
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [36]:
# preview of the label array
print(y)

[0 0 0 ... 1 1 1]


In [37]:
# Hold out 20% of the samples for testing; the split is stratified on the
# labels and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=21)
print(X.shape, X_train.shape, X_test.shape)

(1600000,) (1280000,) (320000,)


In [38]:
# preview of the training texts
print(X_train)

['meaningoftruth chri ahhahah u exagger' 'honigkek much xxx'
 'anticip season' ...
 'prepar spend long time isol far far away gonna hold candl'
 'watch we craven dracula' 'miss last pub night ever also miss stanford']


In [39]:
# preview of the test texts
print(X_test)

['thee phone makk watchingg georg lopezz bbi oh ye tha wayi naynay amazingli cute'
 'lolkat come sizzler' 'suebe see follow' ...
 'ileocastro aww someth wrong'
 'blarg hate wake whole bodi ach bad that less everi morn'
 'kali roll cours problem see nice believ']


### vectorization (feature extraction)

In [40]:
# TF-IDF features: the vectorizer is fitted on the training texts only and
# then applied unchanged to the test texts.
vectorizer = TfidfVectorizer()

# NOTE(review): X_train / X_test are rebound from text arrays to the
# vectorized matrices here, so re-running this cell without first re-running
# the train/test split would feed matrices (not text) back into the
# vectorizer -- presumably an error; confirm before re-executing out of order.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [41]:
# textual summary of the vectorized training matrix
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9454888 stored elements and shape (1280000, 461579)>
  Coords	Values
  (0, 258538)	0.5615958742402243
  (0, 71491)	0.3414461272789828
  (0, 6634)	0.5662331016512556
  (0, 125432)	0.4973979199764929
  (1, 169179)	0.8399422882616726
  (1, 279030)	0.2985215528137782
  (1, 452097)	0.45319072684179645
  (2, 20061)	0.8177584951290275
  (2, 357922)	0.575561502920677
  (3, 318030)	0.4654757875788117
  (3, 411468)	0.3076815313685869
  (3, 436781)	0.34192345656464807
  (3, 111726)	0.56344040438043
  (3, 302274)	0.5042693992253218
  (4, 260878)	0.5133389820696035
  (4, 259167)	0.5516665194734077
  (4, 332040)	0.5987757849242197
  (4, 86847)	0.27132029074181696
  (5, 146051)	0.12244572738442693
  (5, 249344)	0.29728399289147717
  (5, 392223)	0.2665795399269735
  (5, 134644)	0.24380495056901758
  (5, 298750)	0.22118553934592527
  (5, 100639)	0.20604388265516682
  (5, 231289)	0.19595180867566553
  :	:
  (1279996, 168072)	0.450226779727133

In [42]:
# textual summary of the vectorized test matrix
print(X_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2287406 stored elements and shape (320000, 461579)>
  Coords	Values
  (0, 13793)	0.2844498202150151
  (0, 34723)	0.28528052858448505
  (0, 86847)	0.1857054283639171
  (0, 145550)	0.25742073485308153
  (0, 249389)	0.4098326495569945
  (0, 285847)	0.3894889625219963
  (0, 299268)	0.13816451987464065
  (0, 315183)	0.16886500264845214
  (0, 400346)	0.24452302368924736
  (0, 402336)	0.27361522167777597
  (0, 436825)	0.34324738564702945
  (0, 437240)	0.302308080962376
  (0, 453597)	0.16067470390970423
  (1, 78708)	0.2532236864793903
  (1, 239957)	0.7313003778770011
  (1, 369941)	0.6333068149978671
  (2, 135495)	0.7549619961653404
  (2, 358352)	0.6557685447976631
  (3, 2467)	0.3140512588234394
  (3, 164017)	0.21218492773934838
  (3, 239310)	0.2715901508769489
  (3, 316637)	0.3437830168978159
  (3, 358113)	0.3440996800110967
  (3, 369673)	0.25503440582172143
  (3, 416193)	0.18932348059956183
  :	:
  (319996, 146051)	0.24858044582410

## training the model(s)

### logistic regression

In [50]:
# L2-regularised logistic regression fitted with the liblinear solver.
# C is the inverse of the regularisation strength; max_iter caps solver iterations.
model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='liblinear',
    max_iter=1000,
)
model.fit(X_train, y_train)

In [46]:
# Score the fitted model on the data it was trained on: overall accuracy
# first, then the per-class precision/recall/F1 report.
train_pred = model.predict(X_train)

train_acc = accuracy_score(y_train, train_pred)
print("accuracy on training: ", train_acc)

train_report = classification_report(y_train, train_pred)
print("classification report:\n", train_report)

accuracy on training:  0.79665


In [49]:
# Score the fitted model on the held-out test split: overall accuracy first,
# then the per-class precision/recall/F1 report.
test_pred = model.predict(X_test)

test_acc = accuracy_score(y_test, test_pred)
print("accuracy on testing: ", test_acc)

test_report = classification_report(y_test, test_pred)
print("classification report:\n", test_report)

accuracy on testing:  0.775434375
classification report:
               precision    recall  f1-score   support

           0       0.79      0.76      0.77    160000
           1       0.77      0.79      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



### saving the model

In [53]:
import pickle

# Serialise the classifier, the stemmer and the fitted vectorizer, each to
# its own file in the current working directory (written in this order).
artifacts = (
  ("logreg_model.sav", model),
  ("stemmer.sav", port_stem),
  ("vectorizer.sav", vectorizer),
)
for filename, obj in artifacts:
  with open(filename, "wb") as f:
    pickle.dump(obj, f)

## Phase one complete — see DL.ipynb for phase two