In [1]:
import kaggle

# Verify authentication by listing available datasets
!kaggle datasets list


401 - Unauthorized - Unauthenticated


In [2]:
#API to fetch Twitter dataset from Kaggle
#!/bin/bash
!kaggle datasets download kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
from zipfile import ZipFile

# Path to the ZIP file (Make sure this is correct)
zip_file_path = "sentiment140.zip"  # Update this if necessary

# Extract the files
with ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall("sentiment140_extracted")  # Extract to a folder
    print("Extraction complete.")

Extraction complete.


In [4]:
# Importing dependencies
import pandas as pd #dataframe
import numpy as np
import re #pattern matching search etc.
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer #nlp libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score #calculate model performance

In [5]:
# Stopwords - Ignore common words like 'i', 'the', 'a'
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Stopwords are like filler words, they do have meaning for formation but not influential for the sentence context.
# The model would not consider these words as part of context during preprocessing.
# We will remove these types of words from the dataset and keep only important words.
# Shortening the huge dataset extracting only important features

In [7]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
data = pd.read_csv('sentiment140_extracted/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1')

In [9]:
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [10]:
#naming the columns
col = ['target', 'ids', 'date', 'flag', 'user', 'text']
data = pd.read_csv('sentiment140_extracted/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', names=col)

In [11]:
data.shape

(1600000, 6)

In [12]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
#checking for and counting missing values
data.isnull().sum()

target    0
ids       0
date      0
flag      0
user      0
text      0
dtype: int64

In [14]:
#checking distribution of target column (how many positive, negative, neutral tweets)
data['target'].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [15]:
# No neutral tweet, evenly distributed positive and negative tweets. Upsampling or downsampling not required.
# Convert the target "4" to "1" and "0" to "0" for better understanding.

In [16]:
data.replace({'target':{4:1}}, inplace = True)

In [17]:
# 0 --> negative tweet
# 1 --> positive tweet
data['target'].value_counts()

target
0    800000
1    800000
Name: count, dtype: int64

In [18]:
#Stemming
portstem = PorterStemmer()

In [19]:
# Load stopwords once outside the loop
stop_words = set(stopwords.words('english'))

def stemming(content):
    """
    Optimized stemming function
    """
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)  # remove characters that are not a-z
    stemmed_content = stemmed_content.lower()  # convert all the words to lowercase
    stemmed_content = stemmed_content.split()  # splitting all words into a list
    stemmed_content = [portstem.stem(word) for word in stemmed_content if word not in stop_words]  # reduced word form
    stemmed_content = ' '.join(stemmed_content)  # joining the words back
    return stemmed_content

In [20]:
data['stemmed_text'] = data['text'].apply(stemming)

In [21]:
data.head()

Unnamed: 0,target,ids,date,flag,user,text,stemmed_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [22]:
#separating data and model
x = data['stemmed_text'].values
y = data['target'].values

In [23]:
print(x)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [24]:
print(y)

[0 0 0 ... 1 1 1]


In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 42)
# 80% of the data is used for training and 20% for testing
# stratify parameter makes sure that the distribution of target variable is even in both training and testing datasets
# random state is used to reproduce the same results every time the code is run

In [26]:
print(x.shape, x_train.shape, x_test.shape)

(1600000,) (1280000,) (320000,)


In [27]:
print(x_train)

['paisleypaisley lol get idea far advanc even june yet need third knitter summer group'
 'worst headach ever'
 'ewaniesciuszko sad wont see miss alreadi yeah perfect come back th' ...
 'got home meet talk endlessli one coolest guy ever met smile'
 'bought chocol bar quot win free bar quot label win either'
 'misecia said hope dm email sunday']


In [28]:
print(x_test)

['stm denali ye black red fav color realli want color def look awesom jare'
 'qu buy open hous weekend pm best valu one bedroom lic long island citi bd http tinyurl com pt nqd'
 'ginoandfran fran greet air okay hahahaha thank' ...
 'la brat follow also hope atleast get also wish get well soon'
 'feel like decent swell sinc last fall hope wave myrtl beach week either least golf'
 'relaxin busi day']


In [29]:
# Feature Extraction

# Convert text to numerical data using Vectorization.
# using TfidfVectorizer 

In [30]:
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train) #fitting the model and transforming the text data to numerical data
x_test = vectorizer.transform(x_test) #transforming the test data

In [31]:
# will consider each word in the text and assign a unique number to it. 
# The model will consider these numbers for prediction.
# also the model will consider the frequency of the words in the text.
# The model will consider the importance of the word in the text.

In [32]:
print(x_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9453607 stored elements and shape (1280000, 461280)>
  Coords	Values
  (0, 307108)	0.46206048815324474
  (0, 239679)	0.15130037108228483
  (0, 146067)	0.12929728405657018
  (0, 175252)	0.224070805470346
  (0, 128605)	0.22108856600702773
  (0, 4832)	0.317074267861159
  (0, 124524)	0.18318401951949756
  (0, 205794)	0.24140229063801746
  (0, 454381)	0.20169626473577715
  (0, 286478)	0.16123218610004272
  (0, 406297)	0.2978221095272138
  (0, 220296)	0.43015677907624866
  (0, 388138)	0.20555120011808467
  (0, 154767)	0.26976607043258233
  (1, 445870)	0.6361096685891185
  (1, 161801)	0.5778049407933611
  (1, 124611)	0.5113765148324884
  (2, 125319)	0.6383069130836649
  (2, 349409)	0.22232944888223494
  (2, 444761)	0.30331529032956345
  (2, 358186)	0.19837942712286838
  (2, 267649)	0.19309660201644555
  (2, 12436)	0.2529872032123258
  (2, 453420)	0.2347069337186747
  (2, 312657)	0.3154702974657607
  :	:
  (1279997, 124611)	0.253778

In [33]:
print(x_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2288367 stored elements and shape (320000, 461280)>
  Coords	Values
  (0, 28874)	0.1778395103911245
  (0, 43712)	0.23562815302828183
  (0, 78636)	0.5158100011206617
  (0, 96399)	0.255967788489452
  (0, 97585)	0.4019235611854435
  (0, 129417)	0.25650960779862714
  (0, 189057)	0.31324918577405797
  (0, 240451)	0.15341308097014625
  (0, 334643)	0.14719329779308424
  (0, 335577)	0.22602158147814247
  (0, 384697)	0.3281164007446601
  (0, 435956)	0.14183025329879742
  (0, 453357)	0.1781708363247895
  (1, 35118)	0.3128685670821343
  (1, 36669)	0.2729958846742327
  (1, 39445)	0.16867093960211466
  (1, 57115)	0.19043301054662504
  (1, 74274)	0.21148120876702692
  (1, 78790)	0.13386322067407883
  (1, 170374)	0.17525273735418329
  (1, 171245)	0.12468774856570086
  (1, 183279)	0.24586158827112847
  (1, 233854)	0.3852709938491561
  (1, 240223)	0.1674195650536303
  (1, 301683)	0.13212235134015302
  :	:
  (319997, 135536)	0.218099649775323

In [34]:
# Training the ML model using Logistic Regression
# Logistic Regression is a classification algorithm used to predict the probability of a target variable.
# It is used to predict the probability of a target variable.
# We will use the Logistic Regression model to predict the sentiment using probability of positive or negative tweet.

In [35]:
model = LogisticRegression(max_iter = 1000)

In [36]:
model.fit(x_train, y_train)

In [37]:
# Model Evaluation
# Accuracy Score
# Accuracy is the ratio of correctly predicted instances to the total instances in the dataset.
# It is used to measure the performance (how many correct predictions) of the model.

In [38]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [39]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7999984375


In [40]:
# Checking accuracy of the test data
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [41]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.776796875


In [42]:
# saving the vectorizer
import pickle
filename = 'vectorizer.pkl'
pickle.dump(vectorizer, open(filename, 'wb'))

In [43]:
# Saving the trained model to a file
import pickle

In [44]:
filename = 'model.pkl'
pickle.dump(model, open(filename, 'wb'))
# wb - write binary

In [45]:
# checking and using saved model for future use

In [46]:
loaded_model = pickle.load(open(filename, 'rb'))
# rb - read binary

In [47]:
x_new = x_test[3]
print(y_test[3])

1


In [48]:
prediction = loaded_model.predict(x_new)
print(prediction)
# stored in list

[1]


In [49]:
if(prediction[0] == 0):
  print('Negative Tweet')
else:
    print('Positive Tweet')

Positive Tweet


In [50]:
# Recap
# 1. Imported the Kaggle API (kaggle.json)
# 2. Downloaded the Twitter dataset from Kaggle
# 3. Extracted the dataset
# 4. Imported the required libraries - pandas, numpy, re, nltk, TfidfVectorizer, train_test_split, LogisticRegression, accuracy_score
# 5. Read the dataset using pandas
# 6. Preprocessed the data - Removed missing values, converted the target column, performed stemming
# 7. Split the data into training and testing datasets
# 8. Converted the text data to numerical data using TfidfVectorizer
# 9. Trained the Logistic Regression model (max_iter = 1000), supervised learning due to target variable
# 10. Evaluated the model using accuracy score - Training and Testing data (both > 75%)
# 11. Saved the trained model to a file using pickle
# 12. Loaded the saved model and used it for prediction (Positive or Negative Tweet)