In [2]:
!pip install kaggle



In [5]:
# Copy the Kaggle API token into ~/.kaggle, the location the kaggle CLI is
# expected to read it from. Requires kaggle.json in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the Sentiment140 dataset archive (kazanova/sentiment140) using the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140


Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 73% 59.0M/80.9M [00:00<00:00, 215MB/s]
100% 80.9M/80.9M [00:00<00:00, 209MB/s]


In [7]:
from zipfile import ZipFile

# Path of the archive to unpack.
dataset = '/content/sentiment140.zip'

# The handle is named `archive` rather than `zip` so the built-in zip()
# is not shadowed for the remaining cells of the notebook.
with ZipFile(dataset, 'r') as archive:
  # Extract every member into the current working directory.
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [89]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [62]:
import nltk
# Fetch the NLTK resources used by the text-preprocessing step.
nltk.download('stopwords')  # word list behind stopwords.words('english')
nltk.download('punkt_tab')  # presumably the tokenizer data word_tokenize needs — confirm for your NLTK version
nltk.download('wordnet')  # presumably the lexical database WordNetLemmatizer needs — confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [51]:
# Column labels supplied explicitly via `names=`, so the first CSV row is read as data.
column_names = ['target', 'id', 'date', 'flag', 'user', 'text']

# Read the extracted training file using the ISO-8859-1 encoding.
data = pd.read_csv(
    "/content/training.1600000.processed.noemoticon.csv",
    names=column_names,
    encoding='ISO-8859-1',
)

# Preview the first rows.
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [52]:
# Distinct label values present in the raw data.
data['target'].unique() # raw encoding: 0 = negative, 4 = positive

array([0, 4])

In [53]:
# Map the positive label 4 to 1 so that the targets become binary 0/1.
data["target"] = data["target"].replace(to_replace=4, value=1)

# Confirm which label values remain.
data['target'].unique()

array([0, 1])

In [54]:
# Number of rows per label in the full dataset.
print(data['target'].value_counts())


target
0    800000
1    800000
Name: count, dtype: int64


In [55]:
# Number of tweets drawn from each sentiment class.
sample_size_per_class = 250000

# Draw the same-sized, seeded random sample from the positive (1) and
# negative (0) rows, in that order.
positive_samples, negative_samples = (
    data[data['target'] == label].sample(n=sample_size_per_class, random_state=42)
    for label in (1, 0)
)

# Stack the two samples: positives first, then negatives (original index kept).
balanced_data = pd.concat([positive_samples, negative_samples])
print(balanced_data['target'].value_counts())


target
1    250000
0    250000
Name: count, dtype: int64


In [56]:
# Preview the first rows of the balanced sample.
balanced_data.head()


Unnamed: 0,target,id,date,flag,user,text
1012188,1,1881179620,Fri May 22 03:51:54 PDT 2009,NO_QUERY,tarawade,Is lookin 4ward to a long weekend really dont...
1099036,1,1970537555,Sat May 30 04:15:49 PDT 2009,NO_QUERY,Millie_stillie,#myweakness Is music and i live to meet the p...
1275978,1,2001154935,Tue Jun 02 00:00:21 PDT 2009,NO_QUERY,zsangel,figured out the Internet on my new iPod
1388988,1,2053074174,Sat Jun 06 03:10:08 PDT 2009,NO_QUERY,krisignacio,@hillsongunited can't wait to worship with you...
938859,1,1793548492,Thu May 14 03:06:55 PDT 2009,NO_QUERY,_DrInE_,@sillybeggar Congrats James !! I'm sure the bo...


In [67]:
# Built once when this cell runs. Calling stopwords.words('english') inside the
# token loop rebuilds the word list for every single token and then scans it
# linearly; a frozenset gives the same membership answers with O(1) lookups.
_STOP_WORDS = frozenset(stopwords.words('english'))
# The lemmatizer and stemmer are reused across calls instead of being
# re-instantiated for every tweet.
_LEMMATIZER = WordNetLemmatizer()
_STEMMER = PorterStemmer()


def process_text(content):
    """Turn a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. remove URLs (``http...``), ``@mentions`` and ``#hashtags``;
      2. tokenize with ``word_tokenize``;
      3. keep only purely alphabetic tokens and lowercase them;
      4. drop English stop words and lemmatize the remaining tokens;
      5. stem each token with the Porter stemmer.

    Parameters
    ----------
    content : str
        Raw tweet text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; an empty string when
        no token survives the filters.
    """
    # Strip URLs, user mentions and hashtags before tokenizing.
    content = re.sub(r"http\S+", "", content)
    content = re.sub(r"@\w+", "", content)
    content = re.sub(r"#\w+", "", content)

    # Tokenization: alphabetic tokens only, lowercased. This also discards
    # punctuation, numbers and contraction fragments such as "n't".
    tokens = [token.lower() for token in word_tokenize(content) if token.isalpha()]

    # Stop-word removal happens before lemmatization, on the lowercased token.
    tokens = [_LEMMATIZER.lemmatize(word) for word in tokens if word not in _STOP_WORDS]

    # Stemming.
    tokens = [_STEMMER.stem(word) for word in tokens]

    return ' '.join(tokens)
# Run process_text on every tweet and store the result in a new 'processed_text' column.
# NOTE: this is one Python-level call per row of the balanced sample, so the cell can take a while.
balanced_data['processed_text'] = balanced_data['text'].apply(process_text)
balanced_data.head()


Unnamed: 0,target,id,date,flag,user,text,processed_text
1012188,1,1881179620,Fri May 22 03:51:54 PDT 2009,NO_QUERY,tarawade,Is lookin 4ward to a long weekend really dont...,lookin long weekend realli dont want go work t...
1099036,1,1970537555,Sat May 30 04:15:49 PDT 2009,NO_QUERY,Millie_stillie,#myweakness Is music and i live to meet the p...,music live meet peopl make
1275978,1,2001154935,Tue Jun 02 00:00:21 PDT 2009,NO_QUERY,zsangel,figured out the Internet on my new iPod,figur internet new ipod
1388988,1,2053074174,Sat Jun 06 03:10:08 PDT 2009,NO_QUERY,krisignacio,@hillsongunited can't wait to worship with you...,ca wait worship guy tonight much fun
938859,1,1793548492,Thu May 14 03:06:55 PDT 2009,NO_QUERY,_DrInE_,@sillybeggar Congrats James !! I'm sure the bo...,congrat jame sure book go huge success


In [45]:
# Show the NLTK English stop-word list that process_text filters tokens against.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [68]:
# Keep only the label and the preprocessed text for modelling.
# errors='ignore' keeps this cell safe to re-run: once the columns are gone,
# a plain drop would raise KeyError on the second execution.
balanced_data = balanced_data.drop(
    columns=['id', 'date', 'flag', 'user', 'text'],
    errors='ignore',
)
balanced_data.head()

Unnamed: 0,target,processed_text
1012188,1,lookin long weekend realli dont want go work t...
1099036,1,music live meet peopl make
1275978,1,figur internet new ipod
1388988,1,ca wait worship guy tonight much fun
938859,1,congrat jame sure book go huge success


In [70]:
# (rows, columns) of the frame after dropping the unused columns.
balanced_data.shape

(500000, 2)

In [73]:
# Features: the preprocessed tweet strings, as an array.
X=balanced_data['processed_text'].values
# Labels: the 0/1 sentiment targets, as an array.
Y=balanced_data['target'].values
# Display the feature array.
X

array(['lookin long weekend realli dont want go work tho x',
       'music live meet peopl make', 'figur internet new ipod', ...,
       'ouchhhhh swollen ankl', 'mcdo never call think come',
       'go drill first thing way start day'], dtype=object)

In [77]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Sizes of the two splits, followed by the raw training texts.
print(x_train.shape, x_test.shape)
print(x_train)

(400000,) (100000,)
['wow im realli go watch pcd wednesday bought ticket earlier today excit'
 'happi mother day everyon' 'look forward' ... 'lt'
 'kid class go field trip today smart enough volunt help'
 'ick inch rain today']


In [78]:
# TF-IDF vectorizer constructed with no arguments, i.e. the library's default settings.
vectorizer=TfidfVectorizer()

In [79]:
# Fit the TF-IDF vectorizer on the training texts only, then apply the fitted
# vectorizer to the test texts, so the test split does not influence the fit.
# NOTE(review): x_train / x_test are rebound here from raw text arrays to the
# vectorized matrices. Re-running this cell on its own would feed the
# already-vectorized data back into the vectorizer; re-run the train/test
# split cell first.
x_train=vectorizer.fit_transform(x_train)
x_test=vectorizer.transform(x_test)
# Print both transformed matrices.
print(x_train)
print(x_test)

  (0, 85367)	0.2751428883162777
  (0, 36344)	0.20834328425138532
  (0, 62083)	0.20764374182516931
  (0, 29907)	0.16031349918042798
  (0, 82853)	0.2131590551119701
  (0, 56901)	0.47146265600916165
  (0, 83217)	0.34761117834997757
  (0, 9623)	0.31734236809645927
  (0, 76188)	0.31372988231209714
  (0, 21862)	0.33873405927119443
  (0, 76766)	0.1919234661207177
  (0, 24238)	0.2698476938453039
  (1, 32358)	0.4722710632591138
  (1, 49933)	0.6202281563676426
  (1, 18263)	0.3517218584814509
  (1, 24014)	0.5182362502983126
  (2, 44430)	0.5951199973265487
  (2, 27040)	0.8036368513091274
  (3, 60040)	0.36766027452040656
  (3, 51732)	0.26341197923667786
  (3, 13062)	0.36340143103643474
  (3, 57988)	0.36219283040642053
  (3, 29635)	0.47832355366273677
  (3, 63443)	0.55081973754442
  (4, 29907)	0.21457485912795196
  :	:
  (399994, 74163)	0.5349810704913426
  (399995, 85210)	1.0
  (399996, 29174)	0.18960165033416676
  (399996, 2709)	0.235290310515868
  (399996, 71818)	0.23602668477661531
  (399996, 67

In [85]:
# Logistic-regression classifier trained on the TF-IDF features.
# max_iter is set explicitly to 1500 — presumably to give the solver enough
# iterations to converge on this data; confirm.
model=LogisticRegression(max_iter=1500)
model.fit(x_train,y_train)

In [86]:
# Accuracy on the data the model was fitted on; a reference point for the test accuracy.
x_train_prediction=model.predict(x_train)
training_data_accuracy=accuracy_score(y_train,x_train_prediction)
print("Accuracy on training data:",training_data_accuracy)

Accuracy on training data: 0.7936


In [87]:
# Accuracy on the held-out test split.
x_test_prediction=model.predict(x_test)
test_data_accuracy=accuracy_score(y_test,x_test_prediction)
print("Accuracy on test data:",test_data_accuracy)

Accuracy on test data: 0.77105


In [90]:
# Per-class precision / recall / F1 on the held-out test split.
# The report is built from the predictions computed in this cell, so the cell
# does not depend on a prediction variable left over from another cell.
y_pred = model.predict(x_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77     50201
           1       0.76      0.79      0.77     49799

    accuracy                           0.77    100000
   macro avg       0.77      0.77      0.77    100000
weighted avg       0.77      0.77      0.77    100000

