In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kazanova/sentiment140")
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/kaidashova/.cache/kagglehub/datasets/kazanova/sentiment140/versions/2


In [2]:
import pandas as pd
import os
df = pd.read_csv(os.path.join(path,'training.1600000.processed.noemoticon.csv'), encoding='latin-1', header=None)
df = df[[0, 5]]
df.columns = ['sentiment', 'text']
df

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [4]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaidashova/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r"http\S+|@\w+|#\w+", "", text)
    text = re.sub(r"[^\w\s]", "", text.lower())   
    tokens = [stemmer.stem(w) for w in text.split() if w not in stop_words]
    return " ".join(tokens)

df['text'] = df['text'].apply(clean_text)

Why is Preprocessing Important?
Standardization: Removing unnecessary characters like punctuation and URLs makes sure the model focuses on the important words.
Noise Reduction: By removing stopwords, we remove words that don’t help with sentiment analysis, making the model more efficient.
Stemming: Reducing words to their base form allows the model to recognize the same word in different forms (e.g., “run” and “running”).
Lowercase: Ensures that "Happy" and "happy" are treated as the same word.

In [6]:
df

Unnamed: 0,sentiment,text
0,0,awww that bummer shoulda got david carr third day
1,0,upset cant updat facebook text might cri resul...
2,0,dive mani time ball manag save 50 rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad cant see
...,...,...
1599995,4,woke school best feel ever
1599996,4,thewdbcom cool hear old walt interview â
1599997,4,readi mojo makeov ask detail
1599998,4,happi 38th birthday boo alll time tupac amaru ...


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['sentiment'], test_size=0.2)

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

print(classification_report(y_test, model.predict(X_test_vec)))


joblib.dump(model, "../model/sentiment_model.pkl")
joblib.dump(vectorizer, "../model/vectorizer.pkl")


              precision    recall  f1-score   support

           0       0.78      0.75      0.77    159998
           4       0.76      0.79      0.78    160002

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



['../model/vectorizer.pkl']