In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [4]:
data = pd.read_csv('../artifacts/sentiment_analysis.csv')

In [5]:
 data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


## Data Preprocessing

In [6]:
data.shape


(7920, 3)

In [7]:
data.duplicated().sum()

np.int64(0)

In [8]:
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

### Text Preprocessing

In [9]:
import re
import string

convert uppercase to lowercase

In [10]:
 data["tweet"] = data["tweet"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [11]:
 data["tweet"].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

remove links


In [12]:
 data["tweet"] = data["tweet"].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))

In [13]:
 data["tweet"].head(5)

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [14]:
data["tweet"] = data["tweet"].apply(lambda text: re.sub(r'https?://\S+', '', text))

remove punctuation

In [15]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table mapping each punctuation character to None drops
    # all of them in one pass over the string.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

data["tweet"] = data["tweet"].apply(remove_punctuations)

In [17]:
 data["tweet"].head(5)

0    fingerprint pregnancy test  android apps beaut...
1    finally a transparant silicon case  thanks to ...
2    we love this would you go talk makememories un...
3    im wired i know im george i was made that way ...
4    what amazing service apple wont even talk to m...
Name: tweet, dtype: object

remove numbers

In [21]:
# Delete every run of digits from the tweets.
# The pattern is a raw string so that \d reaches the regex engine verbatim; in a
# plain literal '\d' is an invalid escape sequence and triggers a warning.
data["tweet"] = data["tweet"].str.replace(r'\d+', '', regex=True)

In [22]:
data["tweet"].tail(10)

7910    perfect match instagood applewatch red instagr...
7911    i am completely in love with the new iphone em...
7912    tune in turn on drop out  gtd in one app  mobi...
7913    ok so my galaxy crashed after one day now i ha...
7914    gain followers rt this must follow me i follow...
7915    live out loud lol liveoutloud selfie smile son...
7916    we would like to wish you an amazing day make ...
7917    helping my lovely  year old neighbor with her ...
7918    finally got my smart pocket wifi stay connecte...
7919    apple barcelona apple store bcn barcelona trav...
Name: tweet, dtype: object

In [23]:
%autosave 60

Autosaving every 60 seconds


In [27]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 4.0 MB/s eta 0:00:00
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2026.1.15-cp310-cp310-win_amd64.whl (277 kB)
     -------------------------------------- 277.8/277.8 kB 2.4 MB/s eta 0:00:00
Collecting click
  Downloading click-8.3.1-py3-none-any.whl (108 kB)
     ---------------------------------------- 108.3/108.3 kB ? eta 0:00:00
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.3.1 nltk-3.9.2 regex-2026.1.15 tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 25.3
[notice] To update, run: C:\Users\User\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [28]:
import nltk

In [30]:
nltk.download('stopwords',download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [31]:
# Read the downloaded English stopword list, one word per line, into `sw`.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('../static/model/corpora/stopwords/english', 'r', encoding='utf-8') as file:
    sw = file.read().splitlines()

In [32]:
sw

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [33]:
# Drop every whitespace-delimited token that appears in the stopword list.
# A set gives constant-time membership tests instead of scanning the `sw` list
# once per token.
# NOTE(review): punctuation was already stripped from the tweets in an earlier
# cell, so apostrophe forms in the list ("i'm", "don't") can no longer match;
# tokens such as "im" and "wont" survive this filter (see the output below).
stopword_set = set(sw)
data["tweet"] = data["tweet"].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stopword_set)
)

In [35]:
 data["tweet"].head(5)

0    fingerprint pregnancy test android apps beauti...
1    finally transparant silicon case thanks uncle ...
2    love would go talk makememories unplug relax i...
3    im wired know im george made way iphone cute d...
4    amazing service apple wont even talk question ...
Name: tweet, dtype: object

Stemming (reduce each word to its base form)

In [37]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [38]:
data["tweet"] = data["tweet"].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))

In [39]:
data["tweet"].head(5)

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    im wire know im georg made way iphon cute dave...
4    amaz servic appl wont even talk question unles...
Name: tweet, dtype: object

## Building Vocabulary

In [40]:
from collections import Counter
vocab = Counter()

In [41]:
vocab

Counter()

In [42]:
for sentence in data['tweet']:
    vocab.update(sentence.split())

In [43]:
len(vocab)

15904

In [44]:
data.shape

(7920, 3)

In [45]:
tokens = [key for key in vocab if vocab[key]>10]

In [46]:
len(tokens)

1146

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Step 1: Vectorize tweets (TF-IDF) over the frequency-filtered tokens.
# The instance is named `tfidf_vectorizer` so it does not share a name with the
# `vectorizer()` function defined later in this notebook.
tfidf_vectorizer = TfidfVectorizer(vocabulary=tokens)
X = tfidf_vectorizer.fit_transform(data['tweet'])
y = data['label']  # target variable

# Step 2: Keep the 500 features with the highest chi-square score against the label.
# NOTE(review): the selector is fitted on the whole dataset, i.e. before the
# train/test split further down, so the test rows take part in feature selection.
selector = SelectKBest(chi2, k=500)
X_selected = selector.fit_transform(X, y)

# Replace `tokens` with the names of the selected features only.
tokens = [
    feature
    for feature, selected in zip(tfidf_vectorizer.get_feature_names_out(), selector.get_support())
    if selected
]

In [51]:
len(tokens)

500

In [52]:
tokens

['android',
 'beauti',
 'cute',
 'health',
 'iger',
 'iphoneonli',
 'iphonesia',
 'iphon',
 'final',
 'case',
 'yay',
 'soni',
 'xperia',
 'love',
 'would',
 'go',
 'relax',
 'smartphon',
 'connect',
 'im',
 'know',
 'home',
 'amaz',
 'servic',
 'appl',
 'wont',
 'even',
 'pay',
 'stupid',
 'support',
 'softwar',
 'updat',
 'fuck',
 'phone',
 'time',
 'happi',
 'instap',
 'instadaili',
 'xperiaz',
 'new',
 'type',
 'charger',
 'cabl',
 'amazon',
 'newyear',
 'start',
 'technolog',
 'iphonex',
 'shop',
 'photo',
 'fun',
 'selfi',
 'camera',
 'picoftheday',
 'sun',
 'instagood',
 'boy',
 'make',
 'ipod',
 'dont',
 'color',
 'crash',
 'everi',
 'need',
 'realli',
 'drop',
 'anoth',
 'lol',
 'work',
 'batteri',
 'charg',
 'dead',
 'saturday',
 'summer',
 'share',
 'want',
 'instagram',
 'photooftheday',
 'tweegram',
 'reason',
 'one',
 'suck',
 'agre',
 'fact',
 'store',
 'screen',
 'art',
 'dear',
 'friend',
 'email',
 'seem',
 'pie',
 'ive',
 'day',
 'button',
 'broke',
 'goe',
 'complet

In [53]:
def save_vocabulary(lines, filename):
    """Write ``lines`` to ``filename`` as UTF-8 text, one entry per line.

    Entries are joined with a newline; nothing is written after the last
    entry. An existing file at ``filename`` is overwritten.

    Parameters
    ----------
    lines : iterable of str
        The vocabulary entries to store.
    filename : str or path-like
        Destination file.
    """
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w', encoding="utf-8") as file:
        file.write('\n'.join(lines))

save_vocabulary(tokens, '../static/model/vocabulary.txt')

### Dividing Dataset

In [54]:
x = data['tweet']
y =data['label']

In [56]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Vectorization

In [57]:
def vectorizer(ds, vocabulary):
    """Encode each sentence as a binary bag-of-words vector over ``vocabulary``.

    Parameters
    ----------
    ds : iterable of str
        Sentences to encode; each is tokenised with ``str.split()``.
    vocabulary : sequence of str
        Tokens that define the columns, in order.

    Returns
    -------
    numpy.ndarray
        ``float32`` array with one row per sentence and one column per
        vocabulary entry. An element is 1.0 when that vocabulary token is one
        of the sentence's whitespace-delimited words, otherwise 0.0.
    """
    vectorized_lst = []

    for sentence in ds:
        # Tokenise once per sentence; a set makes each vocabulary lookup O(1)
        # instead of re-splitting and scanning the sentence for every token.
        words = set(sentence.split())
        sentence_lst = np.zeros(len(vocabulary))

        for i, token in enumerate(vocabulary):
            if token in words:
                sentence_lst[i] = 1

        vectorized_lst.append(sentence_lst)

    return np.asarray(vectorized_lst, dtype=np.float32)

In [63]:
vectorized_x_train=vectorizer(x_train,tokens)

In [64]:
vectorized_x_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(6336, 500), dtype=float32)

In [65]:
vectorized_x_test=vectorizer(x_test,tokens)

In [66]:
vectorized_x_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1584, 500), dtype=float32)

### Handle imbalanced dataset 

In [67]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test matrix is not passed to SMOTE.
# A fixed seed makes the synthetic samples, and every score computed from
# them, repeatable across runs.
smote = SMOTE(random_state=42)
vectorized_x_train_smote, y_train_smote = smote.fit_resample(vectorized_x_train, y_train)
print(vectorized_x_train_smote.shape, y_train_smote.shape)

(9484, 500) (9484,)


In [68]:
vectorized_x_train_smote

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(9484, 500), dtype=float32)

In [69]:
y_train_smote

0       0
1       0
2       1
3       0
4       1
       ..
9479    1
9480    1
9481    1
9482    1
9483    1
Name: label, Length: 9484, dtype: int64

In [70]:
vectorized_x_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(1584, 500), dtype=float32)

In [71]:
y_test

4896    0
7539    1
1677    0
1964    0
3025    0
       ..
1419    0
3939    0
7834    1
5137    1
4434    0
Name: label, Length: 1584, dtype: int64

## Model building and Evaluation

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [73]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def _print_scores(title, y_act, y_pred):
    """Print accuracy, precision, recall and F1 (rounded to 3 decimals) under ``title``."""
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'{title}:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')


def training_scores(y_act, y_pred):
    """Print the metrics for predictions on the training data.

    Parameters
    ----------
    y_act : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    _print_scores('Training Scores', y_act, y_pred)


def validation_scores(y_act, y_pred):
    """Print the metrics for predictions on the held-out data.

    Parameters
    ----------
    y_act : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    """
    _print_scores('Testing Scores', y_act, y_pred)

## Logistic Regression

In [85]:
# Fit on the SMOTE-resampled training matrix, then score both splits.
lr = LogisticRegression().fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = lr.predict(vectorized_x_train_smote)
y_test_pred = lr.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.904
	Precision = 0.872
	Recall = 0.947
	F1-Score = 0.908
Testing Scores:
	Accuracy = 0.875
	Precision = 0.717
	Recall = 0.896
	F1-Score = 0.796


## Naive Bayes

In [86]:
# Fit on the SMOTE-resampled training matrix, then score both splits.
mnb = MultinomialNB().fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = mnb.predict(vectorized_x_train_smote)
y_test_pred = mnb.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.891
	Precision = 0.854
	Recall = 0.944
	F1-Score = 0.897
Testing Scores:
	Accuracy = 0.879
	Precision = 0.712
	Recall = 0.938
	F1-Score = 0.809


## Decision Tree

In [87]:
# Tree construction involves random choices, so the seed is fixed to make the
# fitted tree and the scores below repeatable across runs.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = dt.predict(vectorized_x_train_smote)
y_test_pred = dt.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.994
	Precision = 0.993
	Recall = 0.995
	F1-Score = 0.994
Testing Scores:
	Accuracy = 0.816
	Precision = 0.685
	Recall = 0.6
	F1-Score = 0.64


## Random Forest

In [88]:
# The forest is built from random choices, so the seed is fixed to make the
# fitted model and the scores below repeatable across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = rf.predict(vectorized_x_train_smote)
y_test_pred = rf.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.994
	Precision = 0.992
	Recall = 0.996
	F1-Score = 0.994
Testing Scores:
	Accuracy = 0.856
	Precision = 0.771
	Recall = 0.671
	F1-Score = 0.718


## Support Vector Machine

In [92]:
# Fit on the SMOTE-resampled training matrix, then score both splits.
svm = SVC().fit(vectorized_x_train_smote, y_train_smote)

y_train_pred = svm.predict(vectorized_x_train_smote)
y_test_pred = svm.predict(vectorized_x_test)

training_scores(y_train_smote, y_train_pred)
validation_scores(y_test, y_test_pred)

Training Scores:
	Accuracy = 0.953
	Precision = 0.927
	Recall = 0.983
	F1-Score = 0.954
Testing Scores:
	Accuracy = 0.887
	Precision = 0.75
	Recall = 0.88
	F1-Score = 0.809


In [95]:
import pickle

# Serialise the fitted SVM to disk in binary mode.
with open('../static/model/model.pickle', 'wb') as model_file:
    pickle.dump(svm, model_file)