# Sentiment Analysis Application

In [31]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

## Load dataset

In [32]:
# Read the tweet dataset from the CSV file (path is relative to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/csv/tweets.csv')

## Data inspection

In [33]:
# Preview the first five rows to check the column layout.
df.head(n=5)

Unnamed: 0,sentence,sentiment
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [34]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524693 entries, 0 to 1524692
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentence   1524693 non-null  object
 1   sentiment  1524693 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 23.3+ MB


In [35]:
# Tally how many rows carry each sentiment label, shown as a two-column frame.
label_counts = df["sentiment"].value_counts()
label_counts.reset_index()

Unnamed: 0,sentiment,count
0,0,767413
1,1,757280


In [36]:
# Count missing values per column (summing the boolean mask down the rows).
df.isnull().sum(axis=0)

sentence     0
sentiment    0
dtype: int64

Neither column contains null values, so no rows need to be dropped or imputed.

## Data cleaning

### Check for duplicate values

In [38]:
# Flag rows that exactly repeat an earlier row, then count them.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

365

The dataset contains duplicate rows, so we drop them.

In [39]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# Reassigning (rather than inplace=True) avoids mutating the frame that earlier
# cells displayed; the original index labels are retained, so the index has
# gaps where rows were dropped.
df = df.drop_duplicates()
df

Unnamed: 0,sentence,sentiment
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0
...,...,...
1524329,after using latex a lot any other typeset math...,1
1524330,on that note i hate word i hate pages i hate l...,0
1524331,ahhh back in a real text editing environment i...,1
1524332,trouble in iran i see hmm iran iran so far awa...,0


In [40]:
# Re-check after dropping: the count of repeated rows should now be zero.
df.duplicated(keep="first").sum()

0

### Clean text

In [42]:
def clean_text(text: str) -> str:
    """Normalize a piece of text before vectorization.

    The text is lowercased, every run of non-word characters (anything other
    than letters, digits and underscore, so punctuation and whitespace alike)
    is collapsed into a single space, and leading/trailing spaces are removed.

    Args:
        text: The raw input string.

    Returns:
        The normalized string; empty if the input had no word characters.
    """
    text = text.lower()  # Lowercase
    text = re.sub(r'\W+', ' ', text)  # Collapse non-word runs into one space
    # Punctuation at either end of the input would otherwise leave a stray
    # leading/trailing space (e.g. "Great!" -> "great ").
    return text.strip()

# Run the text normalizer over every sentence and store the result back
# in the same column, then display the frame.
cleaned_sentences = df["sentence"].apply(clean_text)
df["sentence"] = cleaned_sentences
df

Unnamed: 0,sentence,sentiment
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0
...,...,...
1524329,after using latex a lot any other typeset math...,1
1524330,on that note i hate word i hate pages i hate l...,0
1524331,ahhh back in a real text editing environment i...,1
1524332,trouble in iran i see hmm iran iran so far awa...,0


With the text cleaned, we can move on to training the model.

## Model training

In [43]:
# Train/test split: hold out 20% of the rows for evaluation; the fixed
# random_state makes the shuffle reproducible.
sentences = df["sentence"]
labels = df["sentiment"]
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Cap the vocabulary at 5000 terms to keep the feature matrix manageable.
max_vocab_size = 5000
vectorizer = TfidfVectorizer(max_features=max_vocab_size)

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them unchanged for the test split so no test information leaks in.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### Initialize and train model

In [45]:
# Create a logistic-regression classifier with scikit-learn's default settings
# and fit it on the TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)

### Evaluate the model on the test set

In [46]:
# Predict on the held-out split, then report overall accuracy followed by
# per-class precision, recall and F1.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.4f}")
print(report)

Model Accuracy: 0.7865
              precision    recall  f1-score   support

           0       0.79      0.78      0.79    152962
           1       0.78      0.80      0.79    151904

    accuracy                           0.79    304866
   macro avg       0.79      0.79      0.79    304866
weighted avg       0.79      0.79      0.79    304866



### Save model

In [47]:
# Persist both artifacts: the classifier alone is not enough at inference
# time, because new text must be transformed with the same fitted vectorizer.
model_path = "../models/sentiment_model.pkl"
vectorizer_path = "../models/tfidf_vectorizer.pkl"
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)

['../models/tfidf_vectorizer.pkl']

In [49]:
# Reload the persisted classifier and vectorizer from disk.
# NOTE: joblib.load unpickles arbitrary objects, so only load files you trust.
model_path = "../models/sentiment_model.pkl"
vectorizer_path = "../models/tfidf_vectorizer.pkl"
model = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)

def predict_sentiment(text):
    """Classify a single piece of text as "Positive" or "Negative".

    The text goes through the same `clean_text` preprocessing used for
    training, is transformed by the module-level `vectorizer`, and is scored
    by the module-level `model`.

    Args:
        text: The raw input string to classify.

    Returns:
        "Positive" when the model predicts label 1, otherwise "Negative".
    """
    cleaned = clean_text(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# Example: smoke-test the predictor on two sample sentences.
# Recorded run: the first prints "Positive", the second "Negative".
for sample in (
    "This product is just there!",
    "I really did not enjoy it.",
):
    print(predict_sentiment(sample))

Positive
Negative
