In [3]:
# Sentiment analysis project over a tweet dataset
# Author: Muhammad Humayun Khan

import pandas as pd
import numpy as np

# Path to the training CSV (relative, so it resolves against the working directory).
dataset_path = "../datasets/sentiment_analysis/train.csv"

# Load the raw data, decoding the file as latin-1.
df = pd.read_csv(filepath_or_buffer=dataset_path, encoding="latin-1")

# Show the first rows as a quick look at the available columns.
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [6]:
# The raw file carries extra metadata columns (tweet time, user age, country
# statistics, ...); only the text and its sentiment label are used from here on.
# .copy() makes the result an independent frame rather than a slice of the raw data.
df = df[['text', 'sentiment']].copy()

# Fixed seed so the preview shows the same rows on every run.
df.sample(10, random_state=42)

Unnamed: 0,text,sentiment
26825,hardware store guy told me the screw would be ...,neutral
5608,woops! I only just realized my DMs to you are...,neutral
14309,http://twitpic.com/67hvr - This picture is fro...,negative
22594,GAH! I have the headache from hell. Reminds me...,neutral
2080,yay Happy Mothers Day to me Screw burnt brea...,positive
17349,Hiyaa! How was Tour? Really disappointed that...,negative
12352,_PingPong Awww I`m teaching my son (when I hav...,negative
18209,LMAO! listening to the great Bob Marley! Wow ...,positive
9655,"That`s not good, sorry to hear that, where di...",negative
124,not well,negative


In [9]:
# (rows, columns) of the frame at this point in the notebook
df.shape

(27481, 2)

In [7]:
# Count the missing values in each column
df.isna().sum()

text         1
sentiment    0
dtype: int64

In [None]:
# Class balance check: number of rows carrying each sentiment label
class_count = df['sentiment'].value_counts()

# Share of each class among all rows, rendered as a string such as '40.46%'
class_percentage = (class_count / len(df) * 100).round(2).astype(str) + '%'

# One row per class, with the raw count next to its percentage
class_summary = pd.DataFrame(
    {
        'Class': class_count.index,
        'Count': class_count.values,
        'Cent': class_percentage.values,
    }
)

class_summary

Unnamed: 0,Class,Count,Cent
0,neutral,11118,40.46%
1,positive,8582,31.23%
2,negative,7781,28.31%


In [18]:
# text preprocessing 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared helpers for preprocess_text: one lemmatizer instance and the English
# stop-word list, held in a set for fast membership tests.
# NOTE(review): presumably the NLTK 'stopwords', 'wordnet' and tokenizer data must
# already be installed; no nltk.download call is visible in this notebook, so confirm.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps: lowercase, strip URLs / @mentions / #hashtags, drop every character
    that is not a lowercase ASCII letter or whitespace, tokenize, remove English
    stop words and lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (``None``, ``NaN`` from a missing cell, ...)
        are treated as empty text instead of raising ``AttributeError``.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; ``''`` when nothing survives.
    """
    # Missing cells arrive as None / float NaN, which have no .lower()
    if not isinstance(text, str):
        return ''

    # Lowercase first so the [^a-z] filter below keeps every letter
    text = text.lower()

    # Remove URLs, mentions, hashtags
    text = re.sub(r"http\S+|www\S+|@\S+|#\S+", "", text)

    # Remove punctuation, digits and any other non-letter character
    text = re.sub(r'[^a-z\s]', '', text)

    # Nothing but whitespace left: the result is empty, so skip tokenization
    if not text.strip():
        return ''

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return ' '.join(cleaned_tokens)

# Drop the rows whose text is missing, then add the cleaned-text column.
# Building the result with .assign() returns a brand-new frame, which avoids the
# SettingWithCopyWarning that assigning a column onto a filtered frame can raise,
# and keeps the cell safe to re-run.
df = (
    df.dropna(subset=['text'])
      .assign(clean_text=lambda frame: frame['text'].apply(preprocess_text))
)

# Preview raw vs. cleaned text; the seed keeps the preview identical across runs
df[['text', 'clean_text']].sample(10, random_state=42)


Unnamed: 0,text,clean_text
13805,hahahaha your watching 106 too... i miss Ocea...,hahahaha watching miss ocean
11671,very talented Sam,talented sam
5268,hehehe hello!!! how are you? good to see you ...,hehehe hello good see twitter
6708,thanks glad you like it,thanks glad like
1850,I hate waiting in lines,hate waiting line
6320,This world makes me sad,world make sad
4726,graandma`s houseee i havee too leave now! <3,graandmas houseee havee leave
21704,Enjoying 'Gears of War' on my PC ! This game i...,enjoying gear war pc game really gooooood
14170,"Until brings back our EVERYONE button, you c...",brings back everyone button access public time...
9353,California,california


In [None]:
# Text Vectorization process in the NLP pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over the cleaned text, with the vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row, before the train/test split
# performed later in the notebook, so the vocabulary and IDF weights also see the
# test rows. Consider fitting on the training portion only.
vectorizer = TfidfVectorizer(max_features=5000)

# Labels the classifier will learn to predict
y = df['sentiment']

# Learn the vocabulary and encode every document as a sparse TF-IDF row
X = vectorizer.fit_transform(df['clean_text'])

# Sanity check: (number of documents, number of features)
print("TF-IDF Shape:", X.shape)


TF-IDF Shape: (27480, 5000)


In [20]:
# Now that the text has been converted to numerical features, split it into train and test sets
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. stratify=y keeps the class proportions the same in both
# partitions; random_state pins the shuffle so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (21984, 5000)
Test shape: (5496, 5000)


In [21]:
# train the model with the LogisticRegression
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# class_weight='balanced' re-weights the classes inversely to their frequency;
# max_iter=1000 allows the solver up to 1000 iterations.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)

# Labels predicted for the held-out rows
y_pred = model.predict(X_test)


In [22]:
# evaluate the result
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision / recall / F1 on the held-out rows
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n")
print(report_text)

# Rows are the true labels, columns the predicted labels
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n")
print(conf_matrix)


Classification Report:

              precision    recall  f1-score   support

    negative       0.66      0.69      0.67      1556
     neutral       0.65      0.66      0.65      2223
    positive       0.76      0.72      0.74      1717

    accuracy                           0.69      5496
   macro avg       0.69      0.69      0.69      5496
weighted avg       0.69      0.69      0.69      5496

Confusion Matrix:

[[1067  405   84]
 [ 447 1458  318]
 [ 100  375 1242]]


In [None]:
# lets predict the new text
def predict_sentiment(text):
    """Return the sentiment label the trained model predicts for one raw text.

    The text is cleaned with ``preprocess_text``, encoded with the already
    fitted ``vectorizer`` and then classified by ``model``.
    """
    # transform() expects an iterable of documents, hence the one-element list
    features = vectorizer.transform([preprocess_text(text)])
    labels = model.predict(features)
    return labels[0]

# Sanity check on plainly worded sentences, expected to come out as
# positive, negative and neutral respectively.
for sentence in (
    "I absolutely love this product!",
    "This is the worst thing ever.",
    "It’s okay, I guess. Nothing special.",
):
    print(predict_sentiment(sentence))



positive
negative
neutral


In [None]:
# Harder cases: sarcasm, negation, mixed sentiment and passive-aggressive tone.
# The comment above each sentence states the label a human reader would expect.
tricky_sentences = (
    # sarcasm: should come out negative
    "Oh great, another Monday. Just what I needed",
    # negation handling: should come out positive
    "I don’t think this is a bad movie",
    # mixed / ambiguous: should come out neutral or negative
    "It was fun, but the ending ruined everything",
    # passive-aggressive tone: should come out negative
    "Yeah, sure whatever!",
)
for sentence in tricky_sentences:
    print(predict_sentiment(sentence))

positive
negative
negative
neutral
