# Importing Libraries and Data Loading

In [40]:
import pandas as pd
import numpy as np
import re
import string
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this notebook relies on: the stop-word list, the
# tokenizer models used by nltk.word_tokenize, and WordNet for lemmatization.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the tokenizer from that package instead (assumption: confirm against the
# installed NLTK version).
nltk_resources = ('stopwords', 'punkt', 'punkt_tab', 'wordnet')

try:
    # nltk.download() reports a failed download through its boolean return
    # value rather than by raising, so every result has to be checked;
    # otherwise the success message is printed even when nothing was fetched.
    failed_resources = [
        resource
        for resource in nltk_resources
        if not nltk.download(resource, quiet=True)
    ]
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")
else:
    if failed_resources:
        print(f"Error downloading NLTK resources: {', '.join(failed_resources)}")
    else:
        print("NLTK resources downloaded successfully.")

NLTK resources downloaded successfully.


In [41]:
# Read the raw dataset from the working directory and report its dimensions
df = pd.read_csv(filepath_or_buffer="sentiment_data.csv")
print(f"Data loaded successfully Shape: {df.shape}")

Data loaded successfully Shape: (31014, 6)


## Analyzing the Data

In [42]:
# Preview the leading rows to sanity-check the column layout
print("First 5 Rows:")
display(df.head(n=5))

First 5 Rows:


Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria
3,01082688c6,happy bday!,positive,morning,46-60,Andorra
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola


In [43]:
# List the distinct labels present in the target column
df.loc[:, 'sentiment'].unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [44]:
# Print a heading, then pandas' own summary of the frame (df.info() writes
# its report directly to standard output).
print("Column Information:")
df.info()

Column Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31014 entries, 0 to 31013
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         31014 non-null  object
 1   text           31014 non-null  object
 2   sentiment      31014 non-null  object
 3   Time of Tweet  31014 non-null  object
 4   Age of User    31014 non-null  object
 5   Country        31014 non-null  object
dtypes: object(6)
memory usage: 1.4+ MB


In [45]:
# Count null entries per column
print("Missing values check:")
null_counts = df.isna().sum()
print(null_counts)

Missing values check:
textID           0
text             0
sentiment        0
Time of Tweet    0
Age of User      0
Country          0
dtype: int64


# Preprocessing the Data

In [46]:
# Handling potential duplicates.
# drop_duplicates() with inplace=False returns a NEW frame and leaves `df`
# untouched, so the result must be assigned back; otherwise this cell removes
# nothing and the "after" count can never differ from the "before" count.
print(f"Total rows before removing duplicates: {df.shape[0]}")
# Rows are compared across all columns (subset=None) and the first occurrence
# is kept; the index is rebuilt so it stays contiguous after rows are dropped.
df = df.drop_duplicates(subset=None, keep='first').reset_index(drop=True)
print(f"Total rows after removing duplicates: {df.shape[0]}")

Total rows before removing duplicates: 31014
Total rows after removing duplicates: 31014


In [47]:
# Module-level NLP helpers, built once and reused by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [48]:
def clean_text(text):
    """Normalize one piece of text into a space-joined string of lemmas.

    Steps, in order: lowercase, remove URLs (http/https/www), remove ASCII
    punctuation, remove digit runs, tokenize with nltk.word_tokenize, drop
    tokens that are non-alphabetic or present in the module-level
    ``stop_words`` set, and lemmatize what remains with the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : Any
        The raw value to clean. Anything that is not a ``str`` yields ''.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', lowered)
    without_punct = without_urls.translate(str.maketrans('', '', string.punctuation))
    without_digits = re.sub(r'\d+', '', without_punct)

    kept = []
    for word in nltk.word_tokenize(without_digits):
        # Skip stop words and anything that is not purely alphabetic
        if word in stop_words or not word.isalpha():
            continue
        kept.append(lemmatizer.lemmatize(word))

    return ' '.join(kept)

# Build the cleaned-text column from the raw 'text' column
df['clean_text'] = df['text'].apply(clean_text)

print("Data cleaning complete. Added 'clean_text' column.")
print("\nSample of Cleaned Data:")
# Show raw text, cleaned text and label side by side for the first rows
preview_columns = ['text', 'clean_text', 'sentiment']
print(df[preview_columns].head())


Data cleaning complete. Added 'clean_text' column.

Sample of Cleaned Data:
                                                text  \
0  Last session of the day  http://twitpic.com/67ezh   
1   Shanghai is also really exciting (precisely -...   
2  Recession hit Veronique Branquinho, she has to...   
3                                        happy bday!   
4             http://twitpic.com/4w75p - I like it!!   

                                          clean_text sentiment  
0                                   last session day   neutral  
1  shanghai also really exciting precisely skyscr...  positive  
2  recession hit veronique branquinho quit compan...  negative  
3                                         happy bday  positive  
4                                               like  positive  


In [49]:
# Re-check null counts now that 'clean_text' has been added
df.isna().sum()

textID           0
text             0
sentiment        0
Time of Tweet    0
Age of User      0
Country          0
clean_text       0
dtype: int64

# Training the Model

In [33]:
# Features are the cleaned tweets; targets are the sentiment labels
X, y = df['clean_text'], df['sentiment']

# Hold out 20% of the rows for testing, keeping the label proportions equal
# in both partitions (stratify) and fixing the seed for repeatability
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")


Training set size: 24811 samples
Testing set size: 6203 samples


## Using the TF-IDF Vectorizer

In [None]:
# Vectorization: unigram + bigram TF-IDF features, capped at 50,000 terms
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=50000)

# The vocabulary is learned from the training split only; the test split is
# then transformed with that same fitted vectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF features created. Training matrix shape: {X_train_tfidf.shape}")

TF-IDF features created. Training matrix shape: (24811, 50000)


In [35]:
# Model training: logistic regression on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
sentiment_model = LogisticRegression(
    max_iter=1000,
    random_state=42,
    solver='liblinear',
).fit(X_train_tfidf, y_train)

print("Logistic Regression Model Training Complete.")



Logistic Regression Model Training Complete.


In [38]:
# Model evaluation: predict on the held-out TF-IDF matrix
y_pred = sentiment_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 against the true test labels
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

    negative       0.73      0.59      0.65      1756
     neutral       0.65      0.76      0.70      2510
    positive       0.78      0.75      0.76      1937

    accuracy                           0.71      6203
   macro avg       0.72      0.70      0.70      6203
weighted avg       0.71      0.71      0.71      6203



In [39]:
# Output locations for the fitted vectorizer and the trained model
vectorizer_path, model_path = 'tfidf_vectorizer.joblib', 'sentiment_model.joblib'

# Persist the fitted TfidfVectorizer
joblib.dump(value=tfidf_vectorizer, filename=vectorizer_path)
print(f"TfidfVectorizer saved to {vectorizer_path}")

# Persist the trained LogisticRegression model
joblib.dump(value=sentiment_model, filename=model_path)
print(f"LogisticRegression Model saved to {model_path}")

print("All files are saved and ready for analysis on real time data")


TfidfVectorizer saved to tfidf_vectorizer.joblib
LogisticRegression Model saved to sentiment_model.joblib
All files are saved and ready for analysis on real time data
