## Sentiment Analysis: Model Training and Prediction

This notebook walks through the process of training sentiment analysis models for different sources and making predictions.

In [2]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import joblib
import os

In [3]:
# Download the NLTK resources used by the preprocessing step further down.
# nltk.download() skips packages that are already present locally, so this
# cell is safe to re-run.
nltk.download('punkt')  # tokenizer data; word_tokenize is called in preprocess_text
nltk.download('stopwords')  # corpus read via stopwords.words('english')
nltk.download('punkt_tab')  # NOTE(review): presumably required by word_tokenize on newer NLTK releases - confirm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# Load the datasets
#
# DATA_DIR defaults to the original local folder but can be redirected by
# setting the SENTIMENT_DATA_DIR environment variable, so the notebook is
# runnable on machines with a different directory layout.
DATA_DIR = os.environ.get("SENTIMENT_DATA_DIR", "F:/NLP Sentiment Analysis/data")

# Column names are passed explicitly through `names=`, which makes pandas
# treat the first line of each file as data rather than as a header row.
# Both files share the same layout, so the list is defined once.
COLUMN_NAMES = ['serial_number', "Source", "Sentiment", "Text"]

train_data = pd.read_csv(f"{DATA_DIR}/twitter_training.csv", names=COLUMN_NAMES)
val_data = pd.read_csv(f"{DATA_DIR}/twitter_validation.csv", names=COLUMN_NAMES)

In [11]:
# Report the dimensions of the training set, then preview its first five rows
print(f"Training data shape: {train_data.shape}")
train_preview = train_data.head(n=5)
display(train_preview)

Training data shape: (74682, 4)
Validation data shape: (1000, 4)

Training data sample:


Unnamed: 0,serial_number,Source,Sentiment,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [13]:
# Report the dimensions of the validation set, then preview its first five rows
print(f"Validation data shape: {val_data.shape}")
val_preview = val_data.head(n=5)
display(val_preview)


Validation data shape: (1000, 4)


Unnamed: 0,serial_number,Source,Sentiment,Text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [14]:
# Show the column labels of the training frame (the names passed to read_csv)
train_data.columns

Index(['serial_number', 'Source', 'Sentiment', 'Text'], dtype='object')

In [15]:
# Compiled once: matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

# Lazily filled cache so the NLTK stop-word list is loaded once instead of
# once per row when the function is applied to a whole column.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English NLTK stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise one piece of raw text into a space-separated string of tokens.

    Steps:
      1. Missing values (None, NaN, pd.NA, ...) yield an empty string.
      2. The value is converted to ``str`` and lower-cased.
      3. Every character that is not an ASCII letter or whitespace is deleted
         (not replaced by a space, so words separated only by punctuation
         are fused together).
      4. The text is tokenised with ``word_tokenize``.
      5. English stop words are dropped.

    Parameters
    ----------
    text : object
        Raw text. Non-string values are stringified with ``str()``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` for missing input.
    """
    # Without this guard str(NaN) / str(None) would become the literal tokens
    # "nan" / "none" and end up as spurious features.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    text = str(text).lower()
    text = _NON_LETTER_RE.sub('', text)
    stop_words = _english_stop_words()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

In [18]:
# Clean every training tweet and store the result alongside the raw text
train_data['Processed_Text'] = [preprocess_text(raw_text) for raw_text in train_data['Text']]

# Show raw and cleaned text side by side
display(train_data.loc[:, ['Text', 'Processed_Text']])

Unnamed: 0,Text,Processed_Text
0,im getting on borderlands and i will murder yo...,im getting borderlands murder
1,I am coming to the borders and I will kill you...,coming borders kill
2,im getting on borderlands and i will kill you ...,im getting borderlands kill
3,im coming on borderlands and i will murder you...,im coming borderlands murder
4,im getting on borderlands 2 and i will murder ...,im getting borderlands murder
...,...,...
74677,Just realized that the Windows partition of my...,realized windows partition mac like years behi...
74678,Just realized that my Mac window partition is ...,realized mac window partition years behind nvi...
74679,Just realized the windows partition of my Mac ...,realized windows partition mac years behind nv...
74680,Just realized between the windows partition of...,realized windows partition mac like years behi...


In [None]:
# Clean every validation tweet with the same preprocessing used for training
val_data['Processed_Text'] = val_data['Text'].apply(preprocess_text)

# Preview raw vs. cleaned text. `.head()` must be applied to the DataFrame
# before it is handed to display(): display() returns None, so calling
# `.head()` on its result raises AttributeError.
display(val_data[['Text','Processed_Text']].head())