In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords  #for removing stopwords
from nltk.stem import PorterStemmer #for stemming
from nltk.tokenize import word_tokenize #for tokenizing sentence into words
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [2]:
# Read the Twitter sentiment training set into a DataFrame
df = pd.read_csv(filepath_or_buffer="twitter_training.csv")

In [3]:
# Preview the first rows of the loaded frame to check its columns
df.head()

Unnamed: 0,ID,Topic,Sentiment,Comment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# Keep only the label column and the text column
df = df.loc[:, ['Sentiment', 'Comment']]

In [5]:
# Summarise dtypes and non-null counts of the two selected columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  74682 non-null  object
 1   Comment    73996 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [6]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how='any')

In [7]:
# Re-check the non-null counts after dropping rows with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73996 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  73996 non-null  object
 1   Comment    73996 non-null  object
dtypes: object(2)
memory usage: 1.7+ MB


In [8]:
#Preprocessing data
# Fetch the NLTK resources the preprocessing cells rely on: the Punkt
# tokenizer data used by word_tokenize and the English stopword list.
# NLTK >= 3.9 loads 'punkt_tab' instead of the pickled 'punkt' models, so both
# are requested to keep the notebook runnable on old and new NLTK releases.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Shared preprocessing helpers: a set of English stopwords for fast
# membership tests, and a Porter stemmer instance.
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()

In [10]:
def preprocess_text(text):
    """Normalise one comment before TF-IDF vectorisation.

    The text is lower-cased and tokenised with ``word_tokenize``. Tokens that
    are not purely alphanumeric, or that appear in ``stop_words``, are dropped;
    each remaining token is reduced to its Porter stem.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (an empty string when no
        token survives the filter).
    """
    tokens = word_tokenize(text.lower())
    # Filter on the unstemmed token: stop_words holds unstemmed words, so the
    # membership test has to happen before stemming.
    stems = [
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(stems)

In [11]:
# Replace each raw comment with its cleaned, space-joined token string
df['Comment'] = df['Comment'].map(preprocess_text)

In [12]:
# Class distribution of the sentiment labels
df['Sentiment'].value_counts()

Negative      22359
Positive      20654
Neutral       18108
Irrelevant    12875
Name: Sentiment, dtype: int64

In [13]:
# NOTE(review): repeats the class-distribution check from the previous cell;
# this cell looks redundant and can probably be removed.
df['Sentiment'].value_counts()

Negative      22359
Positive      20654
Neutral       18108
Irrelevant    12875
Name: Sentiment, dtype: int64

In [15]:
# Inspect the frame again after text preprocessing (row count, dtypes, non-null counts)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 73996 entries, 0 to 74681
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  73996 non-null  object
 1   Comment    73996 non-null  object
dtypes: object(2)
memory usage: 1.7+ MB


In [16]:
# Convert the string sentiment labels into integer class codes.
# The fitted encoder is kept so the codes can be mapped back to label names.
encoder = LabelEncoder()
encoder.fit(df['Sentiment'])
df['Sentiment'] = encoder.transform(df['Sentiment'])

In [17]:
# Class distribution after label encoding (labels are now integer codes)
df['Sentiment'].value_counts()

1    22359
3    20654
2    18108
0    12875
Name: Sentiment, dtype: int64

In [18]:
#Feature Extraction->vectorizing comment
# Keep the TF-IDF matrix in its native SciPy sparse form. Calling .toarray()
# would allocate a dense float array of (number of comments x vocabulary size),
# which for ~74k comments can require many gigabytes of RAM; the estimators
# trained in this notebook accept sparse input directly.
# NOTE(review): max_features equals the row count (73996) - presumably meant
# as "no effective cap"; confirm the intended vocabulary size.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the IDF weights also reflect the test documents.
vectorizer = TfidfVectorizer(max_features=73996)
X = vectorizer.fit_transform(df['Comment'])
y = df['Sentiment']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [24]:
#Models->
# Candidate classifiers, keyed by the display name used when reporting scores.
# - The randomised estimators get a fixed random_state so repeated runs of the
#   notebook give comparable accuracy figures.
# - LogisticRegression's max_iter is raised from its default of 100, which is
#   commonly too few solver iterations for high-dimensional TF-IDF features.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naïve Bayes": MultinomialNB(),
    "SVM": SVC(kernel='linear'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

In [None]:
# Fit every candidate on the training split and report its accuracy on the
# held-out split, one line per model.
for name in models:
    model = models[name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{name} Accuracy: {accuracy:.4f}")