In [None]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

In [None]:
import warnings

# Clear every installed warning filter, then silence only DeprecationWarning.
# (A blanket "ignore" filter placed before resetwarnings() would be wiped by
# the reset, so it is not installed here.)
warnings.resetwarnings()
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Data

In [None]:
test=pd.read_csv('/content/twitter_training1.csv',names = ['id','source','sentiment','tweet'])
test.head()

Unnamed: 0,id,source,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
train=pd.read_csv('/content/twitter_validation1.csv',names = ['id','source','sentiment','tweet'])
train.head()

Unnamed: 0,id,source,sentiment,tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [None]:
# Bind the shorter name `df` to the same object as `train` (plain assignment, no copy).
df = train

In [None]:
# Print a summary of df: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   source     74682 non-null  object
 2   sentiment  74682 non-null  object
 3   tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [None]:
# Descriptive statistics for the numeric columns of df.
df.describe()

Unnamed: 0,id
count,74682.0
mean,6432.586165
std,3740.42787
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [None]:
# Number of missing values in each column of df.
df.isna().sum()

id             0
source         0
sentiment      0
tweet        686
dtype: int64

In [None]:
# Discard every row that has a missing value in any column (row-wise drop is the default axis).
df = df.dropna()

In [None]:
# Per-column count of missing values in df.
df.isna().sum()


id           0
source       0
sentiment    0
tweet        0
dtype: int64

In [None]:
# Print the index, column, non-null count and dtype summary of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 73996 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         73996 non-null  int64 
 1   source     73996 non-null  object
 2   sentiment  73996 non-null  object
 3   tweet      73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.8+ MB


In [None]:
# Number of missing values in each column of the test frame.
test.isna().sum()

id           0
source       0
sentiment    0
tweet        0
dtype: int64

In [None]:
# Discard every test row that has a missing value in any column.
test = test.dropna()

# Data Preprocessing

In [None]:
import re

def clean_text(tweet):
    """Normalise a raw tweet into lowercase, letters-only, single-spaced text.

    Steps, in order: strip URLs, strip @mentions and #hashtags, drop every
    character that is not an ASCII letter or whitespace, drop the standalone
    upper-case token ``RT``, collapse whitespace runs, and lowercase.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet; an empty string if nothing survives the cleaning.
    """
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)

    # Remove mentions and hashtags
    tweet = re.sub(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+', '', tweet)

    # Remove special characters, numbers, and punctuation
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)

    # Remove 'RT' (Retweet) indicator; done before lowercasing so only the
    # upper-case marker is matched.
    tweet = re.sub(r'\bRT\b', '', tweet)

    # The removals above leave gaps ("borderlands  and", leading blanks);
    # squeeze every whitespace run to one space and trim the ends.
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    return tweet.lower()

In [None]:
# Overwrite the tweet column with its cleaned version, then preview the first rows.
df.loc[:, 'tweet'] = df['tweet'].map(clean_text)
df.head()

Unnamed: 0,id,source,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,i am coming to the borders and i will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you all
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands and i will murder y...


In [None]:
# Run the same clean_text step over the tweet column of the test frame.
test.loc[:, 'tweet'] = test['tweet'].map(clean_text)

# Stop Words


In [None]:
from nltk.corpus import stopwords

In [None]:
import nltk
# Fetch the NLTK 'stopwords' corpus package so that stopwords.words(...) can be used below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Stop-word sets keyed by language, filled lazily on first use so that defining
# this cell does not require the corpus to be available yet.
_STOPWORD_SETS = {}


def remove_stopwords(tweet):
    """Return ``tweet`` with English stop words removed.

    The text is split on single spaces and every token found in NLTK's
    English stop-word list is dropped; the remaining tokens are re-joined
    with single spaces.

    Parameters
    ----------
    tweet : str
        Text to filter (matching is case-sensitive, so lowercase it first).

    Returns
    -------
    str
        The filtered text; an empty string if every token was a stop word.
    """
    # Build the lookup set once. Rebuilding it inside the comprehension
    # re-loaded the word list for every single token.
    stop_set = _STOPWORD_SETS.get('english')
    if stop_set is None:
        stop_set = frozenset(stopwords.words('english'))
        _STOPWORD_SETS['english'] = stop_set

    words = tweet.split(' ')
    words = [word for word in words if word not in stop_set]
    return ' '.join(words)

In [None]:
# Overwrite the tweet column with its stop-word-free version, then preview the first rows.
df.loc[:, 'tweet'] = df['tweet'].map(remove_stopwords)
df.head()

Unnamed: 0,id,source,sentiment,tweet
0,2401,Borderlands,Positive,im getting borderlands murder
1,2401,Borderlands,Positive,coming borders kill
2,2401,Borderlands,Positive,im getting borderlands kill
3,2401,Borderlands,Positive,im coming borderlands murder
4,2401,Borderlands,Positive,im getting borderlands murder


In [None]:
# Run the same remove_stopwords step over the tweet column of the test frame.
test.loc[:, 'tweet'] = test['tweet'].map(remove_stopwords)

# Modelling


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix



In [None]:
# Candidate classifiers, each evaluated with every vectorizer below.
models = [
    MultinomialNB(),
    # The default max_iter (100) made the solver stop at its iteration limit
    # and emit a convergence warning, so give it more room.
    LogisticRegression(max_iter=1000),
    # Fixed seeds so the tree ensembles give the same scores on every run.
    RandomForestClassifier(n_jobs=-1, random_state=42),
    GradientBoostingClassifier(random_state=42)
]

# Feature extraction methods, as (display name, vectorizer) pairs
vectorizers = [
    ('TF-IDF', TfidfVectorizer()),
    ('Count Vectorizer', CountVectorizer())
]

In [None]:
# Inputs are the tweet text column; targets are the sentiment label column.
X_train, y_train = df['tweet'], df['sentiment']
X_test, y_test = test['tweet'], test['sentiment']

In [None]:
# Fit and score every (classifier, vectorizer) combination, classifier-major.
for model in models:
    for vec_name, vec in vectorizers:
        # Vectorize the raw text, then classify, as one estimator.
        pipeline = Pipeline(steps=[('vectorizer', vec), ('classifier', model)])
        pipeline.fit(X_train, y_train)

        # Score on the held-out tweets.
        y_pred = pipeline.predict(X_test)
        cf = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        print(f"\nModel: {type(model).__name__}, Vectorizer: {vec_name}")
        print("Confusion Matrix:\n", cf)
        print("Classification Report:\n", report)


Model: MultinomialNB, Vectorizer: TF-IDF
Confusion Matrix:
 [[100  35   5  32]
 [  1 250   3  12]
 [  2  49 199  35]
 [  2  20   4 251]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.95      0.58      0.72       172
    Negative       0.71      0.94      0.81       266
     Neutral       0.94      0.70      0.80       285
    Positive       0.76      0.91      0.83       277

    accuracy                           0.80      1000
   macro avg       0.84      0.78      0.79      1000
weighted avg       0.83      0.80      0.80      1000


Model: MultinomialNB, Vectorizer: Count Vectorizer
Confusion Matrix:
 [[122  21   4  25]
 [  3 239   7  17]
 [  7  32 214  32]
 [  5  14   7 251]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.89      0.71      0.79       172
    Negative       0.78      0.90      0.84       266
     Neutral       0.92      0.75      0.83       285
    Positive  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: LogisticRegression, Vectorizer: TF-IDF
Confusion Matrix:
 [[149  11   2  10]
 [  4 251   5   6]
 [  9  16 250  10]
 [  6   9   4 258]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.89      0.87      0.88       172
    Negative       0.87      0.94      0.91       266
     Neutral       0.96      0.88      0.92       285
    Positive       0.91      0.93      0.92       277

    accuracy                           0.91      1000
   macro avg       0.91      0.90      0.90      1000
weighted avg       0.91      0.91      0.91      1000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: LogisticRegression, Vectorizer: Count Vectorizer
Confusion Matrix:
 [[153   7   1  11]
 [  2 253   3   8]
 [  9   6 264   6]
 [  6   6   2 263]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.90      0.89      0.89       172
    Negative       0.93      0.95      0.94       266
     Neutral       0.98      0.93      0.95       285
    Positive       0.91      0.95      0.93       277

    accuracy                           0.93      1000
   macro avg       0.93      0.93      0.93      1000
weighted avg       0.93      0.93      0.93      1000






Model: RandomForestClassifier, Vectorizer: TF-IDF
Confusion Matrix:
 [[166   2   0   4]
 [  1 261   2   2]
 [  1   5 275   4]
 [  2   2   3 270]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.98      0.97      0.97       172
    Negative       0.97      0.98      0.97       266
     Neutral       0.98      0.96      0.97       285
    Positive       0.96      0.97      0.97       277

    accuracy                           0.97      1000
   macro avg       0.97      0.97      0.97      1000
weighted avg       0.97      0.97      0.97      1000


Model: RandomForestClassifier, Vectorizer: Count Vectorizer
Confusion Matrix:
 [[167   1   0   4]
 [  2 260   1   3]
 [  1   4 274   6]
 [  1   2   1 273]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.98      0.97      0.97       172
    Negative       0.97      0.98      0.98       266
     Neutral       0.99      0.96      0.98       