# Student Information

* Сандикха Рахарди РИМ-130908
* Мухин Виктор Александрович РИМ-130908

# About Dataset

We are using this dataset from kaggle :
[Download Dataset](https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis)

### Overview
This is an entity-level sentiment analysis dataset of twitter. Given a message and an entity, the task is to judge the sentiment of the message about the entity. There are three classes in this dataset: Positive, Negative and Neutral. We regard messages that are not relevant to the entity (i.e. Irrelevant) as Neutral.

# Import Libraries

In [None]:
import re
import time
import torch
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from collections import Counter
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import warnings
warnings.filterwarnings('ignore')

# Dataset

In [None]:
with zipfile.ZipFile('data/twitter_training.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

In [5]:
twitter_training_df = pd.read_csv('data/twitter_training.csv', names=['id', 'entity', 'sentiment', 'content'])
twitter_training_df

Unnamed: 0,id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [6]:
twitter_validation_df = pd.read_csv('data/twitter_validation.csv', names=['id', 'entity', 'sentiment', 'content'])
twitter_validation_df

Unnamed: 0,id,entity,sentiment,content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


# Check Data Info

### Train Dataset

In [7]:
total_data = len(twitter_training_df)
twitter_training_df.dropna(axis=0, inplace=True)
positive_records = len(twitter_training_df[twitter_training_df['sentiment'] == 'Positive'])
negative_records = len(twitter_training_df[twitter_training_df['sentiment'] == 'Negative'])
neutral_records = len(twitter_training_df[twitter_training_df['sentiment'] == 'Neutral'])
irrelevant_records = len(twitter_training_df[twitter_training_df['sentiment'] == 'Irrelevant'])

print(f"Total of Dataset: {total_data}\n")
print(f"Positive Records: {positive_records}")
print(f"Negative Records: {negative_records}")
print(f"Neutral Records: {neutral_records}")
print(f"Irrelevant Records: {irrelevant_records}")
print("==========================")
print(f"Total Records: {positive_records + negative_records + neutral_records + irrelevant_records}")

Total of Dataset: 74682

Positive Records: 20655
Negative Records: 22358
Neutral Records: 18108
Irrelevant Records: 12875
Total Records: 73996


### Validation Dataset

In [8]:
total_data = len(twitter_validation_df)
twitter_validation_df.dropna(axis=0, inplace=True)
positive_records = len(twitter_validation_df[twitter_validation_df['sentiment'] == 'Positive'])
negative_records = len(twitter_validation_df[twitter_validation_df['sentiment'] == 'Negative'])
neutral_records = len(twitter_validation_df[twitter_validation_df['sentiment'] == 'Neutral'])
irrelevant_records = len(twitter_validation_df[twitter_validation_df['sentiment'] == 'Irrelevant'])

print(f"Total of Dataset: {total_data}\n")
print(f"Positive Records: {positive_records}")
print(f"Negative Records: {negative_records}")
print(f"Neutral Records: {neutral_records}")
print(f"Irrelevant Records: {irrelevant_records}")
print("==========================")
print(f"Total Records: {positive_records + negative_records + neutral_records + irrelevant_records}")

Total of Dataset: 1000

Positive Records: 277
Negative Records: 266
Neutral Records: 285
Irrelevant Records: 172
Total Records: 1000


# Clean Content

In [10]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        tokens = word_tokenize(text)
        filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        return ' '.join(filtered_tokens)
    else:
        return ''

In [11]:
df = pd.concat([twitter_training_df, twitter_validation_df], axis=0).reset_index(drop=True)
df

Unnamed: 0,id,entity,sentiment,content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74991,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
74992,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
74993,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
74994,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [12]:
df['cleaned'] = df['content'].apply(preprocess_text)

positive_texts = ' '.join(df[df['sentiment'] == 'Positive']['cleaned']).split()
negative_texts = ' '.join(df[df['sentiment'] == 'Negative']['cleaned']).split()
neutral_texts = ' '.join(df[df['sentiment'] == 'Neutral']['cleaned']).split()
irrelevant_texts = ' '.join(df[df['sentiment'] == 'Irrelevant']['cleaned']).split()

positive_counter = Counter(positive_texts)
negative_counter = Counter(negative_texts)
neutral_counter = Counter(neutral_texts)
irrelevant_counter = Counter(irrelevant_texts)

print("Most common words in positive comments:")
print(positive_counter.most_common(10))

print("\nMost common words in negative comments:")
print(negative_counter.most_common(10))

print("\nMost common words in neutral comments:")
print(neutral_counter.most_common(10))

print("\nMost common words in irrelevant comments:")
print(irrelevant_counter.most_common(10))

Most common words in positive comments:
[('game', 3159), ('love', 1833), ('im', 1796), ('good', 1638), ('like', 1394), ('really', 1305), ('new', 1218), ('time', 1157), ('play', 1139), ('best', 1136)]

Most common words in negative comments:
[('game', 4567), ('get', 1849), ('like', 1655), ('shit', 1544), ('im', 1446), ('fix', 1341), ('fuck', 1305), ('play', 1253), ('cant', 1247), ('dont', 1146)]

Most common words in neutral comments:
[('johnson', 1846), ('game', 1643), ('amazon', 1176), ('_', 1080), ('get', 1031), ('like', 960), ('u', 935), ('im', 914), ('one', 894), ('google', 869)]

Most common words in irrelevant comments:
[('player', 1145), ('game', 1128), ('like', 1032), ('see', 957), ('im', 872), ('people', 781), ('ban', 763), ('one', 720), ('love', 713), ('good', 674)]


# Split Dataset

In [14]:
X = df['cleaned']
y = df['sentiment']

validation_size = 1000
X_train = X[:-validation_size]
y_train = y[:-validation_size]
X_val = X[-validation_size:]
y_val = y[-validation_size:]

In [15]:
# Save X_train and y_train to CSV files
X_train.to_csv('data/X_train.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False)

# Save X_val and y_val to CSV files
X_val.to_csv('data/X_val.csv', index=False)
y_val.to_csv('data/y_val.csv', index=False)

# Process

In [11]:
models = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

In [12]:
# Evaluation every models
for model_name, model in models.items():
    pipeline = make_pipeline(TfidfVectorizer(), model)
    
    # Training Model
    pipeline.fit(X_train, y_train)
    
    # Validation Model
    y_pred = pipeline.predict(X_val)
    
    # Evaluate Model
    print(f"\n{model_name}:\n")
    print(classification_report(y_val, y_pred, target_names=['Positive', 'Negative', 'Neutral', 'Irrelevant']))


Multinomial Naive Bayes:

              precision    recall  f1-score   support

    Positive       0.98      0.62      0.76       172
    Negative       0.70      0.94      0.80       266
     Neutral       0.95      0.70      0.81       285
  Irrelevant       0.78      0.91      0.84       277

    accuracy                           0.81      1000
   macro avg       0.85      0.79      0.80      1000
weighted avg       0.84      0.81      0.81      1000


Random Forest:

              precision    recall  f1-score   support

    Positive       0.99      0.97      0.98       172
    Negative       0.97      0.97      0.97       266
     Neutral       0.98      0.98      0.98       285
  Irrelevant       0.97      0.98      0.97       277

    accuracy                           0.97      1000
   macro avg       0.98      0.97      0.97      1000
weighted avg       0.97      0.97      0.97      1000

