In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold,GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier, XGBRegressor
import warnings
import random
import matplotlib.pyplot as plt
import joblib
import pickle
import lightgbm as lgb
from tqdm import tqdm
import re
from itertools import combinations
import contractions
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm 
nltk.download('wordnet')  # Download the WordNet data
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hvutr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Some utility functions to print out reports 

In [2]:
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

def print_regression_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print('Mean absolute error:', mean_absolute_error(y_val, y_pred))
    print('Mean squared error:', mean_squared_error(y_val, y_pred))
    print('Root Mean squared error:', np.sqrt(mean_squared_error(y_val, y_pred)))

def print_classification_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print(f'Accuracy Score: {accuracy_score(y_val, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_val, y_pred)}')
    print(f'Classification Report: \n {classification_report(y_val, y_pred)}')

def print_multilabel_classification_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print(f'Accuracy Score: {accuracy_score(y_val, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_val.argmax(axis=1), y_pred.argmax(axis=1))}')
    print(f'Classification Report: \n {classification_report(y_val, y_pred)}')



## Data Preview

In [26]:
# data set preview
num_samples_twitter = 20000
num_samples_amazon = 20000
num_samples_imdb = 20000
num_samples_mcdonald = 20000
num_samples_instagram = 20000

# process imdb data
imdb_data = pd.read_csv('datasets/sentiment/imdb.csv', encoding='latin-1')
imdb_data.columns = ['Text', 'Label']
imdb_data = imdb_data.sample(num_samples_imdb)
imdb_data = imdb_data[['Label', 'Text']]
imdb_data['Label'] = imdb_data['Label'].replace(1, 'Positive')
imdb_data['Label'] = imdb_data['Label'].replace(0, 'Negative')

# process mcdonalds data
mcdonalds_data = pd.read_csv('datasets/sentiment/mcdonald.csv', encoding='latin-1')
mcdonalds_data.columns = ['Label', 'Text']
mcdonalds_data = mcdonalds_data.sample(num_samples_mcdonald)

# process twitter data
twitter_data = pd.read_csv('datasets/sentiment/twitter.csv', encoding='latin-1', header=None)
twitter_data.columns = ['Label', 'id', 'date', 'flag', 'user', 'Text']
twitter_data = twitter_data.sample(num_samples_twitter)
twitter_data = twitter_data[['Label', 'Text']]
twitter_data['Label'] = twitter_data['Label'].replace(4, 'Positive')
twitter_data['Label'] = twitter_data['Label'].replace(0, 'Negative')

# process amazon data
amazon_data = pd.read_csv('datasets/sentiment/amazon.csv', encoding='latin-1', header=None)
amazon_data.columns = ['Label', 'title', 'Text']
amazon_data = amazon_data.sample(num_samples_amazon)
amazon_data = amazon_data[['Label', 'Text']]
amazon_data['Label'] = amazon_data['Label'].replace(2, 'Positive')
amazon_data['Label'] = amazon_data['Label'].replace(1, 'Negative')

# process instagram data
instagram_data = pd.read_csv('datasets/sentiment/instagram.csv', encoding='latin-1', header=None)
instagram_data.columns = ['Text', 'Label', 'Date']
instagram_data = instagram_data.sample(num_samples_instagram)
instagram_data = instagram_data[['Label', 'Text']]
instagram_data['Label'] = instagram_data['Label'].replace('1', 'Negative')
instagram_data['Label'] = instagram_data['Label'].replace('2', 'Negative')
instagram_data['Label'] = instagram_data['Label'].replace('3', 'Neutral')
instagram_data['Label'] = instagram_data['Label'].replace('4', 'Positive')
instagram_data['Label'] = instagram_data['Label'].replace('5', 'Positive')



# process synthetic data
synthetic_data = pd.read_csv('datasets/sentiment/synthetic.csv')
combined_data = pd.concat([twitter_data, amazon_data, synthetic_data, mcdonalds_data, imdb_data, instagram_data], ignore_index=True)

for i in tqdm(range(len(combined_data))):
    combined_data['Text'][i] = contractions.fix(combined_data['Text'][i])
    tokens = nltk.word_tokenize(combined_data['Text'][i])
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    combined_data['Text'][i] = ' '.join(tokens)

combined_data.to_csv('datasets/sentiment/combined.csv', index=False)


100%|██████████| 101383/101383 [01:32<00:00, 1100.57it/s]


In [28]:
label_encoder = LabelEncoder()

def read_data(file_path):
    data = pd.read_csv(file_path, encoding='latin-1')
    data.dropna(inplace=True)
    X, y = data['Text'].values, data['Label'].values
    y = label_encoder.fit_transform(y)
    return X, y, data

X, y, data = read_data('datasets/sentiment/combined.csv')
print(label_encoder.inverse_transform([0, 1, 2]))
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(f'size of data: {len(data)}, shape of X: {X.shape}, shape of y: {y.shape}')

max_features = 7000
ngram_range = (1, 3)
max_depth = 4
subsample = 0.4
n_estimators = 1000
learning_rate = 0.2
common_words = []

vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, lowercase=True, stop_words= common_words)
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)

model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, \
    max_depth=max_depth, n_jobs=-1, random_state=42, subsample=subsample)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print_classification_report(y_val, y_pred, 1)

misclassified = np.where(y_val != y_pred)
misclassified_labels = y_val[misclassified]
misclassified_predictions = y_pred[misclassified]
misclassified_text = data['Text'].values[misclassified]

indices = random.sample(range(len(misclassified_labels)), 10)

for i in indices:
    print(f'Text: {misclassified_text[i]}, Actual: {misclassified_labels[i]}, Predicted: {misclassified_predictions[i]}')

['Negative' 'Neutral' 'Positive']
size of data: 101383, shape of X: (101383,), shape of y: (101383,)
Fold: 1
Accuracy Score: 0.8017951373477339
Confusion Matrix: 
 [[8339  104 1432]
 [ 545  262  315]
 [1527   96 7657]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.84      0.82      9875
           1       0.57      0.23      0.33      1122
           2       0.81      0.83      0.82      9280

    accuracy                           0.80     20277
   macro avg       0.73      0.63      0.66     20277
weighted avg       0.79      0.80      0.79     20277

Text: ha not slept in FOR MONTHS . Why is my body ALWAYS making me get up by 8am ? ! WHYYYYYYYYY ., Actual: 2, Predicted: 1
Text: @ tasha_lnei Where are you ?, Actual: 2, Predicted: 0
Text: want to watch Jigoku Shoujo live action ! !, Actual: 1, Predicted: 0
Text: @ justroxonmute Well done Rox totally happy for you . Come over some time before the school day start dude . I

In [29]:
model.save_model('sentiment_analysis_combined')
joblib.dump(vectorizer, 'vectorizer_combined', compress=True)

['vectorizer_combined']

In [15]:
X, y, data = read_data('datasets/sentiment/combined.csv')
print(f'size of data: {len(data)}, shape of X: {X.shape}, shape of y: {y.shape}')
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, lowercase=True, stop_words= common_words)
vectorizer.fit(X)
model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, \
        max_depth=max_depth, n_jobs=-1, random_state=42, subsample=subsample)
model.fit(vectorizer.transform(X), y)

y_pred = model.predict(vectorizer.transform(X))
print_classification_report(y, y_pred, 1)

model.save_model('sentiment_analysis_combined')
joblib.dump(vectorizer, 'vectorizer_combined', compress=True)

size of data: 41383, shape of X: (41383,), shape of y: (41383,)
Fold: 1
Accuracy Score: 0.9531208467245004
Confusion Matrix: 
 [[18347    13   954]
 [   45  1552   114]
 [  777    37 19544]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.96      0.95      0.95     19314
           1       0.97      0.91      0.94      1711
           2       0.95      0.96      0.95     20358

    accuracy                           0.95     41383
   macro avg       0.96      0.94      0.95     41383
weighted avg       0.95      0.95      0.95     41383



['vectorizer_combined']

In [None]:
print(label_encoder.inverse_transform([0, 1, 2]))

['Negative' 'Neutral' 'Positive']
