In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import KFold,GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_absolute_error, mean_squared_error
from xgboost import XGBClassifier, XGBRegressor
import warnings
import random
import matplotlib.pyplot as plt
import joblib
import pickle
import json


## Some utility functions to print out reports 

In [4]:
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

def print_regression_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print('Mean absolute error:', mean_absolute_error(y_val, y_pred))
    print('Mean squared error:', mean_squared_error(y_val, y_pred))
    print('Root Mean squared error:', np.sqrt(mean_squared_error(y_val, y_pred)))

def print_classification_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print(f'Accuracy Score: {accuracy_score(y_val, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_val, y_pred)}')
    print(f'Classification Report: \n {classification_report(y_val, y_pred)}')

def print_multilabel_classification_report(y_val, y_pred, fold):
    print(f'Fold: {fold}')
    print(f'Accuracy Score: {accuracy_score(y_val, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_val.argmax(axis=1), y_pred.argmax(axis=1))}')
    print(f'Classification Report: \n {classification_report(y_val, y_pred)}')



In [7]:
df = pd.read_csv('datasets/sentiment/stanford.csv', encoding='latin-1', header=None)
print(df.head())


             0           1                    2      3
0    body_text  phrase ids     sentiment values  label
1          ! '       22935              0.52778    NaN
2         ! ''       18235                  0.5    NaN
3       ! Alas      179257  0.44443999999999995    NaN
4  ! Brilliant       22936              0.86111    NaN


  df = pd.read_csv('datasets/sentiment/stanford.csv', encoding='latin-1', header=None)


In [9]:
label_encoder = LabelEncoder()

def read_data(file_path):
    data = pd.read_csv(file_path, encoding='latin-1', header=None)
    X, y = data['body_text'].values,  data['sentiment values'].values
    for i in range(len(y)):
        if y[i] < 0.33: 
            y[i] = 0
        elif y[i] < 0.66:
            y[i] = 1
        else:
            y[i] = 2
    y = label_encoder.fit_transform(y)
    return X, y, data

X, y, data = read_data('datasets/sentiment/stanford.csv')
#indices = random.sample(range(len(data)), sample_size)
X, y = X[indices], y[indices]

print(f'size of data: {len(X)}, shape of X: {X.shape}, shape of y: {y.shape}')

num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
fold = 1

for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), lowercase=True)
    vectorizer.fit(X_train)

    X_train = vectorizer.transform(X_train)
    X_val = vectorizer.transform(X_val)

    model = XGBRegressor(n_estimators=1000, learning_rate=0.1, max_depth=10, n_jobs=-1, random_state=42, subsample=0.8, colsample_bytree=0.8)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    y_pred = np.round(y_pred)
    print_classification_report(y_val, y_pred, fold)
    fold += 1

    misclassified = np.where(y_val != y_pred)
    misclassified_labels = y_val[misclassified]
    misclassified_predictions = y_pred[misclassified]
    misclassified_reviews = data['text'].values[misclassified]

    indices = random.sample(range(len(misclassified_labels)), 10)

    for i in indices:
        print(f'Review: {misclassified_reviews[i]}, Actual: {misclassified_labels[i]}, Predicted: {misclassified_predictions[i]}')

  data = pd.read_csv(file_path, encoding='latin-1', header=None)


KeyError: 'body text'

In [18]:
model.save_model('sentiment_analysis.model')
joblib.dump(vectorizer, 'vectorizer.joblib', compress=True)

['vectorizer.joblib']

In [43]:
def round_to_nearest(number):
    rounded = round(number)
    if rounded < 0.5:
        return 0
    else:
        return 1

def read_data(file_path):
    data = pd.read_csv(file_path, encoding='latin-1', header=None)
    data.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
    X, y = data['text'].values,  data['target'].values
    y = label_encoder.fit_transform(y)
    return X, y, data

sample_size = 200000
X, y, data = read_data('datasets/sentiment/twitter.csv')
indices = random.sample(range(len(data)), sample_size)
X, y = X[indices], y[indices]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

print(f'size of data: {len(X)}, shape of X: {X.shape}, shape of y: {y.shape}')

vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2), lowercase=True)
vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)

model = XGBClassifier(n_estimators=1000, learning_rate=0.1, max_depth=10, n_jobs=-1, random_state=42, subsample=0.8, colsample_bytree=0.8)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
y_pred = np.round(y_pred)
print_classification_report(y_val, y_pred, fold)
fold += 1

misclassified = np.where(y_val != y_pred)
misclassified_labels = y_val[misclassified]
misclassified_predictions = y_pred[misclassified]
misclassified_reviews = data['text'].values[misclassified]

indices = random.sample(range(len(misclassified_labels)), 10)

for i in indices:
    print(f'Review: {misclassified_reviews[i]}, Actual: {misclassified_labels[i]}, Predicted: {misclassified_predictions[i]}')

size of data: 200000, shape of X: (200000,), shape of y: (200000,)
Fold: 16
Accuracy Score: 0.79135
Confusion Matrix: 
 [[15579  4456]
 [ 3890 16075]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.80      0.78      0.79     20035
           1       0.78      0.81      0.79     19965

    accuracy                           0.79     40000
   macro avg       0.79      0.79      0.79     40000
weighted avg       0.79      0.79      0.79     40000

Review: underworld will be down soon , Actual: 0, Predicted: 1
Review: off to coventry , Actual: 0, Predicted: 1
Review: @ashliewins boooo i got this tweet late. im outtie to pool parties , Actual: 1, Predicted: 0
Review: wahhh school  unbelieveably jealous of @JasmineBagci, she went on holiday to turkey yesterday!, Actual: 0, Predicted: 1
Review: Sprained ankle- doctor says I can't play tennis for 3 weeks! , Actual: 0, Predicted: 1
Review: ok shower &amp; baack to reality! , Actual: 1, Predi

In [44]:
model.save_model('sentiment_analysis_twitter.model')
joblib.dump(vectorizer, 'vectorizer_twitter.joblib', compress=True)

['vectorizer_twitter.joblib']