# Sentiment Analysis with XGBoost

* python script to find the best hyperparameters for sentiment analysis with XGBoost Classifier

## Install and import required libraries and packages

In [1]:
from google.colab import drive
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import pickle

## XGBoost classifier on labeled dataset

* train dataset: 80% of manually labeled data
* test dataset: 20% of manually labeled data

### Read dataset

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# train data
train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp project/Sentiment Analysis/final datasets/biden_tweets_labeled_train.csv')

train_df

Unnamed: 0.1,Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,1320,@Sundae_Gurl Biden's first move should be to p...,sundae gurl biden first move pardon hunter let...,0,2
1,782,@cobiscool @LilNasX Do you think joe Biden dri...,cobiscool lilnasx think joe biden drink caprisun,0,2
2,1128,"@ProudGayPatriot Don't worry, in the sense tha...",proudgaypatriot worry sense not getting electi...,1,1
3,199,@kclasgens @realDonaldTrump Not alone he didn'...,kclasgens realdonaldtrump not alone right even...,1,1
4,704,How Biden’s Climate Plans Will Shake Up Global...,biden climate plan shake global energy market,0,2
...,...,...,...,...,...
1407,1130,"@CNBC @CNBCPro If Biden comes to power, the go...",cnbc cnbcpro biden come power gold price return,1,1
1408,1294,"@KDFildesMBA We have a real President, Biden a...",kdfildesmba real president biden vice harris e...,1,1
1409,860,"What I care about at least as much, maybe more...",care least much maybe whether biden trump pres...,0,2
1410,1459,Joe Biden still doesn’t realize what the voter...,joe biden still realize voter democrat lost ac...,1,0


In [4]:
train_df.columns

Index(['Unnamed: 0', 'raw_text', 'clean_text', 'subjectivity', 'polarity'], dtype='object')

In [5]:
train_df = train_df.drop('Unnamed: 0', axis=1)

train_df

Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,@Sundae_Gurl Biden's first move should be to p...,sundae gurl biden first move pardon hunter let...,0,2
1,@cobiscool @LilNasX Do you think joe Biden dri...,cobiscool lilnasx think joe biden drink caprisun,0,2
2,"@ProudGayPatriot Don't worry, in the sense tha...",proudgaypatriot worry sense not getting electi...,1,1
3,@kclasgens @realDonaldTrump Not alone he didn'...,kclasgens realdonaldtrump not alone right even...,1,1
4,How Biden’s Climate Plans Will Shake Up Global...,biden climate plan shake global energy market,0,2
...,...,...,...,...
1407,"@CNBC @CNBCPro If Biden comes to power, the go...",cnbc cnbcpro biden come power gold price return,1,1
1408,"@KDFildesMBA We have a real President, Biden a...",kdfildesmba real president biden vice harris e...,1,1
1409,"What I care about at least as much, maybe more...",care least much maybe whether biden trump pres...,0,2
1410,Joe Biden still doesn’t realize what the voter...,joe biden still realize voter democrat lost ac...,1,0


In [6]:
# test data
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/nlp project/Sentiment Analysis/final datasets/biden_tweets_labeled_test.csv')

test_df

Unnamed: 0.1,Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,0,Sidney Powell amends court filing that said Ge...,sidney powell amends court filing said georgia...,1,1
1,1,@LisaMirandoCNN @wolfblitzer What's your thoug...,lisamirandocnn wolfblitzer thought cnn jake ta...,0,2
2,2,Breaking #FoxNews Alert : Joe Biden just revea...,breaking foxnews alert joe biden revealed goin...,0,2
3,3,"@RWPUSA Richard, have you heard there could be...",rwpusa richard heard could several republican ...,1,0
4,4,@LeafCavalier @AFJustinKG1 Is that a real-life...,leafcavalier afjustinkg real life question not...,1,0
...,...,...,...,...,...
349,349,@realDonaldTrump Thank you Biden!!!,realdonaldtrump thank biden,1,1
350,350,Kellyanne Conway acknowledges Biden as apparen...,kellyanne conway acknowledges biden apparent w...,0,2
351,351,@JeffTutorials @realDonaldTrump Ah Jeff. I ag...,jefftutorials realdonaldtrump ah jeff agree bi...,1,0
352,352,CNN Exclusive: Biden says he will ask American...,cnn exclusive biden say ask american wear mask...,0,2


In [7]:
test_df.columns

Index(['Unnamed: 0', 'raw_text', 'clean_text', 'subjectivity', 'polarity'], dtype='object')

In [8]:
test_df = test_df.drop('Unnamed: 0', axis=1)

test_df

Unnamed: 0,raw_text,clean_text,subjectivity,polarity
0,Sidney Powell amends court filing that said Ge...,sidney powell amends court filing said georgia...,1,1
1,@LisaMirandoCNN @wolfblitzer What's your thoug...,lisamirandocnn wolfblitzer thought cnn jake ta...,0,2
2,Breaking #FoxNews Alert : Joe Biden just revea...,breaking foxnews alert joe biden revealed goin...,0,2
3,"@RWPUSA Richard, have you heard there could be...",rwpusa richard heard could several republican ...,1,0
4,@LeafCavalier @AFJustinKG1 Is that a real-life...,leafcavalier afjustinkg real life question not...,1,0
...,...,...,...,...
349,@realDonaldTrump Thank you Biden!!!,realdonaldtrump thank biden,1,1
350,Kellyanne Conway acknowledges Biden as apparen...,kellyanne conway acknowledges biden apparent w...,0,2
351,@JeffTutorials @realDonaldTrump Ah Jeff. I ag...,jefftutorials realdonaldtrump ah jeff agree bi...,1,0
352,CNN Exclusive: Biden says he will ask American...,cnn exclusive biden say ask american wear mask...,0,2


In [9]:
train_df.shape, test_df.shape

((1412, 4), (354, 4))

### Vectorize data

In [10]:
# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 1,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)

In [11]:
train_vectors = vectorizer.fit_transform(train_df['clean_text'])
test_vectors = vectorizer.transform(test_df['clean_text'])

### XGBoost model

In [12]:
# define model 
model = GradientBoostingClassifier()
model_name = model.__class__.__name__
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 150, num = 3)]
learning_rate = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
max_depth = [5,20,50, None]
min_samples_split = list(range(1, 16))
min_samples_leaf = list(range(1, 16))
max_features = ['auto', 'sqrt']
grid = {'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'max_features': max_features}

# compile model
XGB_classifier = RandomizedSearchCV(model, grid, cv=5, n_iter=200, random_state=0,scoring='accuracy', error_score=0, n_jobs=-1)

# fit model
t0 = time.time()
grid_search = XGB_classifier.fit(train_vectors, train_df['polarity'])
t1 = time.time()

# predict with model
prediction_linear = grid_search.predict(test_vectors)
t2 = time.time()

# get train and test timings
time_linear_train = t1-t0
time_linear_predict = t2-t1

75 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 596, in fit
    monitor,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 672, in _fit_stages
    X_csr,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 246, in _fit_stage
    tree.fit(X, residual, sample_weight=sample_weight, check_input=False)
  File "/usr/local/lib/python3.7/dist

In [13]:
prediction_linear

array([1, 0, 2, 2, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 1, 0, 0, 2, 1, 0, 0, 1,
       2, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 2, 0, 1, 2, 2, 0, 2, 0, 0, 2,
       2, 0, 1, 1, 1, 2, 2, 0, 0, 1, 0, 0, 2, 1, 2, 2, 1, 0, 2, 2, 1, 1,
       1, 2, 0, 1, 0, 2, 0, 1, 1, 0, 2, 1, 1, 0, 1, 1, 2, 2, 0, 0, 0, 1,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 1, 0, 0, 0, 2,
       0, 0, 0, 0, 2, 2, 2, 1, 2, 2, 1, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 2, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 1, 2, 1, 1,
       2, 0, 0, 0, 1, 1, 1, 1, 0, 2, 0, 0, 0, 0, 2, 1, 2, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 2, 0, 0, 0, 2, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 2, 2,
       0, 2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, 0, 1, 0, 2, 0, 2, 0, 1,
       1, 2, 0, 2, 1, 0, 1, 1, 2, 0, 0, 1, 1, 2, 1,

### Export model

In [14]:
# add xgb scores to dataframe
test_df['xgb_score'] = prediction_linear.tolist()

test_df

Unnamed: 0,raw_text,clean_text,subjectivity,polarity,xgb_score
0,Sidney Powell amends court filing that said Ge...,sidney powell amends court filing said georgia...,1,1,1
1,@LisaMirandoCNN @wolfblitzer What's your thoug...,lisamirandocnn wolfblitzer thought cnn jake ta...,0,2,0
2,Breaking #FoxNews Alert : Joe Biden just revea...,breaking foxnews alert joe biden revealed goin...,0,2,2
3,"@RWPUSA Richard, have you heard there could be...",rwpusa richard heard could several republican ...,1,0,2
4,@LeafCavalier @AFJustinKG1 Is that a real-life...,leafcavalier afjustinkg real life question not...,1,0,0
...,...,...,...,...,...
349,@realDonaldTrump Thank you Biden!!!,realdonaldtrump thank biden,1,1,1
350,Kellyanne Conway acknowledges Biden as apparen...,kellyanne conway acknowledges biden apparent w...,0,2,1
351,@JeffTutorials @realDonaldTrump Ah Jeff. I ag...,jefftutorials realdonaldtrump ah jeff agree bi...,1,0,1
352,CNN Exclusive: Biden says he will ask American...,cnn exclusive biden say ask american wear mask...,0,2,2


In [15]:
# pickling the vectorizer
pickle.dump(vectorizer, open('tuned_xgb_vectorizer.sav', 'wb'))

In [16]:
# pickling the model
pickle.dump(XGB_classifier, open('tuned_xgb_classifier.sav', 'wb'))

### Evaluate model

In [17]:
print("Best Trainset Accuracy: %.2f%% using %s" % (grid_search.best_score_*100,grid_search.best_params_))

Best Trainset Accuracy: 59.84% using {'n_estimators': 150, 'min_samples_split': 11, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': None, 'learning_rate': 0.01}


In [18]:
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))

Training time: 3649.370012s; Prediction time: 0.052128s


In [19]:
acc_score = accuracy_score(test_df['polarity'], prediction_linear)

print("xgb accuracy: ", str(acc_score))

xgb accuracy:  0.7231638418079096


In [20]:
report_dict = classification_report(test_df['polarity'], prediction_linear, output_dict=True)

print('negative: ', report_dict['0'])
print('positive: ', report_dict['1'])
print('neutral: ', report_dict['2'])

negative:  {'precision': 0.7457627118644068, 'recall': 0.7904191616766467, 'f1-score': 0.7674418604651163, 'support': 167}
positive:  {'precision': 0.6631578947368421, 'recall': 0.65625, 'f1-score': 0.6596858638743456, 'support': 96}
neutral:  {'precision': 0.7439024390243902, 'recall': 0.6703296703296703, 'f1-score': 0.7052023121387282, 'support': 91}


In [21]:
report = classification_report(test_df['polarity'], prediction_linear)

print(report)

              precision    recall  f1-score   support

           0       0.75      0.79      0.77       167
           1       0.66      0.66      0.66        96
           2       0.74      0.67      0.71        91

    accuracy                           0.72       354
   macro avg       0.72      0.71      0.71       354
weighted avg       0.72      0.72      0.72       354

