**Importing .csv file**

In [148]:
import pandas as pd
import numpy as np
import re

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

**Importing tweets data**

In [149]:
df_tweets = pd.read_csv('../data/Tweets.csv', usecols=['text', 'selected_text', 'sentiment'])
pd.set_option('display.max_columns', None)

df_tweets.head()

Unnamed: 0,text,selected_text,sentiment
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,my boss is bullying me...,bullying me,negative
3,what interview! leave me alone,leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [150]:
df_tweets.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

**Removing records with null values**

In [151]:
df_tweets = df_tweets.drop(df_tweets[df_tweets['text'].isna()].index.tolist())
df_tweets.reset_index(drop=True, inplace=True)
print(df_tweets.isna().sum())

text             0
selected_text    0
sentiment        0
dtype: int64


In [152]:
df_tweets.head()

Unnamed: 0,text,selected_text,sentiment
0,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,my boss is bullying me...,bullying me,negative
3,what interview! leave me alone,leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


**Cleaning up the text**

In [153]:
def clean_text(df, col_name):

    processed_features = []

    for sentence in range(0, len(df)):
        # Remove all the special characters
        processed_feature = re.sub(r'\W', ' ', str(df.loc[sentence, 'text']))

        # remove all single characters
        processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

        # Remove single characters from the start
        processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

        # Substituting multiple spaces with single space
        processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

        # Removing prefixed 'b'
        processed_feature = re.sub(r'^b\s+', '', processed_feature)

        # Converting to Lowercase
        processed_feature = processed_feature.lower().strip()

        processed_features.append(processed_feature)

    return processed_features

In [154]:
df_tweets['text'] = clean_text(df_tweets, 'text')
df_tweets['text'].head()

0                       d have responded if were going
1             sooo sad will miss you here in san diego
2                               my boss is bullying me
3                        what interview leave me alone
4    sons of why couldn they put them on the releas...
Name: text, dtype: object

**Creating train, val, and test datasets** 

In [155]:
df_fulltrain, df_test = train_test_split(df_tweets[['text', 'sentiment']], test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_fulltrain, test_size=0.25, random_state=1)

len(df_train), len(df_val), len(df_test), len(df_tweets)

(16488, 5496, 5496, 27480)

In [156]:
df_train.shape, df_val.shape, df_test.shape

((16488, 2), (5496, 2), (5496, 2))

In [157]:
df_train.columns

Index(['text', 'sentiment'], dtype='object')

In [158]:
y_train = df_train.sentiment.values
y_val = df_val.sentiment.values
y_test = df_test.sentiment.values

del df_train['sentiment']
del df_val['sentiment']
del df_test['sentiment']

In [159]:
df_train.head()

Unnamed: 0,text
20135,listening to fountain of youth by supastition ...
12611,no sir they did not was amazed when woke up th...
10342,awww that sucks but they re so awesome when yo...
12499,leno last show tonight
8661,thanks funny cause it true


**Converting string to bag of words**

In [160]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(df_train['text']).toarray()
X_val = tfidf.transform(df_val['text']).toarray()

In [161]:
X_train.shape

(16488, 19278)

**Training Random Forests model**

This ensemble algorithm (i.e. uses multiple ML models) fits multiple decision tree classifiers on the dataset, and picks the majority vote among all decision trees as the final ouptut of the model. Some important hyperparameters of this model are, n_estimators (number of trees), bootstrap (use subsamples to build each tree instead of all data), max_depth (defines the maximum depth of all trees), etc. 

In [16]:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

KeyboardInterrupt: 

**Evaluating model**

In [None]:
y_pred = model.predict(X_val)

In [None]:
print(accuracy_score(y_val, y_pred))

0.6743085880640466


In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

    negative       0.72      0.51      0.60      1538
     neutral       0.61      0.78      0.68      2269
    positive       0.77      0.68      0.72      1689

    accuracy                           0.67      5496
   macro avg       0.70      0.66      0.67      5496
weighted avg       0.69      0.67      0.67      5496



**Training SVM model**

SVMs create a decision boundary, which is a plane or line, that can seperate data clusters with the largest margin (which is the sum of the distances from the decision boundary to the nearest datapoint in both data clusters being classified) possible. If the data is not linearly seperable, SVMs extend the data into a higher dimension, by using functions known as kernels, and then finds a decision boundary in this new dimension. Kernels apply certain transformations on the data, which generate new features that are used to extend the dimension of the data, thereby hopefully making them linearly seperable. For example, the polynomial kernel takes one or more of the features and raises them to some power greater than or equal to 2.

In [None]:
model_SVM = SVC(random_state=1)
model_SVM.fit(X_train, y_train)

**Evaluation**

**Training XGBoost model**

Uses gradient boosting, which is a ensemble algorithm that combines multiple decision trees in sequential order, where each subsequent tree is focused on better predicting the cases where the previous tree performed poorly. This process is repeated multiple times, which results in better predictions than if a single model was used.  

In [31]:
dv = DictVectorizer()
y_train_xgb = dv.fit_transform(pd.DataFrame(y_train).to_dict(orient='records')).toarray()
y_val_xgb = dv.transform(pd.DataFrame(y_val).to_dict(orient='records')).toarray()

In [32]:
features = tfidf.get_feature_names_out()
dtrain = xgb.DMatrix(X_train, label=y_train_xgb, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_train_xgb, feature_names=features)

In [30]:
xgb_params = { 
    'ets': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': 8,

    'seed': 1,
    'verbosity': 0
}

model_xgb = xgb.train(xgb_params, dtrain, num_boost_round=200)

**Evaluation**

In [None]:
y_pred_probas = model_xgb.predict(dval)

In [None]:
y_pred_probas[:5]

array([[0.12928775, 0.6394369 , 0.3456895 ],
       [0.17623104, 0.5900711 , 0.16093381],
       [0.09746474, 0.04687188, 0.9736934 ],
       [0.05580972, 0.5229359 , 0.4790336 ],
       [0.00691166, 0.29903287, 0.613098  ]], dtype=float32)

In [None]:
idx = np.argmax( y_pred_probas, axis=-1 )
y_pred_vect = np.zeros( y_pred_probas.shape )
y_pred_vect[ np.arange(y_pred_vect.shape[0]), idx ] = 1

idx_neg = np.where(y_pred_vect[:,0] == 1)[0]
idx_net = np.where(y_pred_vect[:,1] == 1)[0]
idx_pos = np.where(y_pred_vect[:,2] == 1)[0]

y_pred = pd.DataFrame(np.empty(y_pred_vect.shape[0]))

y_pred.iloc[idx_neg] = 'negative'
y_pred.iloc[idx_net] = 'neutral'
y_pred.iloc[idx_pos] = 'positive'

In [None]:
y_pred

Unnamed: 0,0
0,neutral
1,neutral
2,positive
3,neutral
4,positive
...,...
5491,neutral
5492,positive
5493,positive
5494,neutral


In [85]:
accuracy_score(y_val, y_pred)

0.6863173216885007

In [87]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

    negative       0.72      0.55      0.62      1538
     neutral       0.62      0.77      0.69      2269
    positive       0.78      0.69      0.73      1689

    accuracy                           0.69      5496
   macro avg       0.71      0.67      0.68      5496
weighted avg       0.70      0.69      0.68      5496



**Training LightGBM model**

A gradient boosting algorithm similar to XGBoost. However, LightGBM is faster at training than XGBoost, due to use of leaf-wise growth of decision trees, as compared to XGBoost's level-wise growth. This results in higher accuracy, but could also cause overfitting.

In [162]:
y_train_lgbm = pd.Series(y_train)
y_val_lgbm = pd.Series(y_val)

classes = y_train_lgbm.value_counts().index

y_train_lgbm.loc[y_train_lgbm == classes[0]] = 0
y_train_lgbm.loc[y_train_lgbm == classes[1]] = 1 
y_train_lgbm.loc[y_train_lgbm == classes[2]] = 2

y_val_lgbm.loc[y_val_lgbm == classes[0]] = 0 
y_val_lgbm.loc[y_val_lgbm == classes[1]] = 1
y_val_lgbm.loc[y_val_lgbm == classes[2]] = 2 

In [163]:
y_train_lgbm

0        0
1        0
2        0
3        0
4        1
        ..
16483    1
16484    0
16485    2
16486    2
16487    2
Length: 16488, dtype: object

In [164]:
train_data = lgbm.Dataset(X_train, label=y_train_lgbm)
val_data = lgbm.Dataset(X_val, label=y_val_lgbm)

In [165]:
lgbm_params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': 'true',
    'boosting': 'gbdt',
    'num_leaves': 63,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 20,
    'learning_rate': 0.01,
    'verbose': -1
}

In [166]:
model_lgbm = lgbm.train(lgbm_params, train_data, valid_sets=val_data, num_boost_round=5000, early_stopping_rounds=50)



ValueError: Series.dtypes must be int, float or bool

**Evaluation**

**Training CatBoost model**

Another gradient boosting algorithm. CatBoost contains native support for many feature types, such as numerical, categroical, and text. Additionally, CatBoost uses symmetrical decision trees, which allows for faster predictions, and reduces the chances of overfitting.

**Evaluation**

**Dumping model and tfid vectorizer to an external file**

In [None]:
import pickle

In [None]:
output_file = "../models/model.bin"

f_out = open(output_file, 'wb') 
pickle.dump((tfidf, model), f_out)
f_out.close()