# Machine Learning

### Vectorization

Imports

In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

Data

In [2]:
df = pd.read_csv('../data/cleaned_tweets.csv')
df.head()

Unnamed: 0,id,tweet,sentiment,reply_count
0,1223752255846912000,fox corporation owner fox news trying bully ro...,-0.52,150
1,1223738389003952128,folk hear cornovirus deal news heck doctor rec...,-0.2,45
2,1223748267609030659,news finally excited journey potential finding...,0.5,6
3,1223739174160928773,good news person investigation novel coronavir...,-0.25,12
4,1223737953291128837,two avid golfer promised whoever died first wo...,0.166667,36


Create corpus

In [3]:
corpus = df['tweet'].values.astype(str)

Perform TF-IDF vectorization

In [4]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)
print(f'There are {tfidf_matrix.shape[1]} unique words in the dataset.')

There are 31142 unique words in the dataset.


Save TF-IDF transformation

In [None]:
# Save the model to disk
pickle.dump(tfidf, open('../server/models/tfidf_vectorizer.pkl', 'wb'))

Sample of some features

In [58]:
features = tfidf.get_feature_names_out()
features[100:110]

array(['abner', 'abnormality', 'abo', 'aboard', 'abolish', 'abolished',
       'abolishing', 'abolition', 'abolitionist', 'abomination'],
      dtype=object)

Show TF-IDF matrix

In [63]:
tfidf_df = pd.DataFrame(tfidf_matrix.todense())
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31132,31133,31134,31135,31136,31137,31138,31139,31140,31141
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Training

In [96]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import scipy.stats as st

In [72]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['sentiment'], test_size=0.2, random_state=42)

In [87]:
# Create classifier
clf = XGBRegressor()

# Do grid search to tune hyperparameters
param_dist = {
    'n_estimators': st.randint(10, 100),
    'learning_rate': st.uniform(0, 0.3),
    'max_depth': st.randint(10, 50),
    'lambda': st.expon(0, 10),
}
random_clf = RandomizedSearchCV(clf, param_distributions=param_dist, n_jobs=-1, n_iter=100, scoring='neg_mean_squared_error', cv=5, verbose=10)

# train classifier
random_clf.fit(X_train, y_train)

[CV 1/5; 100/100] END lambda=44.35734297683265, learning_rate=0.09092388838675165, max_depth=39, n_estimators=71;, score=-0.197 total time=  44.0s
[CV 2/5; 100/100] END lambda=44.35734297683265, learning_rate=0.09092388838675165, max_depth=39, n_estimators=71;, score=-0.195 total time=  44.1s
[CV 4/5; 100/100] END lambda=44.35734297683265, learning_rate=0.09092388838675165, max_depth=39, n_estimators=71;, score=-0.200 total time=  43.3s
[CV 3/5; 100/100] END lambda=44.35734297683265, learning_rate=0.09092388838675165, max_depth=39, n_estimators=71;, score=-0.199 total time=  43.4s
[CV 5/5; 100/100] END lambda=44.35734297683265, learning_rate=0.09092388838675165, max_depth=39, n_estimators=71;, score=-0.186 total time=  43.5s


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categorical=False,
                                          eval_metric=None, gamma=None,
                                          gpu_id=None, grow_policy=None,
                                          importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None, max_bin=None,
                                          m...
                   param_distributions={'lambda': <scipy.stats._distn_infrastructure.rv_frozen object at 0

In [88]:
random_clf.best_params_

{'lambda': 16.83453883045462,
 'learning_rate': 0.1435953623400067,
 'max_depth': 14,
 'n_estimators': 59}

In [14]:
clf = XGBRegressor(
    n_estimators=59,
    learning_rate=0.144,
    max_depth=15,
    reg_lambda=16.835
)

grid_clf = GridSearchCV(clf, param_grid={}, scoring='neg_mean_squared_error', cv=5, verbose=5)
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END .................................., score=-0.193 total time=   2.5s
[CV 2/5] END .................................., score=-0.191 total time=   2.5s
[CV 3/5] END .................................., score=-0.195 total time=   2.7s
[CV 4/5] END .................................., score=-0.196 total time=   2.6s
[CV 5/5] END .................................., score=-0.180 total time=   2.7s


GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=None, gpu_id=None, grow_policy=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=0.144, max_bin=None,
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=15, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estimators=59,
  

### Testing

In [15]:
model = grid_clf.best_estimator_
mean_squared_error(y_test, model.predict(X_test))

0.19710004165135916

### Save Model

In [17]:
import pickle

pickle.dump(model, open('../server/models/xgboost_model.pkl', 'wb'))

In [18]:
model = open('../server/models/xgboost_model.pkl', 'rb')
model = pickle.load(model)
model = grid_clf.best_estimator_
mean_squared_error(y_test, model.predict(X_test))

0.19710004165135916