In [None]:
import pandas as pd
import numpy as np
import pickle
from tqdm import notebook
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score

import optuna

In [None]:
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load the tweet dataset from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='final_tweets.csv')
# Preview the first three rows as a quick sanity check of the columns.
df.head(n=3)

In [None]:
# Class balance of the label column (bracket access avoids any clash with DataFrame attributes).
df['target'].value_counts()

In [None]:
# Create a stratified 75% / 25% train/test split.
# random_state is pinned so the split -- and therefore every score reported
# further down -- is identical after "Restart Kernel -> Run All".
df_train, df_test = train_test_split(
    df,
    test_size=0.25,
    stratify=df['target'],
    random_state=42,
)
# Reassign instead of mutating in place: the split results are derived from
# `df`, and in-place edits on derived frames can trigger pandas copy warnings.
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train.shape, df_test.shape

In [None]:
# Pull the 'text' (input) and 'target' (label) columns out of each split.
train_x, train_y = df_train['text'], df_train['target']
test_x, test_y = df_test['text'], df_test['target']

In [None]:
# Baseline TF-IDF vectorizer; max_features=5000 caps the vocabulary size.
vect = TfidfVectorizer(max_features=5000)
# Bare last expression so the notebook displays the vectorizer's repr.
vect

In [None]:
# Learn the vocabulary / IDF weights on the training text only, then reuse
# that fitted vectorizer on the test text.
# astype('U') casts every entry to a unicode string before vectorizing.
train_X_dtm = vect.fit_transform(
    raw_documents=train_x.values.astype('U'),
)
test_X_dtm = vect.transform(
    raw_documents=test_x.values.astype('U'),
)

In [None]:
# (rows, columns) of the train and test document-term matrices, in that order.
tuple(matrix.shape for matrix in (train_X_dtm, test_X_dtm))

In [None]:
# Baseline classifier: XGBClassifier built with no arguments (library defaults).
clf = XGBClassifier()
# Bare last expression so the notebook displays the estimator's repr.
clf

In [None]:
# Train the baseline classifier on the TF-IDF features of the training split.
clf.fit(X=train_X_dtm, y=train_y)

In [None]:
# Accuracy of the baseline model on the data it was trained on
# (compare with the test-set score to gauge overfitting).
y_pred = clf.predict(train_X_dtm)
print(
    'Model accuracy score with default hyperparameters on train set: {0:0.4f}'.format(
        accuracy_score(y_true=train_y, y_pred=y_pred)
    )
)

In [None]:
# Accuracy of the baseline model on the held-out test split.
y_pred = clf.predict(test_X_dtm)
print(
    'Model accuracy score with default hyperparameters on test set: {0:0.4f}'.format(
        accuracy_score(y_true=test_y, y_pred=y_pred)
    )
)

In [None]:
def objective(trial,train_x=train_x,train_y=train_y,test_x=test_x,test_y=test_y):
    """Optuna objective: validation accuracy of a TF-IDF + XGBoost pipeline.

    One trial samples XGBoost hyperparameters plus the TF-IDF vocabulary size,
    fits the pipeline on part of the training data and scores it on a
    validation split carved out of that same training data.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.
    train_x, train_y
        Training texts and labels. They are split again internally into a
        fitting part and a validation part.
    test_x, test_y
        Accepted only for backward compatibility and deliberately unused.
        Selecting hyperparameters (and early-stopping) on the test split would
        leak it into model selection and make the final test score optimistic.

    Returns
    -------
    float
        Accuracy on the internal validation split (to be maximised).
    """
    param = {
        # suggest_float(..., log=True) is the non-deprecated equivalent of
        # suggest_loguniform and samples from the same log-uniform range.
        'lambda': trial.suggest_float('lambda', 1e-4, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-4, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0, log=True),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 15, step=1),
        'n_estimators': trial.suggest_int('n_estimators', 80, 200),
    }
    max_feats = {
        'max_features': trial.suggest_int('max_features', 3000, 6000)
    }

    # Fixed random_state: every trial is scored on the same validation rows,
    # so trial scores are directly comparable.
    fit_x, val_x, fit_y, val_y = train_test_split(
        train_x,
        train_y,
        test_size=0.25,
        stratify=train_y,
        random_state=42,
    )

    model = XGBClassifier(**param, use_label_encoder=False)
    vect = TfidfVectorizer(**max_feats)
    # The vectorizer is fitted on the fitting part only; validation text is
    # transformed with that fitted vocabulary.
    fit_X_dtm = vect.fit_transform(fit_x.values.astype('U'))
    val_X_dtm = vect.transform(val_x.values.astype('U'))
    model.fit(
        fit_X_dtm,
        fit_y,
        eval_set=[(val_X_dtm, val_y)],
        early_stopping_rounds=100,
        verbose=False,
    )
    preds = model.predict(val_X_dtm)
    accuracy = accuracy_score(val_y, preds)
    return accuracy

In [None]:
# Hyperparameter optimisation using Optuna.
# A cell magic such as %%capture is only recognised on the very first line of
# a cell, and it would also swallow the summary prints below. Optuna's own
# logging level is lowered instead, which silences the per-trial messages.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Seed the sampler so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
import joblib

# Persist the finished study so the search does not have to be repeated.
joblib.dump(study, "study.pkl")
# Reload from the same file that was just written, as a round-trip check.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
study2 = joblib.load("study.pkl")

In [None]:
# Parameters of the best trial; this dict holds both the XGBoost settings and
# the TF-IDF 'max_features' value sampled inside `objective`.
best_params = study.best_trial.params

In [None]:
# Split the tuned parameters: 'max_features' belongs to the TF-IDF vectorizer,
# everything else is passed to the classifier.
max_feats = best_params['max_features']
model_params = {
    name: value
    for name, value in best_params.items()
    if name != 'max_features'
}

In [None]:
# Rebuild the vectorizer with the tuned vocabulary size (replaces the baseline `vect`).
vect = TfidfVectorizer(max_features=max_feats)
# Bare last expression so the notebook displays the vectorizer's repr.
vect

In [None]:
# Re-vectorize both splits with the tuned vectorizer: fit on the training
# text only, then apply the fitted vocabulary to the test text.
train_X_dtm = vect.fit_transform(
    raw_documents=train_x.values.astype('U'),
)
test_X_dtm = vect.transform(
    raw_documents=test_x.values.astype('U'),
)

In [None]:
# Final classifier built from the tuned XGBoost parameters (replaces the baseline `clf`).
clf = XGBClassifier(**model_params)
# Bare last expression so the notebook displays the estimator's repr.
clf

In [None]:
# Train the tuned classifier on the re-vectorized training split.
clf.fit(X=train_X_dtm, y=train_y)

In [None]:
# Accuracy of the tuned model on the training split.
# The message now says "tuned": `clf` here was built from the Optuna
# parameters, so the copy-pasted "default hyperparameters" label was wrong.
y_pred = clf.predict(train_X_dtm)
print('Model accuracy score with tuned hyperparameters on train set: {0:0.4f}'.format(accuracy_score(train_y, y_pred)))

In [None]:
# Accuracy of the tuned model on the held-out test split.
# The message now says "tuned": `clf` here was built from the Optuna
# parameters, so the copy-pasted "default hyperparameters" label was wrong.
y_pred = clf.predict(test_X_dtm)
print('Model accuracy score with tuned hyperparameters on test set: {0:0.4f}'.format(accuracy_score(test_y, y_pred)))

In [None]:
# Save the trained classifier to disk.
# The context manager closes the file even if pickle.dump raises.
# NOTE(review): only the classifier is saved here; the fitted `vect` is needed
# to turn new text into features -- confirm it is persisted elsewhere.
with open("XGB_model.pkl", 'wb') as pickle_out:
    pickle.dump(clf, pickle_out)