In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
import sklearn as sk
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

### We will use meta features as inputs to our XGBoost model

In [2]:
# Load the pre-computed feature tables for the train and test splits.
train_data = pd.read_csv('train_data_features.csv')
test_data = pd.read_csv('test_data_features.csv')

# Columns removed before modelling; every column NOT listed here is kept as
# a model input. 'target' is the label, so it is removed from the inputs too.
cols2drop = [
    'Unnamed: 0', 'id', 'owner_id', 'text', 'likes',
    'stop_word_count', 'url_count', 'mean_word_length',
    'hashtag_count', 'mention_count', 'text_cleaned',
    'target',
]

# Training labels and feature matrix.
y = train_data['target']
X = train_data.drop(cols2drop, axis=1)

# Test labels and feature matrix, built with exactly the same column filter.
y_test = test_data['target']
X_test = test_data.drop(cols2drop, axis=1)

In [3]:
# Keep 90% of the rows for fitting and hold the rest out for validation.
# The split is shuffled, stratified on the label and seeded so that it is
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.9,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [4]:
# Hyper-parameters passed to the XGBoost classifier.
# NOTE(review): 'gpu_hist' requires a CUDA-capable GPU and is reported as
# deprecated by recent XGBoost releases (replacement: tree_method='hist'
# together with device='cuda') -- confirm against the installed version.
params = dict(
    n_estimators=1024,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.96,
    colsample_bytree=0.80,
    reg_lambda=1,
    reg_alpha=1,
    gamma=1.4,
    random_state=16,
    objective='binary:logistic',
    tree_method='gpu_hist',
)

# Fit on the training portion only; the validation rows stay unseen.
model = XGBClassifier(**params)
model.fit(X_train, y_train)

# Hard labels feed accuracy / F1; column 1 of predict_proba feeds ROC AUC.
y_pred = model.predict(X_val)
y_pred_proba = model.predict_proba(X_val)[:, 1]

val_scores = (
    accuracy_score(y_val, y_pred),
    f1_score(y_val, y_pred),
    roc_auc_score(y_val, y_pred_proba),
)
print('accuracy: {:.3f}, f1 score: {:.3f}, roc_auc: {:.3f}'.format(*val_scores))

accuracy: 0.968, f1 score: 0.969, roc_auc: 0.996


In [5]:
# Score the fitted model on the held-out test set.
# Note: this rebinds y_pred / y_pred_proba, which previously held the
# validation-set predictions.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

test_scores = (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred),
    roc_auc_score(y_test, y_pred_proba),
)
print('accuracy: {:.3f}, f1_score: {:.3f}, roc_auc_score: {:.3f}'.format(*test_scores))

accuracy: 0.965, f1_score: 0.965, roc_auc_score: 0.994


### Nice improvement! Up from accuracy: 0.899, f1_score: 0.897, roc_auc_score: 0.916