In [1]:
## For data handling
import pandas as pd
import numpy as np
import xgboost
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import train_test_split, StratifiedKFold
from tqdm.notebook import tqdm

## For plotting
import matplotlib.pyplot as plt
from seaborn import set_style
set_style("whitegrid")

In [2]:
# Read the combined dataset (with its precomputed embedding column) from disk
data_path = "../raw data/combined_data_with_embeddings.parquet"
data = pd.read_parquet(data_path)

In [3]:
# Display the loaded frame as the cell output to check its columns and row count
data

Unnamed: 0,Text,Label,Original dataset,Row in original dataset,embedding
0,The idea of graduating high school in three ye...,Machine,essays,26613,"[0.01476596, -0.013095475, 0.002932728, -0.011..."
1,"Hey, I'm so excited to write this essay about ...",Machine,essays,26326,"[0.00292786, -0.013083563, 0.0047025573, -0.00..."
2,Introduction\n\nSelf-reliance is a concept tha...,Machine,essays,30579,"[-0.015756093, -0.023221416, -0.010959062, -0...."
3,"Sure, here's my attempt at writing an essay as...",Machine,essays,33547,"[0.013002162, 0.011017485, -0.03551094, 0.0290..."
4,The legalization of marijuana is a highly deba...,Machine,essays,33768,"[0.0016188276, 0.011155421, -0.004596148, 0.00..."
...,...,...,...,...,...
79995,The 1934 WANFL season was the 50th season of t...,Machine,wiki,116735,"[-0.02375175, 0.0018231793, -0.023106767, 0.02..."
79996,The Salle du Bel-Air or Salle du Bel-Air is a ...,Machine,wiki,101963,"[-0.028311426, -0.008757826, -0.0125656165, 0...."
79997,Jasmine Ser Xiang Wei (born 24 September 1987)...,Machine,wiki,92244,"[-0.0002513304, 0.004520807, -0.0046105087, -0..."
79998,The Nantuo 181 class tug is a Chinese diesel-e...,Machine,wiki,84183,"[-0.017250419, -0.012810612, -0.02905077, -0.0..."


In [4]:
# Put embedding coords into separate columns for ease of fitting below.
# Written so that re-running the cell leaves `data` unchanged.
# Reuse data's own index: concat(axis=1) aligns on index, so a fresh RangeIndex
# would misalign rows whenever data is not indexed 0..n-1.
embedding_cols = pd.DataFrame(np.array(data['embedding'].to_list()), index = data.index)
# Drop coordinate columns left over from a previous run before adding them again
data = pd.concat([data.drop(columns = embedding_cols.columns, errors = 'ignore'),
                  embedding_cols],
                 axis = 1)
# Encode labels as 0 = 'Human', 1 = anything else. Skip when the column is
# already numeric: re-applying the mapping to 0/1 values would set every label to 1.
if not pd.api.types.is_numeric_dtype(data['Label']):
    data['Label'] = (data['Label'] != 'Human').astype('int64')

In [5]:
# Train / test split stratified by label and original dataset.
# The combined stratification key is built with vectorised string concatenation;
# a row-wise apply would construct a Series for every row just to join two fields.
data['Label + Dataset'] = data['Label'].astype(str) + "_" + data['Original dataset'].astype(str)
train, test = train_test_split(data,
                               stratify = data['Label + Dataset'],
                               train_size = 0.8,
                               random_state = 406)

In [14]:
# 5-fold cross validation, stratified by label and original dataset.
# Accuracy is computed with `acc` (sklearn.metrics.accuracy_score), imported
# with the other imports at the top of the notebook.
kfold = StratifiedKFold(n_splits = 5,
                        shuffle = True,
                        random_state = 406)

# Slice features (embedding coordinate columns 0..1023) and labels once,
# instead of copying the whole training frame in every fold
train_x = train.loc[:, 0:1023]
train_y = train['Label']

acc_scores = []

for fold, (t_index, v_index) in enumerate(kfold.split(train, train['Label + Dataset'])):
    # The splitter yields positional indices, hence iloc
    tt_x, tt_y = train_x.iloc[t_index], train_y.iloc[t_index]
    vv_x, vv_y = train_x.iloc[v_index], train_y.iloc[v_index]
    print(f"Fitting fold {fold}")
    xgb = xgboost.XGBClassifier()
    xgb.fit(tt_x.values, tt_y.values)
    print(f"Predicting fold {fold}")
    # Predict on a plain array, the same input type the model was fitted on
    pred = xgb.predict(vv_x.values)
    acc_scores.append(acc(vv_y.values, pred))

print("Final accuracy scores for each fold:")
print(acc_scores)

Fitting fold 0
Predicting fold 0
Fitting fold 1
Predicting fold 1
Fitting fold 2
Predicting fold 2
Fitting fold 3
Predicting fold 3
Fitting fold 4
Predicting fold 4
Final accuracy scores for each fold:
[0.72015625, 0.716171875, 0.717109375, 0.72359375, 0.716328125]


In [16]:
# xgboost out-of-the-box: fit on 80% of the training split and score on the
# remaining 20% (tt / vv). The test split is not touched here.
tt, vv = train_test_split(train,
                          stratify = train['Label + Dataset'],
                          train_size = 0.8,
                          random_state = 406)
# Columns 0..1023 hold the embedding coordinates used as features
tt_x = tt.loc[:, 0:1023]
tt_y = tt['Label']
vv_x = vv.loc[:, 0:1023]
vv_y = vv['Label']
xgb = xgboost.XGBClassifier()
xgb.fit(tt_x.values, tt_y.values)
# Predict on a plain array, the same input type the model was fitted on
pred = xgb.predict(vv_x.values)
print(f"Accuracy: {acc(vv_y.values, pred)}")

Accuracy: 0.72484375


In [26]:
# Brief grid search over the number of trees and the tree depth, fitted on
# tt_x / tt_y and scored on vv_x / vv_y from the split above.

estimators = [10, 50, 100, 300, 500]
max_depths = [3, 5, 7]
# (n_estimators, max_depth) -> validation accuracy, kept so the results can be
# compared programmatically and not only read off the printed log
accuracies = {}
# The arrays are the same for every parameter combination, so convert them once
fit_x, fit_y = tt_x.values, tt_y.values
val_x, val_y = vv_x.values, vv_y.values
for n_estimators in estimators:
    for max_depth in max_depths:
        xgb = xgboost.XGBClassifier(n_estimators = n_estimators,
                           max_depth = max_depth,
                           random_state = 406)
        xgb.fit(fit_x, fit_y)
        # Predict on a plain array, the same input type the model was fitted on
        pred = xgb.predict(val_x)
        score = acc(val_y, pred)
        accuracies[(n_estimators, max_depth)] = score
        print("-----------------------------------------")
        print(f"Using n_estimators={n_estimators}, max_depth={max_depth}")
        print(f"Accuracy: {score}")

-----------------------------------------
Using n_estimators=10, max_depth=3
Accuracy: 0.638984375
-----------------------------------------
Using n_estimators=10, max_depth=5
Accuracy: 0.660078125
-----------------------------------------
Using n_estimators=10, max_depth=7
Accuracy: 0.669609375
-----------------------------------------
Using n_estimators=50, max_depth=3
Accuracy: 0.682734375
-----------------------------------------
Using n_estimators=50, max_depth=5
Accuracy: 0.70671875
-----------------------------------------
Using n_estimators=50, max_depth=7
Accuracy: 0.70328125
-----------------------------------------
Using n_estimators=100, max_depth=3
Accuracy: 0.70859375
-----------------------------------------
Using n_estimators=100, max_depth=5
Accuracy: 0.720390625
-----------------------------------------
Using n_estimators=100, max_depth=7
Accuracy: 0.7221875
-----------------------------------------
Using n_estimators=300, max_depth=3
Accuracy: 0.72515625
------------

In [27]:
# xgboost with many estimators (1500 trees, depth 5): fit on 80% of the
# training split and score on the remaining 20%. The split uses the same
# arguments and random_state as the out-of-the-box cell, so tt / vv are the
# same rows and the accuracies are directly comparable.
tt, vv = train_test_split(train,
                          stratify = train['Label + Dataset'],
                          train_size = 0.8,
                          random_state = 406)
# Columns 0..1023 hold the embedding coordinates used as features
tt_x = tt.loc[:, 0:1023]
tt_y = tt['Label']
vv_x = vv.loc[:, 0:1023]
vv_y = vv['Label']
xgb = xgboost.XGBClassifier(n_estimators = 1500,
                           max_depth = 5,
                           random_state = 406)
xgb.fit(tt_x.values, tt_y.values)
# Predict on a plain array, the same input type the model was fitted on
pred = xgb.predict(vv_x.values)
print(f"Accuracy: {acc(vv_y.values, pred)}")

Accuracy: 0.7525
