In [35]:
import pandas as pd
import numpy as np
import logging
import sys
import optuna
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from tqdm import tqdm
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [57]:
ABRIDGED_RUN = False

In [58]:
data = pd.read_parquet("../raw data/embeddings-SBERT.parquet")
if ABRIDGED_RUN == True:
    data = data.head(20)
print("Number of rows in dataframe: " + str(len(data)))
data.head()


Number of rows in dataframe: 80000


Unnamed: 0,Text,Label,Model,Original dataset,Row in original dataset,embedding_light,embedding_full
0,While driverless cars present many promising b...,Machine,darragh_claude_v7,essays,13355,"[-0.0229047, 0.08687921, 0.019888217, 0.026090...","[-0.015085863, 0.02961129, -0.008236221, 0.003..."
1,Homework Clubs: The Key to Unlocking Academic ...,Machine,llama2_chat,essays,7249,"[-0.02119578, -0.028534176, 0.012626177, 0.021...","[0.02044188, 0.023948455, 0.0008347047, 0.0031..."
2,"""The legalization of marijuana has been a cont...",Machine,falcon_180b_v1,essays,2603,"[0.13941133, 0.07467412, 0.0131247295, -0.0430...","[-0.010192346, 0.12589885, 0.032203812, 0.0045..."
3,Taking the opportunity to learn new things can...,Machine,mistral7binstruct_v1,essays,3993,"[-0.0354803, 0.005210507, 0.013692067, 0.04516...","[0.0003891297, -0.0013969073, -0.013199714, -0..."
4,Working with a partner is an effective way fo...,Machine,mistral7binstruct_v2,essays,3773,"[-0.06940715, 0.047554933, 0.03059095, 0.00423...","[0.038683362, 0.020366993, -0.02159926, 0.0061..."


In [59]:
data = pd.concat([data, pd.DataFrame(np.array(data['embedding_full'].to_list()))], axis = 1)
data['Label'] = data['Label'].apply(lambda x: 0 if x == 'Human' else 1)
data['Label + Dataset'] = data.apply(lambda x:str(x['Label']) + "_" + x['Original dataset'], axis = 1)

In [60]:
data.head()

Unnamed: 0,Text,Label,Model,Original dataset,Row in original dataset,embedding_light,embedding_full,0,1,2,...,759,760,761,762,763,764,765,766,767,Label + Dataset
0,While driverless cars present many promising b...,1,darragh_claude_v7,essays,13355,"[-0.0229047, 0.08687921, 0.019888217, 0.026090...","[-0.015085863, 0.02961129, -0.008236221, 0.003...",-0.015086,0.029611,-0.008236,...,-0.016958,-0.052456,-0.009349,0.019803,-0.014549,-0.058771,-0.011402,0.035806,-0.006425,1_essays
1,Homework Clubs: The Key to Unlocking Academic ...,1,llama2_chat,essays,7249,"[-0.02119578, -0.028534176, 0.012626177, 0.021...","[0.02044188, 0.023948455, 0.0008347047, 0.0031...",0.020442,0.023948,0.000835,...,0.015699,0.01399,-0.010262,-0.036643,-0.076902,-0.054551,0.016694,0.040694,-0.068161,1_essays
2,"""The legalization of marijuana has been a cont...",1,falcon_180b_v1,essays,2603,"[0.13941133, 0.07467412, 0.0131247295, -0.0430...","[-0.010192346, 0.12589885, 0.032203812, 0.0045...",-0.010192,0.125899,0.032204,...,0.045021,-0.090693,-0.006985,-0.035033,0.017864,-0.041333,-0.005947,0.015107,0.012057,1_essays
3,Taking the opportunity to learn new things can...,1,mistral7binstruct_v1,essays,3993,"[-0.0354803, 0.005210507, 0.013692067, 0.04516...","[0.0003891297, -0.0013969073, -0.013199714, -0...",0.000389,-0.001397,-0.0132,...,-5.7e-05,-0.025593,-0.031279,0.005985,0.002542,-0.056115,0.007472,0.148076,-0.04513,1_essays
4,Working with a partner is an effective way fo...,1,mistral7binstruct_v2,essays,3773,"[-0.06940715, 0.047554933, 0.03059095, 0.00423...","[0.038683362, 0.020366993, -0.02159926, 0.0061...",0.038683,0.020367,-0.021599,...,0.01771,-0.000647,0.001769,-0.024092,-0.066796,-0.024826,0.013834,0.105534,-0.056507,1_essays


In [61]:
le = LabelEncoder()

In [67]:
def objective(trial):
    classifier_name = trial.suggest_categorical(
        'classifier', 
        ['RandomForest', 'AdaBoostClassifier', 'HistGradientBoostingClassifier', 'XGBoost']
    )
    
    X = data.loc[:, 0:767]
    y = le.fit_transform(data['Label'])
    
    cv_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    if classifier_name == "RandomForest":
        param = {
            "max_depth": trial.suggest_int('rf_max_depth', 2, 32, log=True),
            "n_estimators": trial.suggest_int('rf_n_estimators', 10, 1000, log=True),
            "max_features": trial.suggest_categorical('rf_max_features', ['log2', 'sqrt']) 
        }
        classifier_obj = RandomForestClassifier(**param)
    
    elif classifier_name == "AdaBoostClassifier":
        param = {
            "n_estimators": trial.suggest_int('ada_n_estimators', 10, 1000, log=True),
            "learning_rate": trial.suggest_float("ada_learning_rate", 0.01, 1.0, log=True)
        }
        classifier_obj = AdaBoostClassifier(**param)
    
    elif classifier_name == "HistGradientBoostingClassifier":
        param = {
            "max_iter": trial.suggest_int('hist_max_iter', 10, 100),
            "max_depth": trial.suggest_int('hist_max_depth', 2, 32, log=True),
            "learning_rate": trial.suggest_float("hist_learning_rate", 0.01, 1.0, log=True)
        }
        classifier_obj = HistGradientBoostingClassifier(**param)
    
    elif classifier_name == "XGBoost":
        param = {
            "n_jobs": 1,
            "eval_metric": 'auc',
            "use_label_encoder": False,
            "n_estimators": trial.suggest_int('xgb_n_estimators', 10, 1000, log=True),
            "max_depth": trial.suggest_int('xgb_max_depth', 2, 32, log=True),
            "learning_rate": trial.suggest_float("xgb_eta", 1e-8, 1.0, log=True),
            "gamma": trial.suggest_float("xgb_gamma", 1e-8, 1.0, log=True),
            "reg_alpha": trial.suggest_float("xgb_alpha", 1e-8, 1.0, log=True),
            "reg_lambda": trial.suggest_float("xgb_lambda", 1e-8, 1.0, log=True),
            "booster": trial.suggest_categorical('xgb_booster', ['gbtree', 'gblinear', 'dart'])
        }
        
        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
        classifier_obj = xgb.XGBClassifier(**param)
    
    cv_results = cross_validate(
        estimator=classifier_obj,
        X=X,
        y=y,
        scoring='accuracy',
        cv=cv_skf
    )
    
    print(param)
    print(cv_results)
    mean_score = cv_results['test_score'].mean()
    return mean_score

In [68]:
study = optuna.create_study(study_name="best_clr_ipynb",
                            direction='maximize',
                            load_if_exists=True,storage="sqlite:///best_clr_ipynb.db", load_if_exists=True)

study.optimize(objective, n_trials=10, show_progress_bar = True)

[I 2024-08-23 15:06:52,430] Using an existing study with name 'best_clr_ipynb' instead of creating a new one.


  0%|          | 0/10 [00:00<?, ?it/s]



{'n_estimators': 29, 'learning_rate': 0.012940925506561354}
{'fit_time': array([184.6389091 , 176.58479881, 179.71732879, 177.18944716,
       172.74654412]), 'score_time': array([0.09733391, 0.0939002 , 0.09763193, 0.13433886, 0.09522009]), 'test_score': array([0.5641875, 0.563875 , 0.5610625, 0.5615   , 0.570375 ])}
[I 2024-08-23 15:21:44,012] Trial 7 finished with value: 0.5641999999999999 and parameters: {'classifier': 'AdaBoostClassifier', 'ada_n_estimators': 29, 'ada_learning_rate': 0.012940925506561354}. Best is trial 4 with value: 0.6954874999999999.
{'n_jobs': 1, 'eval_metric': 'auc', 'use_label_encoder': False, 'n_estimators': 64, 'max_depth': 6, 'learning_rate': 0.03327588215984515, 'gamma': 0.0008184519148281169, 'reg_alpha': 1.0101556013232293e-06, 'reg_lambda': 0.07569057354377955, 'booster': 'gbtree'}
{'fit_time': array([21.64416528, 22.22803617, 21.014992  , 21.56227088, 23.15121794]), 'score_time': array([0.08862782, 0.08428693, 0.10565186, 0.11824417, 0.10205603]), 't

Parameters: { "gamma", "max_depth" } are not used.

Parameters: { "gamma", "max_depth" } are not used.

Parameters: { "gamma", "max_depth" } are not used.

Parameters: { "gamma", "max_depth" } are not used.

Parameters: { "gamma", "max_depth" } are not used.



{'n_jobs': 1, 'eval_metric': 'auc', 'use_label_encoder': False, 'n_estimators': 332, 'max_depth': 4, 'learning_rate': 2.4046824556586832e-08, 'gamma': 2.0284840337984725e-05, 'reg_alpha': 6.793936108916289e-08, 'reg_lambda': 6.098521793896174e-05, 'booster': 'gblinear'}
{'fit_time': array([56.32925582, 55.15828395, 57.06365919, 57.63547301, 53.86811614]), 'score_time': array([0.2145462 , 0.20518994, 0.17446709, 0.18674922, 0.15316677]), 'test_score': array([0.61225  , 0.617875 , 0.6120625, 0.6126875, 0.6136875])}
[I 2024-08-23 15:28:15,490] Trial 9 finished with value: 0.6137125 and parameters: {'classifier': 'XGBoost', 'xgb_n_estimators': 332, 'xgb_max_depth': 4, 'xgb_eta': 2.4046824556586832e-08, 'xgb_gamma': 2.0284840337984725e-05, 'xgb_alpha': 6.793936108916289e-08, 'xgb_lambda': 6.098521793896174e-05, 'xgb_booster': 'gblinear'}. Best is trial 4 with value: 0.6954874999999999.




{'n_estimators': 280, 'learning_rate': 0.07882704776406434}
{'fit_time': array([1874.5135088 , 3659.8232677 , 8245.83675694, 8797.45103288,
       8259.05357814]), 'score_time': array([0.89312696, 0.89453006, 0.93179107, 1.25625801, 1.26293492]), 'test_score': array([0.6385625, 0.6301875, 0.641    , 0.6321875, 0.6378125])}
[I 2024-08-24 00:02:17,630] Trial 10 finished with value: 0.63595 and parameters: {'classifier': 'AdaBoostClassifier', 'ada_n_estimators': 280, 'ada_learning_rate': 0.07882704776406434}. Best is trial 4 with value: 0.6954874999999999.
{'n_jobs': 1, 'eval_metric': 'auc', 'use_label_encoder': False, 'n_estimators': 35, 'max_depth': 2, 'learning_rate': 6.956897373651766e-08, 'gamma': 2.271917094552916e-05, 'reg_alpha': 0.22768514921956837, 'reg_lambda': 0.01008771180003035, 'booster': 'gbtree'}
{'fit_time': array([12.79390597, 13.69530296, 14.14514709, 14.05778384, 14.19418287]), 'score_time': array([0.19258595, 0.1188581 , 0.11148977, 0.18378091, 0.18687892]), 'test_sc



{'n_estimators': 214, 'learning_rate': 0.12432545191937902}
{'fit_time': array([4713.08826709, 4293.88226485, 5655.31165171, 7179.30366302,
       5452.6238389 ]), 'score_time': array([1.59819078, 2.49598622, 0.93064809, 1.9002409 , 1.087852  ]), 'test_score': array([0.644625 , 0.633375 , 0.6441875, 0.6359375, 0.644    ])}
[I 2024-08-24 07:38:30,504] Trial 12 finished with value: 0.6404250000000001 and parameters: {'classifier': 'AdaBoostClassifier', 'ada_n_estimators': 214, 'ada_learning_rate': 0.12432545191937902}. Best is trial 4 with value: 0.6954874999999999.
{'max_iter': 95, 'max_depth': 18, 'learning_rate': 0.802141173844583}
{'fit_time': array([21.82018828, 23.63505292, 20.97108984, 21.27544188, 19.03649378]), 'score_time': array([0.12526894, 0.11818719, 0.12004995, 0.17263103, 0.13189006]), 'test_score': array([0.7026875, 0.6990625, 0.694125 , 0.6975625, 0.7018125])}
[I 2024-08-24 07:40:18,516] Trial 13 finished with value: 0.69905 and parameters: {'classifier': 'HistGradientB