In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_wine
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna

import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv('../train.csv')
test = pd.read_csv('../test.csv')
sample_submission = pd.read_csv('../sample_submission.csv')

In [4]:
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [5]:
test

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


In [6]:
sample_submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
1235,TEST_1235,0
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,0


In [7]:
train.info()
print("\n ------------------------------------ \n")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB

 ------------------------------------ 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1240 non-null   object
 1   first_party   1240 non-null   object
 2   second_party  1240 non-null   object
 3   facts         1240 non-null   object
dtypes: object(4)
memory usage: 38.9+ KB


In [8]:
label_encoder = LabelEncoder()
label_encoder.fit(train['ID'])
train['ID'] = label_encoder.transform(train['ID'])
label_encoder.fit(train['first_party'])
train['first_party'] = label_encoder.transform(train['first_party'])
label_encoder.fit(train['second_party'])
train['second_party'] = label_encoder.transform(train['second_party'])
label_encoder.fit(train['facts'])
train['facts'] = label_encoder.transform(train['facts'])

label_encoder.fit(test['ID'])
test['ID'] = label_encoder.transform(test['ID'])
label_encoder.fit(test['first_party'])
test['first_party'] = label_encoder.transform(test['first_party'])
label_encoder.fit(test['second_party'])
test['second_party'] = label_encoder.transform(test['second_party'])
label_encoder.fit(test['facts'])
test['facts'] = label_encoder.transform(test['facts'])


train_x = train.drop('first_party_winner', axis=1)
train_y = train['first_party_winner']
test_x = test

In [9]:
train_x, train_y = load_wine(return_X_y=True, as_frame=True)

In [10]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.3, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(124, 13) (54, 13) (124,) (54,)


In [11]:
XGB = XGBClassifier(max_depth=10,
                    n_estimators=25,
                    grow_policy='depthwise',
                    n_jobs=-1,
                    random_state=42,
                    tree_method='auto',
                    use_label_encoder=False,
                    eval_metric='mlogloss'
                    )

In [12]:
XGB.fit(X_train, y_train)

In [13]:
# print(test_x)
# X_test = pd.get_dummies(data=test_x)
# print(X_test)
XGB_pred = XGB.predict(X_val)

In [14]:
accuracy = accuracy_score(y_val, XGB_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 94.44%


In [15]:
print(classification_report(y_val, XGB_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95        19
           1       0.95      0.90      0.93        21
           2       1.00      0.93      0.96        14

    accuracy                           0.94        54
   macro avg       0.95      0.94      0.95        54
weighted avg       0.95      0.94      0.94        54



In [16]:
def objective(trial):
    """Define the objective function"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train, y_train)

    # Make predictions
    XGB_pred = optuna_model.predict(X_val)

    # Evaluate predictions
    accuracy = accuracy_score(y_val, XGB_pred)
    return accuracy

In [17]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-06-08 13:52:53,293] A new study created in memory with name: no-name-546f35d4-9f15-4e9d-9178-f270f05a4fc9
[I 2023-06-08 13:52:53,370] Trial 0 finished with value: 0.9814814814814815 and parameters: {'max_depth': 2, 'learning_rate': 0.9920467404274725, 'n_estimators': 158, 'min_child_weight': 5, 'gamma': 8.43220146133792e-06, 'subsample': 0.7604910820081897, 'colsample_bytree': 0.8996359493875281, 'reg_alpha': 7.77142142884651e-08, 'reg_lambda': 0.009478931375570373}. Best is trial 0 with value: 0.9814814814814815.
[I 2023-06-08 13:52:53,494] Trial 1 finished with value: 0.35185185185185186 and parameters: {'max_depth': 9, 'learning_rate': 0.017867027017862507, 'n_estimators': 363, 'min_child_weight': 6, 'gamma': 0.0001155650333675365, 'subsample': 0.010150024752852246, 'colsample_bytree': 0.3046314439460006, 'reg_alpha': 5.5961654080914395e-06, 'reg_lambda': 1.4353923636709694e-07}. Best is trial 0 with value: 0.9814814814814815.
[I 2023-06-08 13:52:53,608] Trial 2 finished wit

[I 2023-06-08 13:52:55,172] Trial 20 finished with value: 0.9629629629629629 and parameters: {'max_depth': 4, 'learning_rate': 0.9802228448073392, 'n_estimators': 131, 'min_child_weight': 3, 'gamma': 1.0022933716678235e-05, 'subsample': 0.6495281552819088, 'colsample_bytree': 0.3648805917596044, 'reg_alpha': 9.513201056644573e-06, 'reg_lambda': 0.0005184581906973949}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:52:55,227] Trial 21 finished with value: 0.9814814814814815 and parameters: {'max_depth': 1, 'learning_rate': 0.2442807358412428, 'n_estimators': 70, 'min_child_weight': 7, 'gamma': 0.6897515739894674, 'subsample': 0.9470879760387555, 'colsample_bytree': 0.9122597131459376, 'reg_alpha': 2.1268210947556396e-06, 'reg_lambda': 0.010270756055865561}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:52:55,301] Trial 22 finished with value: 0.9814814814814815 and parameters: {'max_depth': 2, 'learning_rate': 0.1952957930224125, 'n_estimators': 131, 'min_child_weight': 7, 'gam

[I 2023-06-08 13:52:56,883] Trial 41 finished with value: 0.9629629629629629 and parameters: {'max_depth': 1, 'learning_rate': 0.3969445929705682, 'n_estimators': 76, 'min_child_weight': 7, 'gamma': 0.008842305231711941, 'subsample': 0.7819513284621185, 'colsample_bytree': 0.6712948764185216, 'reg_alpha': 1.0414355136827573e-06, 'reg_lambda': 0.02823956319585156}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:52:56,951] Trial 42 finished with value: 0.9444444444444444 and parameters: {'max_depth': 2, 'learning_rate': 0.4550899461143358, 'n_estimators': 90, 'min_child_weight': 6, 'gamma': 0.0320307622676317, 'subsample': 0.9941424069695125, 'colsample_bytree': 0.8344198792440899, 'reg_alpha': 9.023650830766691e-07, 'reg_lambda': 0.07038833969233502}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:52:57,032] Trial 43 finished with value: 0.9629629629629629 and parameters: {'max_depth': 1, 'learning_rate': 0.1455795510759387, 'n_estimators': 149, 'min_child_weight': 5, 'gamma': 0

[I 2023-06-08 13:52:58,630] Trial 62 finished with value: 1.0 and parameters: {'max_depth': 1, 'learning_rate': 0.5823703209642029, 'n_estimators': 135, 'min_child_weight': 2, 'gamma': 1.7085341004723594e-05, 'subsample': 0.719636192872873, 'colsample_bytree': 0.5622547005004374, 'reg_alpha': 0.012078918566185936, 'reg_lambda': 0.987605581390358}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:52:58,711] Trial 63 finished with value: 1.0 and parameters: {'max_depth': 1, 'learning_rate': 0.3870291074870325, 'n_estimators': 144, 'min_child_weight': 2, 'gamma': 4.997987620579799e-05, 'subsample': 0.7234872516682546, 'colsample_bytree': 0.5627804190785601, 'reg_alpha': 1.892366207042116e-05, 'reg_lambda': 0.082776338240496}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:52:58,782] Trial 64 finished with value: 0.9814814814814815 and parameters: {'max_depth': 1, 'learning_rate': 0.5699555656993598, 'n_estimators': 135, 'min_child_weight': 1, 'gamma': 1.9618484068034545e-05, 'subsam

[I 2023-06-08 13:53:00,453] Trial 83 finished with value: 1.0 and parameters: {'max_depth': 1, 'learning_rate': 0.6438419636909531, 'n_estimators': 135, 'min_child_weight': 1, 'gamma': 5.868958498601309e-05, 'subsample': 0.7646977330274616, 'colsample_bytree': 0.5455404354183702, 'reg_alpha': 9.116186382776755e-05, 'reg_lambda': 0.1662828518706411}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:53:00,538] Trial 84 finished with value: 1.0 and parameters: {'max_depth': 2, 'learning_rate': 0.3613612764546025, 'n_estimators': 154, 'min_child_weight': 2, 'gamma': 0.00010824756742289346, 'subsample': 0.523366425162367, 'colsample_bytree': 0.432191533975653, 'reg_alpha': 0.0003039435260182245, 'reg_lambda': 0.10517494112112843}. Best is trial 13 with value: 1.0.
[I 2023-06-08 13:53:00,608] Trial 85 finished with value: 0.9814814814814815 and parameters: {'max_depth': 1, 'learning_rate': 0.4845759397119288, 'n_estimators': 123, 'min_child_weight': 2, 'gamma': 0.0015806912252926207, 'subs

In [20]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


Number of finished trials: 100
Best trial:
  Value: 1.0
  Params: 
    max_depth: 1
    learning_rate: 0.2885669025748655
    n_estimators: 137
    min_child_weight: 4
    gamma: 0.9150811874539638
    subsample: 0.5945202689350423
    colsample_bytree: 0.4947813111073132
    reg_alpha: 3.491363397516474e-07
    reg_lambda: 0.02715722949150186


In [21]:
params = trial.params
XGB = XGBClassifier(**params)
XGB.fit(X_train, y_train)


In [22]:
XGB_pred = XGB.predict(X_val)


In [23]:
accuracy = accuracy_score(y_val, XGB_pred)
print("Accuracy after tuning: %.2f%%" % (accuracy * 100.0))

Accuracy after tuning: 100.00%


In [24]:
print(classification_report(y_val, XGB_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        14

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



In [26]:

XGB_submission = pd.read_csv('../sample_submission.csv')
XGB_pred = pd.DataFrame(XGB_pred)
XGB_submission['first_party_winner'] = XGB_pred
XGB_submission.to_csv("./XGB_tryout_submission.csv", index=False)