In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_wine
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [3]:
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [4]:
test

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


In [5]:
sample_submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
1235,TEST_1235,0
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,0


In [6]:
train.info()
print("\n ------------------------------------ \n")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB

 ------------------------------------ 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1240 non-null   object
 1   first_party   1240 non-null   object
 2   second_party  1240 non-null   object
 3   facts         1240 non-null   object
dtypes: object(4)
memory usage: 38.9+ KB


In [7]:
label_encoder = LabelEncoder()
label_encoder.fit(train['ID'])
train['ID'] = label_encoder.transform(train['ID'])
label_encoder.fit(train['first_party'])
train['first_party'] = label_encoder.transform(train['first_party'])
label_encoder.fit(train['second_party'])
train['second_party'] = label_encoder.transform(train['second_party'])
label_encoder.fit(train['facts'])
train['facts'] = label_encoder.transform(train['facts'])

label_encoder.fit(test['ID'])
test['ID'] = label_encoder.transform(test['ID'])
label_encoder.fit(test['first_party'])
test['first_party'] = label_encoder.transform(test['first_party'])
label_encoder.fit(test['second_party'])
test['second_party'] = label_encoder.transform(test['second_party'])
label_encoder.fit(test['facts'])
test['facts'] = label_encoder.transform(test['facts'])


train_x = train.drop('first_party_winner', axis=1)
train_y = train['first_party_winner']
test_x = test

In [8]:
train_x, train_y = load_wine(return_X_y=True, as_frame=True)

In [9]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.3, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(124, 13) (54, 13) (124,) (54,)


In [10]:
XGB = XGBClassifier(max_depth=10,
                    n_estimators=25,
                    grow_policy='depthwise',
                    n_jobs=-1,
                    random_state=42,
                    tree_method='auto',
                    use_label_encoder=False,
                    eval_metric='mlogloss'
                    )

In [11]:
XGB.fit(X_train, y_train)

In [16]:
# print(test_x)
# X_test = pd.get_dummies(data=test_x)
# print(X_test)
XGB_pred = XGB.predict(X_val)

In [17]:
accuracy = accuracy_score(y_val, XGB_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 94.44%


In [18]:
print(classification_report(y_val, XGB_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95        19
           1       0.95      0.90      0.93        21
           2       1.00      0.93      0.96        14

    accuracy                           0.94        54
   macro avg       0.95      0.94      0.95        54
weighted avg       0.95      0.94      0.94        54



In [19]:
def objective(trial):
    """Define the objective function"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train, y_train)

    # Make predictions
    XGB_pred = optuna_model.predict(X_val)

    # Evaluate predictions
    accuracy = accuracy_score(y_val, XGB_pred)
    return accuracy

In [20]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-06-06 17:54:24,230] A new study created in memory with name: no-name-1da71020-233e-4a45-a116-c4c38071ea51
[I 2023-06-06 17:54:24,287] Trial 0 finished with value: 0.9629629629629629 and parameters: {'max_depth': 4, 'learning_rate': 0.07957494721636177, 'n_estimators': 120, 'min_child_weight': 4, 'gamma': 9.146552122350105e-06, 'subsample': 0.3182889422710428, 'colsample_bytree': 0.4295624962971933, 'reg_alpha': 0.003393681088222139, 'reg_lambda': 2.5251401101476725e-07}. Best is trial 0 with value: 0.9629629629629629.
[I 2023-06-06 17:54:24,342] Trial 1 finished with value: 0.3888888888888889 and parameters: {'max_depth': 1, 'learning_rate': 0.8633018916111044, 'n_estimators': 269, 'min_child_weight': 4, 'gamma': 0.007722158150426904, 'subsample': 0.04321742224864196, 'colsample_bytree': 0.2710197677015753, 'reg_alpha': 0.8711304993510616, 'reg_lambda': 0.04911247688666042}. Best is trial 0 with value: 0.9629629629629629.
[I 2023-06-06 17:54:24,404] Trial 2 finished with value:

[I 2023-06-06 17:54:25,956] Trial 20 finished with value: 0.9629629629629629 and parameters: {'max_depth': 6, 'learning_rate': 0.6250889482384687, 'n_estimators': 398, 'min_child_weight': 3, 'gamma': 1.8911746702378054e-06, 'subsample': 0.9434722736134871, 'colsample_bytree': 0.9796844912759397, 'reg_alpha': 2.6841897372501214e-05, 'reg_lambda': 8.885424320794657e-08}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:26,096] Trial 21 finished with value: 1.0 and parameters: {'max_depth': 7, 'learning_rate': 0.4418912994007248, 'n_estimators': 499, 'min_child_weight': 2, 'gamma': 1.6761983036396429e-06, 'subsample': 0.9708539576621257, 'colsample_bytree': 0.47315605435614794, 'reg_alpha': 3.383205000368191e-05, 'reg_lambda': 0.0001394067569442042}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:26,234] Trial 22 finished with value: 1.0 and parameters: {'max_depth': 8, 'learning_rate': 0.29608287236077985, 'n_estimators': 474, 'min_child_weight': 1, 'gamma': 1.580610447894615

[I 2023-06-06 17:54:28,462] Trial 41 finished with value: 1.0 and parameters: {'max_depth': 8, 'learning_rate': 0.30131271493107514, 'n_estimators': 475, 'min_child_weight': 1, 'gamma': 1.5206569962739002e-07, 'subsample': 0.588813731720707, 'colsample_bytree': 0.5478708123367937, 'reg_alpha': 1.2348545536151285e-05, 'reg_lambda': 3.612665030727864e-05}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:28,583] Trial 42 finished with value: 1.0 and parameters: {'max_depth': 8, 'learning_rate': 0.35012070745259255, 'n_estimators': 457, 'min_child_weight': 1, 'gamma': 2.0332128159597607e-07, 'subsample': 0.7366033746716157, 'colsample_bytree': 0.7791923903324651, 'reg_alpha': 3.2889552538420585e-05, 'reg_lambda': 4.2652605502307654e-05}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:28,721] Trial 43 finished with value: 1.0 and parameters: {'max_depth': 8, 'learning_rate': 0.4526437771588359, 'n_estimators': 478, 'min_child_weight': 1, 'gamma': 4.767515615049954e-07, 'subsamp

[I 2023-06-06 17:54:31,232] Trial 62 finished with value: 0.9814814814814815 and parameters: {'max_depth': 7, 'learning_rate': 0.4116998492901609, 'n_estimators': 500, 'min_child_weight': 2, 'gamma': 5.851878324424621e-06, 'subsample': 0.7709150017612604, 'colsample_bytree': 0.7079601830776278, 'reg_alpha': 1.3344158523446172e-05, 'reg_lambda': 5.758877901326988e-05}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:31,364] Trial 63 finished with value: 1.0 and parameters: {'max_depth': 7, 'learning_rate': 0.33329588771650026, 'n_estimators': 445, 'min_child_weight': 1, 'gamma': 1.6825734435006601e-06, 'subsample': 0.5864154713629728, 'colsample_bytree': 0.6216257640742913, 'reg_alpha': 0.00010168719078864587, 'reg_lambda': 0.001703222780336515}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:31,506] Trial 64 finished with value: 0.9814814814814815 and parameters: {'max_depth': 6, 'learning_rate': 0.6079817693242958, 'n_estimators': 484, 'min_child_weight': 3, 'gamma': 6.37

[I 2023-06-06 17:54:33,816] Trial 83 finished with value: 0.9814814814814815 and parameters: {'max_depth': 9, 'learning_rate': 0.4301624779727093, 'n_estimators': 497, 'min_child_weight': 8, 'gamma': 4.619081044031891e-07, 'subsample': 0.7324309433797748, 'colsample_bytree': 0.465595601906634, 'reg_alpha': 8.065149480244897e-06, 'reg_lambda': 0.00013572429041102185}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:33,957] Trial 84 finished with value: 1.0 and parameters: {'max_depth': 8, 'learning_rate': 0.36228766149466507, 'n_estimators': 463, 'min_child_weight': 2, 'gamma': 1.0137817430702755e-06, 'subsample': 0.5263052092404696, 'colsample_bytree': 0.8615587489841697, 'reg_alpha': 0.00010986753683009211, 'reg_lambda': 1.2684942536404194e-05}. Best is trial 13 with value: 1.0.
[I 2023-06-06 17:54:34,098] Trial 85 finished with value: 1.0 and parameters: {'max_depth': 7, 'learning_rate': 0.20399409433526605, 'n_estimators': 481, 'min_child_weight': 2, 'gamma': 5.698615831275989

In [21]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


Number of finished trials: 100
Best trial:
  Value: 1.0
  Params: 
    max_depth: 7
    learning_rate: 0.3238039061156579
    n_estimators: 496
    min_child_weight: 2
    gamma: 1.1995985615385756e-05
    subsample: 0.2053578972599596
    colsample_bytree: 0.8322934444325596
    reg_alpha: 2.3500521450821262e-05
    reg_lambda: 1.123139439076734e-08


In [22]:
params = trial.params
XGB = XGBClassifier(**params)
XGB.fit(X_train, y_train)


In [23]:
XGB_pred = XGB.predict(X_val)


In [24]:
accuracy = accuracy_score(y_val, XGB_pred)
print("Accuracy after tuning: %.2f%%" % (accuracy * 100.0))

Accuracy after tuning: 100.00%


In [25]:
print(classification_report(y_val, XGB_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        14

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



In [26]:

XGB_submission = pd.read_csv('./sample_submission.csv')
XGB_pred = pd.DataFrame(XGB_pred)
XGB_submission['first_party_winner'] = XGB_pred
XGB_submission.to_csv("./XGB_tryout_submission.csv", index=False)