In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_wine
from sklearn.preprocessing import LabelEncoder, StandardScaler
import optuna

import warnings
warnings.filterwarnings('ignore')

In [14]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [3]:
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


In [4]:
test

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."
...,...,...,...,...
1235,TEST_1235,"Haitian Centers Council, Inc., et al.","Chris Sale, Acting Commissioner, Immigration A...",According to Executive Order No. 12807 signed ...
1236,TEST_1236,Whitman,"American Trucking Associations, Inc.",Section 109(a) of the Clean Air Act (CAA) requ...
1237,TEST_1237,Linda A. Matteo and John J. Madigan,William G. Barr,Linda Matteo and John Madigan created a plan f...
1238,TEST_1238,Washington State Apple Advertising Commission,Hunt,"In 1972, the North Carolina Board of Agricultu..."


In [5]:
sample_submission

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0
...,...,...
1235,TEST_1235,0
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,0


In [15]:
train.info()
print("\n ------------------------------------ \n")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2478 entries, 0 to 2477
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  2478 non-null   object
 1   first_party         2478 non-null   object
 2   second_party        2478 non-null   object
 3   facts               2478 non-null   object
 4   first_party_winner  2478 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 96.9+ KB

 ------------------------------------ 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1240 entries, 0 to 1239
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1240 non-null   object
 1   first_party   1240 non-null   object
 2   second_party  1240 non-null   object
 3   facts         1240 non-null   object
dtypes: object(4)
memory usage: 38.9+ KB


In [16]:
label_encoder = LabelEncoder()
label_encoder.fit(train['ID'])
train['ID'] = label_encoder.transform(train['ID'])
label_encoder.fit(train['first_party'])
train['first_party'] = label_encoder.transform(train['first_party'])
label_encoder.fit(train['second_party'])
train['second_party'] = label_encoder.transform(train['second_party'])
label_encoder.fit(train['facts'])
train['facts'] = label_encoder.transform(train['facts'])

label_encoder.fit(test['ID'])
test['ID'] = label_encoder.transform(test['ID'])
label_encoder.fit(test['first_party'])
test['first_party'] = label_encoder.transform(test['first_party'])
label_encoder.fit(test['second_party'])
test['second_party'] = label_encoder.transform(test['second_party'])
label_encoder.fit(test['facts'])
test['facts'] = label_encoder.transform(test['facts'])


train_x = train.drop('first_party_winner', axis=1)
train_y = train['first_party_winner']
test_x = test

In [17]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.3, random_state=42)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(1734, 4) (744, 4) (1734,) (744,)


In [19]:
XGB = XGBClassifier(max_depth=10,
                    n_estimators=25,
                    grow_policy='depthwise',
                    n_jobs=-1,
                    random_state=42,
                    tree_method='auto',
                    use_label_encoder=False,
                    eval_metric='mlogloss'
                    )

In [20]:
XGB.fit(X_train, y_train)

In [21]:
print(test_x)
X_test = pd.get_dummies(data=test_x)
print(X_test)
XGB_pred = XGB.predict(X_test)

        ID  first_party  second_party  facts
0        0          847           969    478
1        1          660           567    345
2        2          710           356    250
3        3          423           969    139
4        4           97           420    234
...    ...          ...           ...    ...
1235  1235          417           171     25
1236  1236         1027            43    455
1237  1237          570          1036    346
1238  1238         1015           446    204
1239  1239          939          1031    379

[1240 rows x 4 columns]
        ID  first_party  second_party  facts
0        0          847           969    478
1        1          660           567    345
2        2          710           356    250
3        3          423           969    139
4        4           97           420    234
...    ...          ...           ...    ...
1235  1235          417           171     25
1236  1236         1027            43    455
1237  1237          570       

In [25]:
print(y_val)

1753    1
259     1
2072    1
1000    0
56      0
       ..
591     1
2415    1
1446    0
1839    1
1975    0
Name: first_party_winner, Length: 744, dtype: int64


In [26]:
accuracy = accuracy_score(y_val, XGB_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

ValueError: Found input variables with inconsistent numbers of samples: [744, 1240]

In [27]:
print(classification_report(y_val, XGB_pred))

ValueError: Found input variables with inconsistent numbers of samples: [744, 1240]

In [28]:
def objective(trial):
    """Define the objective function"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'mlogloss',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train, y_train)

    # Make predictions
    XGB_pred = optuna_model.predict(X_val)

    # Evaluate predictions
    accuracy = accuracy_score(y_val, XGB_pred)
    return accuracy

In [29]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-06-06 18:00:36,919] A new study created in memory with name: no-name-92aa09a7-2018-4e5e-89af-dd98c00a1c8c
[I 2023-06-06 18:00:36,964] Trial 0 finished with value: 0.6706989247311828 and parameters: {'max_depth': 2, 'learning_rate': 0.019232990192595115, 'n_estimators': 109, 'min_child_weight': 3, 'gamma': 3.065682237012287e-06, 'subsample': 0.3010628084548075, 'colsample_bytree': 0.010362803999082883, 'reg_alpha': 2.9268423405556826e-05, 'reg_lambda': 0.315668737415772}. Best is trial 0 with value: 0.6706989247311828.
[I 2023-06-06 18:00:36,988] Trial 1 finished with value: 0.5591397849462365 and parameters: {'max_depth': 8, 'learning_rate': 0.5561831452328493, 'n_estimators': 117, 'min_child_weight': 6, 'gamma': 6.076337954601188e-08, 'subsample': 0.026472928286010715, 'colsample_bytree': 0.031609207385723294, 'reg_alpha': 3.776247571865427e-08, 'reg_lambda': 0.01915688243398078}. Best is trial 0 with value: 0.6706989247311828.
[I 2023-06-06 18:00:37,040] Trial 2 finished with

[I 2023-06-06 18:00:39,361] Trial 20 finished with value: 0.5833333333333334 and parameters: {'max_depth': 9, 'learning_rate': 0.15502993178943159, 'n_estimators': 312, 'min_child_weight': 4, 'gamma': 0.0004098169920010513, 'subsample': 0.4188264781899202, 'colsample_bytree': 0.0160484341287176, 'reg_alpha': 0.0009184347312130795, 'reg_lambda': 1.671182477578929e-06}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:39,498] Trial 21 finished with value: 0.6706989247311828 and parameters: {'max_depth': 9, 'learning_rate': 0.01079192214766695, 'n_estimators': 220, 'min_child_weight': 5, 'gamma': 0.8753869376243679, 'subsample': 0.19746686864286606, 'colsample_bytree': 0.047215291790703356, 'reg_alpha': 9.848945567698997e-08, 'reg_lambda': 1.950248781584225e-07}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:39,618] Trial 22 finished with value: 0.6720430107526881 and parameters: {'max_depth': 9, 'learning_rate': 0.013360537054538511, 'n_estimators

[I 2023-06-06 18:00:41,474] Trial 40 finished with value: 0.6706989247311828 and parameters: {'max_depth': 5, 'learning_rate': 0.030212991755098444, 'n_estimators': 75, 'min_child_weight': 8, 'gamma': 0.00016234990007628157, 'subsample': 0.11985262295938058, 'colsample_bytree': 0.10863091902964844, 'reg_alpha': 6.71797507464513e-07, 'reg_lambda': 4.353305444504202e-06}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:41,546] Trial 41 finished with value: 0.6706989247311828 and parameters: {'max_depth': 4, 'learning_rate': 0.022222796326438587, 'n_estimators': 165, 'min_child_weight': 6, 'gamma': 0.004717186090826166, 'subsample': 0.16241247242428822, 'colsample_bytree': 0.07974206113236508, 'reg_alpha': 1.2218576826258914e-06, 'reg_lambda': 1.244240350207356e-06}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:41,629] Trial 42 finished with value: 0.6706989247311828 and parameters: {'max_depth': 6, 'learning_rate': 0.013425992927610488, 'n_estim

[I 2023-06-06 18:00:44,047] Trial 60 finished with value: 0.668010752688172 and parameters: {'max_depth': 4, 'learning_rate': 0.011900766383482083, 'n_estimators': 258, 'min_child_weight': 3, 'gamma': 0.0012226748184803289, 'subsample': 0.20355046365352325, 'colsample_bytree': 0.05379748955320259, 'reg_alpha': 2.0755069545411676e-05, 'reg_lambda': 1.5743952869235426e-06}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:44,179] Trial 61 finished with value: 0.6733870967741935 and parameters: {'max_depth': 9, 'learning_rate': 0.013442705638397834, 'n_estimators': 150, 'min_child_weight': 5, 'gamma': 0.16605124929887827, 'subsample': 0.24159388406911145, 'colsample_bytree': 0.08893062676733794, 'reg_alpha': 1.037340298071392e-06, 'reg_lambda': 2.1933839071228402e-07}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:44,269] Trial 62 finished with value: 0.6706989247311828 and parameters: {'max_depth': 9, 'learning_rate': 0.014320526063639153, 'n_esti

[I 2023-06-06 18:00:46,354] Trial 80 finished with value: 0.6706989247311828 and parameters: {'max_depth': 7, 'learning_rate': 0.012931128929189246, 'n_estimators': 172, 'min_child_weight': 5, 'gamma': 0.13851906670928263, 'subsample': 0.1697902134150824, 'colsample_bytree': 0.0345643445342152, 'reg_alpha': 9.917245370361685e-08, 'reg_lambda': 6.095457019664372e-07}. Best is trial 7 with value: 0.6733870967741935.
[I 2023-06-06 18:00:46,419] Trial 81 finished with value: 0.6747311827956989 and parameters: {'max_depth': 8, 'learning_rate': 0.027482111535622012, 'n_estimators': 63, 'min_child_weight': 2, 'gamma': 0.00645460043680281, 'subsample': 0.2166124537909851, 'colsample_bytree': 0.10237138139856099, 'reg_alpha': 1.843220980075756e-06, 'reg_lambda': 2.238158524702282e-08}. Best is trial 81 with value: 0.6747311827956989.
[I 2023-06-06 18:00:46,512] Trial 82 finished with value: 0.6653225806451613 and parameters: {'max_depth': 8, 'learning_rate': 0.028303928192696347, 'n_estimators'

In [30]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


Number of finished trials: 100
Best trial:
  Value: 0.6747311827956989
  Params: 
    max_depth: 8
    learning_rate: 0.027482111535622012
    n_estimators: 63
    min_child_weight: 2
    gamma: 0.00645460043680281
    subsample: 0.2166124537909851
    colsample_bytree: 0.10237138139856099
    reg_alpha: 1.843220980075756e-06
    reg_lambda: 2.238158524702282e-08


In [31]:
params = trial.params
XGB = XGBClassifier(**params)
XGB.fit(X_train, y_train)


In [32]:
XGB_pred = XGB.predict(X_val)


In [33]:
accuracy = accuracy_score(y_val, XGB_pred)
print("Accuracy after tuning: %.2f%%" % (accuracy * 100.0))

Accuracy after tuning: 67.47%


In [34]:
print(classification_report(y_val, XGB_pred))

              precision    recall  f1-score   support

           0       0.67      0.02      0.05       245
           1       0.67      0.99      0.80       499

    accuracy                           0.67       744
   macro avg       0.67      0.51      0.43       744
weighted avg       0.67      0.67      0.55       744



In [35]:

XGB_submission = pd.read_csv('./sample_submission.csv')
XGB_pred = pd.DataFrame(XGB_pred)
XGB_submission['first_party_winner'] = XGB_pred
XGB_submission.to_csv("./XGB_tryout_submission.csv", index=False)