In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [24]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [25]:
train.dropna(inplace = True)

In [26]:
train['target'] = [1 if t == ">50K" else 0 for t in train['target']]

In [27]:
train.groupby('workclass').target.mean().sort_values()

workclass
Without-pay         0.000000
Private             0.219125
State-gov           0.278125
Self-emp-not-inc    0.295122
Local-gov           0.300876
Federal-gov         0.378205
Self-emp-inc        0.545290
Name: target, dtype: float64

In [28]:
def workclass_te(x) :
    if x == 'Without-pay' :
        return 0
    elif x == 'Private' :
        return 1
    elif x == 'State-gov' :
        return 2
    elif x == 'Self-emp-not-inc' :
        return 3
    elif x == 'Local-gov' :
        return 4
    elif x == 'Federal-gov' :
        return 5
    else :
        return 6

In [29]:
train['workclass'] = train['workclass'].apply(workclass_te)
test['workclass'] = test['workclass'].apply(workclass_te)

In [None]:
train['sex'] = [1 if s == 'Male' else 0 for s in train['sex']]
test['sex'] = [1 if s == 'Male' else 0 for s in test['sex']]

In [59]:
from sklearn.preprocessing import LabelEncoder

In [60]:
le = LabelEncoder()

In [61]:
train['native.country'] = le.fit_transform(train['native.country'])
test['native.country'] = le.transform(test['native.country'])

In [66]:
train['race'] = le.fit_transform(train['race'])
test['race'] = le.transform(test['race'])

In [74]:
o_tm = train.groupby('occupation').target.mean().sort_values().reset_index(name = 'o_tm')

In [75]:
train = pd.merge(train, o_tm, how = 'left')
test = pd.merge(test, o_tm, how = 'left')

In [77]:
train.drop(['education', 'occupation'], axis = 1, inplace = True)
test.drop(['education', 'occupation'], axis = 1, inplace = True)

In [90]:
pd.crosstab(train['marital.status'], train.relationship)

relationship,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
marital.status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Divorced,0,1133,59,152,742,0
Married-AF-spouse,6,0,1,1,0,4
Married-civ-spouse,6236,9,62,47,0,738
Married-spouse-absent,0,91,12,27,69,0
Never-married,0,2191,268,1956,399,0
Separated,0,196,20,45,193,0
Widowed,0,216,19,4,185,0


In [93]:
pd.crosstab(test['marital.status'], test.relationship)

relationship,Husband,Not-in-family,Other-relative,Own-child,Unmarried,Wife
marital.status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Divorced,0,1135,44,156,793,0
Married-AF-spouse,3,0,0,0,0,6
Married-civ-spouse,6218,5,56,36,0,658
Married-spouse-absent,0,90,14,16,51,0
Never-married,0,2257,280,1973,402,0
Separated,0,187,33,45,220,0
Widowed,0,216,21,8,158,0


In [97]:
train['mr'] = train['marital.status'] + train.relationship
test['mr'] = test['marital.status'] + test.relationship

In [98]:
train['mr'] = le.fit_transform(train['mr'])
test['mr'] = le.transform(test['mr'])

In [None]:
train.drop(['marital.status', 'relationship'], axis = 1, inplace = True)
test.drop(['marital.status', 'relationship'], axis = 1, inplace = True)

In [106]:
train['capital.gain'] = np.log1p(train['capital.gain'])
test['capital.gain'] = np.log1p(test['capital.gain'])

In [161]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [109]:
X = train.drop(['id', 'target'], axis = 1)
y = train.target

In [112]:
y.value_counts()

0    11308
1     3773
Name: target, dtype: int64

In [113]:
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [129]:
cat_features = ['race', 'native.country']

In [130]:
target = test[X.columns]

In [153]:
cb_pred = np.zeros(target.shape[0])
val_acc = []
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :
    
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    tr_data = Pool(data = tr_x, label = tr_y)#, cat_features = cat_features)
    val_data = Pool(data = val_x, label = val_y)#, cat_features = cat_features)
    
    cb = CatBoostClassifier(random_state = 42, iterations = 5000, max_depth = 4, learning_rate = 0.02,
                            eval_metric = 'Accuracy', use_best_model = True)#, cat_features = cat_features)
    
    cb.fit(tr_data, eval_set = val_data, early_stopping_rounds = 500, verbose = 0)
    
    val_pred = cb.predict_proba(val_x)[:, 1]
    val_cls = [1 if p >= 0.5 else 0 for p in val_pred]
    fold_acc = accuracy_score(val_y, val_cls)
    print(f"{i + 1} Fold ACC = {fold_acc}")
    val_acc.append(fold_acc)
    
    fold_pred = cb.predict_proba(target)[:, 1] / skf.n_splits
    cb_pred += fold_pred

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

1 Fold ACC = 0.8727634194831014
2 Fold ACC = 0.8594164456233422
3 Fold ACC = 0.8647214854111406
4 Fold ACC = 0.8620689655172413
5 Fold ACC = 0.866710875331565
6 Fold ACC = 0.863395225464191
7 Fold ACC = 0.8793103448275862
8 Fold ACC = 0.866710875331565
9 Fold ACC = 0.8759946949602122
10 Fold ACC = 0.886604774535809


In [154]:
np.mean(val_acc)

0.8697697106485756

In [150]:
lgbm_pred = np.zeros(target.shape[0])
val_acc = []
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :
    
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    lgbm = LGBMClassifier(random_state = 42, n_estimators = 5000, max_depth = 4,
                          learning_rate = 0.02, eval_metric = 'accuracy')
    
    lgbm.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)], eval_metric = 'accuracy',
             early_stopping_rounds = 500, verbose = 500)
    
    val_pred = lgbm.predict_proba(val_x)[:, 1]
    val_cls = [1 if p >= 0.5 else 0 for p in val_pred]
    fold_acc = accuracy_score(val_y, val_cls)
    val_acc.append(fold_acc)
    
    fold_pred = lgbm.predict_proba(target)[:, 1] / skf.n_splits
    lgbm_pred += fold_pred

Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.264652	valid_1's binary_logloss: 0.281921
[1000]	training's binary_logloss: 0.240383	valid_1's binary_logloss: 0.278713
Early stopping, best iteration is:
[903]	training's binary_logloss: 0.24406	valid_1's binary_logloss: 0.278475
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.262622	valid_1's binary_logloss: 0.303887
[1000]	training's binary_logloss: 0.237757	valid_1's binary_logloss: 0.30312
Early stopping, best iteration is:
[742]	training's binary_logloss: 0.249019	valid_1's binary_logloss: 0.301279
Training until validation scores don't improve for 500 rounds
[500]	training's binary_logloss: 0.259998	valid_1's binary_logloss: 0.308541
[1000]	training's binary_logloss: 0.234931	valid_1's binary_logloss: 0.309132
Early stopping, best iteration is:
[712]	training's binary_logloss: 0.247688	valid_1's binary_logloss: 0.307961
Training until 

In [151]:
np.mean(val_acc)

0.8665865109959165

In [166]:
xgb_pred = np.zeros(target.shape[0])
val_acc = []
for i, (tr_idx, val_idx) in enumerate(skf.split(X, y)) :
    
    tr_x, tr_y = X.iloc[tr_idx], y.iloc[tr_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    xgb = XGBClassifier(random_state = 42, n_estimators = 5000, max_depth = 4,
                          learning_rate = 0.02)
    
    xgb.fit(tr_x, tr_y, eval_set = [(tr_x, tr_y), (val_x, val_y)],
             early_stopping_rounds = 500, verbose = 0)
    
    val_pred = xgb.predict_proba(val_x)[:, 1]
    val_cls = [1 if p >= 0.5 else 0 for p in val_pred]
    fold_acc = accuracy_score(val_y, val_cls)
    val_acc.append(fold_acc)
    print(f"{i + 1} Fold ACC = {fold_acc}")
    
    fold_pred = xgb.predict_proba(target)[:, 1] / skf.n_splits
    xgb_pred += fold_pred

1 Fold ACC = 0.8754141815772034
2 Fold ACC = 0.8594164456233422
3 Fold ACC = 0.8541114058355438
4 Fold ACC = 0.8554376657824934
5 Fold ACC = 0.8720159151193634
6 Fold ACC = 0.8580901856763926
7 Fold ACC = 0.8759946949602122
8 Fold ACC = 0.8653846153846154
9 Fold ACC = 0.870026525198939
10 Fold ACC = 0.8793103448275862


In [168]:
np.mean(val_acc)

0.866520197998569

In [144]:
submission = pd.read_csv('sample_submission.csv')

In [169]:
submission['target'] = [1 if p >= 0.5 else 0 for p in (cb_pred + lgbm_pred + xgb_pred) / 3]

In [170]:
submission.target.value_counts()

0    11985
1     3096
Name: target, dtype: int64

In [171]:
submission.to_csv("cb_lgbm.csv", index = False)