In [67]:
from pathlib import Path

import numpy as np
import pandas as pd

In [68]:
# Read the labelled and unlabelled splits, then stack them so that every
# preprocessing step below is applied to both in one pass.
train = pd.read_csv("../data/tabular_playground/train.csv")
test = pd.read_csv("../data/tabular_playground/test.csv")

# Rows of `train` come first, followed by rows of `test`; each keeps its own
# original index labels.
data = pd.concat((train, test), axis=0, sort=False)

In [69]:
# Preview the first rows of the combined train + test frame.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1.0,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0.0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0.0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0.0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1.0,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [70]:
# Every step assigns the result back to the column instead of calling an
# `inplace=True` method on `data[col]`: an in-place call on a column selection
# is chained assignment, which pandas warns about and which leaves `data`
# unchanged once Copy-on-Write is active.

# Encode Sex as an integer code (male -> 0, female -> 1), using the same
# `.map` approach as Embarked below.
data["Sex"] = data["Sex"].map({"male": 0, "female": 1})

# Missing ports are filled with "S" before encoding to integer codes.
data["Embarked"] = (
    data["Embarked"]
    .fillna("S")
    .map({"S": 0, "C": 1, "Q": 2})
    .astype(int)
)

# Missing fares are replaced by the mean fare over the combined frame.
data["Fare"] = data["Fare"].fillna(data["Fare"].mean())

In [71]:
# Missing ages are replaced by the median age over the combined frame.
# The result is assigned back rather than using `inplace=True` on the column
# selection, which is chained assignment and does not update `data` when
# pandas Copy-on-Write is active.
data["Age"] = data["Age"].fillna(data["Age"].median())

In [72]:
# Family size counts siblings/spouses, parents/children and the passenger.
data["FamilySize"] = 1 + data["SibSp"] + data["Parch"]

# IsAlone is 1 exactly when the passenger is the only family member aboard,
# otherwise 0.
data["IsAlone"] = data["FamilySize"].eq(1).astype("int64")

In [73]:
# Re-inspect the frame after encoding, imputation and the new family features.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,0,1.0,1,"Oconnor, Frankie",0,31.0,2,0,209245,27.14,C12239,0,3,0
1,1,0.0,3,"Bryan, Drew",0,31.0,0,0,27323,13.35,,0,1,1
2,2,0.0,3,"Owens, Kenneth",0,0.33,1,2,CA 457703,71.29,,0,4,0
3,3,0.0,3,"Kramer, James",0,19.0,0,0,A. 10866,13.04,,0,1,1
4,4,1.0,3,"Bond, Michael",0,25.0,0,0,427635,7.76,,0,1,1


In [74]:
# Columns that are not used as model inputs.
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data = data.drop(columns=delete_columns)

# The combined frame was built as [train rows, test rows], so a positional
# cut at the original train length recovers the two splits.
n_train = len(train)
train, test = data.iloc[:n_train], data.iloc[n_train:]

test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,,3,0,19.0,0,0,63.01,0,1,1
1,,3,1,53.0,0,0,5.81,0,1,1
2,,1,1,19.0,0,0,38.91,1,1,1
3,,2,0,25.0,0,0,12.93,0,1,1
4,,1,1,17.0,0,2,26.89,1,3,0


In [75]:
# Separate the target from the feature columns. `test` also carries a
# `Survived` column (all missing, see the preview above), so it is dropped
# there as well.
X_train = train.drop(columns=["Survived"])
y_train = train["Survived"]
X_test = test.drop(columns=["Survived"])

In [76]:
# Sanity-check the target series.
y_train.head()

0    1.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: Survived, dtype: float64

In [77]:
# Sanity-check the training feature matrix (target column removed).
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,1,0,31.0,2,0,27.14,0,3,0
1,3,0,31.0,0,0,13.35,0,1,1
2,3,0,0.33,1,2,71.29,0,4,0
3,3,0,19.0,0,0,13.04,0,1,1
4,3,0,25.0,0,0,7.76,0,1,1


In [78]:
# Sanity-check the test feature matrix; its columns should match X_train.
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,19.0,0,0,63.01,0,1,1
1,3,1,53.0,0,0,5.81,0,1,1
2,1,1,19.0,0,0,38.91,1,1,1
3,2,0,25.0,0,0,12.93,0,1,1
4,1,1,17.0,0,2,26.89,1,3,0


In [79]:
# LightGBM + StratifiedKFold
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [80]:
# 5-fold stratified splitter with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Out-of-fold predictions, one slot per training row.
oof_train = np.zeros(len(X_train))

# Per-fold fitted models and their test-set predictions.
models, y_preds = [], []

In [81]:
# Columns passed to LightGBM as categorical features.
# "IsAlone" is included here on top of the three encoded columns.
categorical_features = [
    "Embarked",
    "Pclass",
    "Sex",
    "IsAlone",
]


In [82]:
# Parameter dictionary handed to `lgb.train`.
params = dict(
    objective="binary",
    max_bin=300,
    learning_rate=0.05,
    num_leaves=40,
)

In [83]:
# Train one LightGBM model per stratified fold, collecting
#   * out-of-fold predictions for the training rows (`oof_train`),
#   * test-set predictions per fold (`y_preds`),
#   * the fitted boosters (`models`).
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    # `cv.split` yields positional row indices, so select with `.iloc`.
    # Label-based `.loc` / `Series[...]` only gives the same rows while the
    # index happens to be exactly 0..n-1.
    X_tr = X_train.iloc[train_index]
    X_val = X_train.iloc[valid_index]
    y_tr = y_train.iloc[train_index]
    y_val = y_train.iloc[valid_index]

    lgb_train = lgb.Dataset(
        X_tr, y_tr, categorical_feature=categorical_features
    )

    # The validation set is built with `reference=lgb_train`.
    lgb_eval = lgb.Dataset(
        X_val, y_val, reference=lgb_train, categorical_feature=categorical_features
    )

    # NOTE(review): newer LightGBM releases removed the `verbose_eval` and
    # `early_stopping_rounds` keyword arguments in favour of
    # `callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)]`;
    # confirm against the installed version before upgrading.
    model = lgb.train(
        params, lgb_train, valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10, num_boost_round=1000, early_stopping_rounds=10
    )

    # `oof_train` is a plain NumPy array, so positional indices are correct here.
    oof_train[valid_index] = model.predict(
        X_val, num_iteration=model.best_iteration
    )

    y_pred = model.predict(
        X_test, num_iteration=model.best_iteration
    )

    y_preds.append(y_pred)
    models.append(model)



[LightGBM] [Info] Number of positive: 34219, number of negative: 45781
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 496
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.427737 -> initscore=-0.291088
[LightGBM] [Info] Start training from score -0.291088
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.568323	valid_1's binary_logloss: 0.569099
[20]	training's binary_logloss: 0.52102	valid_1's binary_logloss: 0.522226
[30]	training's binary_logloss: 0.500019	valid_1's binary_logloss: 0.501748
[40]	training's binary_logloss: 0.490211	valid_1's binary_logloss: 0.492487
[50]	training's binary_logloss: 0.485277	valid_1's binary_logloss: 0.488182
[60]	training's binary_logloss: 0.48246	valid_1's binary_logloss: 0.486212
[70]	training's binary_logloss: 0

In [84]:
from sklearn.metrics import accuracy_score

# Turn the out-of-fold probabilities into hard 0/1 labels with a 0.5 cut-off,
# then score them against the training target.
oof_is_positive = oof_train > 0.5
y_pred_oof = oof_is_positive.astype(int)
accuracy_score(y_true=y_train, y_pred=y_pred_oof)

0.77222

In [85]:
# Average the per-fold test probabilities, then binarise at 0.5.
n_folds = len(y_preds)
y_prob_mean = sum(y_preds) / n_folds
y_sub = (y_prob_mean > 0.5).astype(int)

# Peek at the first ten predicted labels.
y_sub[:10]

array([0, 1, 1, 0, 1, 0, 1, 0, 1, 0])

In [86]:
# Load the sample submission file; its `Survived` column is overwritten with
# the model's predictions in a later cell.
sub = pd.read_csv('../data/tabular_playground/sample_submission.csv')

In [87]:
# Fill the submission frame with the averaged-fold predictions and write it out.
sub["Survived"] = y_sub

# Create the output directory first so the write does not fail on a fresh
# checkout where `./subs` does not exist yet.
submission_path = Path("./subs/lightgbm_skfold_isAlone_categorized.csv")
submission_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(submission_path, index=False)

In [88]:
# Confirm the submission frame now holds the predicted labels.
sub.head()

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1
