In [57]:
import numpy as np
import pandas as pd

In [58]:
train = pd.read_csv("../data/tabular_playground/train.csv")
test = pd.read_csv("../data/tabular_playground/test.csv")

data = pd.concat([train, test], sort=False)

In [59]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1.0,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0.0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0.0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0.0,3,"Kramer, James",male,19.00,0,0,A. 10866,13.04,,S
4,4,1.0,3,"Bond, Michael",male,25.00,0,0,427635,7.76,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,199995,,3,"Cash, Cheryle",female,27.00,0,0,7686,10.12,,Q
99996,199996,,1,"Brown, Howard",male,59.00,1,0,13004,68.31,,S
99997,199997,,3,"Lightfoot, Cameron",male,47.00,0,0,4383317,10.87,,S
99998,199998,,1,"Jacobsen, Margaret",female,49.00,1,2,PC 26988,29.68,B20828,C


In [60]:
data["Sex"].replace(["male", "female"], [0,1], inplace=True)
data["Embarked"].fillna(("S"), inplace=True)
data["Embarked"] = data["Embarked"].map(
    {"S":0, "C":1, "Q":2}
).astype(int)
data["Fare"].fillna(np.mean(
    data["Fare"]
), inplace=True)

In [61]:
data["Age"].fillna(data["Age"].median(), inplace=True)

In [62]:
data["FamilySize"] = data["Parch"] + data["SibSp"] + 1
data["IsAlone"] = 0
data.loc[data["FamilySize"] == 1, "IsAlone"] = 1

In [63]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,IsAlone
0,0,1.0,1,"Oconnor, Frankie",0,31.0,2,0,209245,27.14,C12239,0,3,0
1,1,0.0,3,"Bryan, Drew",0,31.0,0,0,27323,13.35,,0,1,1
2,2,0.0,3,"Owens, Kenneth",0,0.33,1,2,CA 457703,71.29,,0,4,0
3,3,0.0,3,"Kramer, James",0,19.0,0,0,A. 10866,13.04,,0,1,1
4,4,1.0,3,"Bond, Michael",0,25.0,0,0,427635,7.76,,0,1,1


In [64]:
delete_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

train = data[:len(train)]
test = data[len(train):]

test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,,3,0,19.0,0,0,63.01,0,1,1
1,,3,1,53.0,0,0,5.81,0,1,1
2,,1,1,19.0,0,0,38.91,1,1,1
3,,2,0,25.0,0,0,12.93,0,1,1
4,,1,1,17.0,0,2,26.89,1,3,0


In [65]:
y_train = train["Survived"]
X_train = train.drop("Survived", axis=1)
X_test = test.drop("Survived", axis=1)

In [66]:
y_train.head()

0    1.0
1    0.0
2    0.0
3    0.0
4    1.0
Name: Survived, dtype: float64

In [67]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,1,0,31.0,2,0,27.14,0,3,0
1,3,0,31.0,0,0,13.35,0,1,1
2,3,0,0.33,1,2,71.29,0,4,0
3,3,0,19.0,0,0,13.04,0,1,1
4,3,0,25.0,0,0,7.76,0,1,1


In [68]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,19.0,0,0,63.01,0,1,1
1,3,1,53.0,0,0,5.81,0,1,1
2,1,1,19.0,0,0,38.91,1,1,1
3,2,0,25.0,0,0,12.93,0,1,1
4,1,1,17.0,0,2,26.89,1,3,0


In [69]:
# lightLGM + skfold
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [70]:
y_preds = []
models = []
oof_train = np.zeros((len(X_train), ))
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [71]:
# categorical_features = ["Embarked", "Pclass", "Sex"]
categorical_features = ["Embarked", "Pclass", "Sex", "IsAlone"]


In [72]:
params = {
    "objective":"binary",
    "max_bin":300,
    "learning_rate":0.05,
    "num_leaves":40
}

In [73]:
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    X_tr = X_train.loc[train_index, :]
    X_val = X_train.loc[valid_index,:]
    y_tr = y_train[train_index]
    y_val = y_train[valid_index]

    lgb_train = lgb.Dataset(
        X_tr, y_tr, categorical_feature=categorical_features
    )

    lgb_eval = lgb.Dataset(
        X_val, y_val, reference=lgb_train, categorical_feature=categorical_features 
    )

    model = lgb.train(
        params, lgb_train, valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10, num_boost_round=1000, early_stopping_rounds=10
    )

    oof_train[valid_index] = model.predict(
        X_val, num_iteration=model.best_iteration
    )

    y_pred = model.predict(
        X_test, num_iteration=model.best_iteration
    )

    y_preds.append(y_pred)
    models.append(model)



This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 34219, number of negative: 45781
[LightGBM] [Info] Total Bins 492
[LightGBM] [Info] Number of data: 80000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.427737 -> initscore=-0.291088
[LightGBM] [Info] Start training from score -0.291088
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.568323	valid_1's binary_logloss: 0.569099
[20]	training's binary_logloss: 0.52102	valid_1's binary_logloss: 0.522226
[30]	training's binary_logloss: 0.500021	valid_1's binary_logloss: 0.501753
[40]	training's binary_logloss: 0.490233	valid_1's binary_logloss: 0.49251
[50]	training's binary_logloss: 0.485326	valid_1's binary_logloss: 0.488233
[60]	training's binary_logloss: 0.48253	valid_1's binary_logloss: 0.486196
[70]	trai

In [74]:
from sklearn.metrics import accuracy_score

y_pred_oof = (oof_train > 0.5).astype(int)
accuracy_score(y_train, y_pred_oof)

0.77224

In [75]:
y_sub = sum(y_preds) / len(y_preds)
y_sub = (y_sub > 0.5).astype(int)
y_sub[:10]

array([0, 1, 1, 0, 1, 0, 1, 0, 0, 0])

In [76]:
sub = pd.read_csv('../data/tabular_playground/sample_submission.csv')

In [77]:
sub["Survived"] = y_sub
sub.to_csv("./subs/lightgbm_skfold_isAlone_categorized.csv",index=False)

In [78]:
sub.head()

Unnamed: 0,PassengerId,Survived
0,100000,0
1,100001,1
2,100002,1
3,100003,0
4,100004,1


# Pseudolabellingを使ってみる
- 参考
    - https://www.kaggle.com/c/tabular-playground-series-apr-2021/discussion/231738

In [79]:
test.loc[:,"Survived"] = y_sub

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [80]:
test

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,0,3,0,19.0,0,0,63.01,0,1,1
1,1,3,1,53.0,0,0,5.81,0,1,1
2,1,1,1,19.0,0,0,38.91,1,1,1
3,0,2,0,25.0,0,0,12.93,0,1,1
4,1,1,1,17.0,0,2,26.89,1,3,0
...,...,...,...,...,...,...,...,...,...,...
99995,1,3,1,27.0,0,0,10.12,2,1,1
99996,0,1,0,59.0,1,0,68.31,0,2,0
99997,0,3,0,47.0,0,0,10.87,0,1,1
99998,1,1,1,49.0,1,2,29.68,1,4,0


In [106]:
pseudoLabeled_train = pd.concat([train, test], sort=False)
pseudoLabeled_train = pseudoLabeled_train.reset_index(drop=True)
pseudoLabeled_train

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,1.0,1,0,31.00,2,0,27.14,0,3,0
1,0.0,3,0,31.00,0,0,13.35,0,1,1
2,0.0,3,0,0.33,1,2,71.29,0,4,0
3,0.0,3,0,19.00,0,0,13.04,0,1,1
4,1.0,3,0,25.00,0,0,7.76,0,1,1
...,...,...,...,...,...,...,...,...,...,...
199995,1.0,3,1,27.00,0,0,10.12,2,1,1
199996,0.0,1,0,59.00,1,0,68.31,0,2,0
199997,0.0,3,0,47.00,0,0,10.87,0,1,1
199998,1.0,1,1,49.00,1,2,29.68,1,4,0


In [107]:
y_train = pseudoLabeled_train["Survived"]
X_train = pseudoLabeled_train.drop("Survived", axis=1)
X_test = test.drop("Survived", axis=1)

In [108]:
y_train

0         1.0
1         0.0
2         0.0
3         0.0
4         1.0
         ... 
199995    1.0
199996    0.0
199997    0.0
199998    1.0
199999    1.0
Name: Survived, Length: 200000, dtype: float64

In [109]:
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,3,0,19.0,0,0,63.01,0,1,1
1,3,1,53.0,0,0,5.81,0,1,1
2,1,1,19.0,0,0,38.91,1,1,1
3,2,0,25.0,0,0,12.93,0,1,1
4,1,1,17.0,0,2,26.89,1,3,0
...,...,...,...,...,...,...,...,...,...
99995,3,1,27.0,0,0,10.12,2,1,1
99996,1,0,59.0,1,0,68.31,0,2,0
99997,3,0,47.0,0,0,10.87,0,1,1
99998,1,1,49.0,1,2,29.68,1,4,0


In [110]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,IsAlone
0,1,0,31.00,2,0,27.14,0,3,0
1,3,0,31.00,0,0,13.35,0,1,1
2,3,0,0.33,1,2,71.29,0,4,0
3,3,0,19.00,0,0,13.04,0,1,1
4,3,0,25.00,0,0,7.76,0,1,1
...,...,...,...,...,...,...,...,...,...
199995,3,1,27.00,0,0,10.12,2,1,1
199996,1,0,59.00,1,0,68.31,0,2,0
199997,3,0,47.00,0,0,10.87,0,1,1
199998,1,1,49.00,1,2,29.68,1,4,0


In [111]:
y_preds = []
models = []
oof_train = np.zeros((len(X_train), ))
cv2 = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [112]:
for fold_id, (train_index, valid_index) in enumerate(cv2.split(X_train, y_train)):
    X_tr = X_train.loc[train_index, :]
    X_val = X_train.loc[valid_index,:]
    y_tr = y_train[train_index]
    y_val = y_train[valid_index]

    lgb_train = lgb.Dataset(
        X_tr, y_tr, categorical_feature=categorical_features
    )

    lgb_eval = lgb.Dataset(
        X_val, y_val, reference=lgb_train, categorical_feature=categorical_features 
    )

    model = lgb.train(
        params, lgb_train, valid_sets=[lgb_train, lgb_eval],
        verbose_eval=10, num_boost_round=1000, early_stopping_rounds=10
    )

    oof_train[valid_index] = model.predict(
        X_val, num_iteration=model.best_iteration
    )

    y_pred = model.predict(
        X_test, num_iteration=model.best_iteration
    )

    y_preds.append(y_pred)
    models.append(model)



This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 57526, number of negative: 102474
[LightGBM] [Info] Total Bins 504
[LightGBM] [Info] Number of data: 160000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.359538 -> initscore=-0.577372
[LightGBM] [Info] Start training from score -0.577372
Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.46962	valid_1's binary_logloss: 0.466627
[20]	training's binary_logloss: 0.392793	valid_1's binary_logloss: 0.387908
[30]	training's binary_logloss: 0.356447	valid_1's binary_logloss: 0.350261
[40]	training's binary_logloss: 0.338409	valid_1's binary_logloss: 0.331386
[50]	training's binary_logloss: 0.328955	valid_1's binary_logloss: 0.321429
[60]	training's binary_logloss: 0.323703	valid_1's binary_logloss: 0.316118
[70]	

In [113]:

y_pred_oof = (oof_train > 0.5).astype(int)
accuracy_score(y_train, y_pred_oof)

0.883005

In [114]:
y_sub = sum(y_preds) / len(y_preds)
y_sub = (y_sub > 0.5).astype(int)

sub["Survived"] = y_sub
sub.to_csv("./subs/lightgbm_skfold_pseudo_labeled.csv",index=False)