## 讀取檔案

In [81]:
import os
import pandas as pd

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

def modify_distribution(series, ratio = 0.5):
    series = series.apply(lambda x: x ** ratio)
    return series
    
train["Age"] = modify_distribution(train["Age"], 1/3)
test["Age"] = modify_distribution(test["Age"], 1/3)


In [82]:
from pandas.api.types import is_string_dtype, is_numeric_dtype

df = train.copy()
cols = [col for col in df if is_numeric_dtype(df[col]) and "id" not in col.lower() and col != "Exited"]

df = df[cols]
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
outliers = df[(df < (q1 - 1.5 * iqr)) | (df > (q3 + 1.5 * iqr))]

train = train[outliers.sum(axis = 1) == 0]

In [83]:
from sklearn.preprocessing import LabelEncoder

geography_encoder= LabelEncoder()
gender_encoder= LabelEncoder()

train["Gender"] = gender_encoder.fit_transform(train["Gender"])
train["Geography"] = geography_encoder.fit_transform(train["Geography"])

test["Gender"] = gender_encoder.transform(test["Gender"])
test["Geography"] = geography_encoder.transform(test["Geography"])

In [84]:
from sklearn.preprocessing import StandardScaler

standard_dic = {}

for col in ["CreditScore", "Age", "Balance", "EstimatedSalary"]:
    standard_dic[col] = StandardScaler()
    train[col] = standard_dic[col].fit_transform(train[[col]])
    test[col] = standard_dic[col].transform(test[[col]])
train

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,0.140615,0,1,-0.548150,3,-0.881434,2,1.0,0.0,1.367672,0
1,1,15749177,Okwudiliolisa,-0.375473,0,1,-0.548150,1,-0.881434,2,1.0,1.0,-1.258300,0
2,2,15694510,Hsueh,0.266490,0,1,0.390591,10,-0.881434,2,1.0,0.0,1.435671,0
3,3,15741417,Kao,-0.954499,0,1,-0.406390,2,1.489564,1,1.0,1.0,-0.560595,0
4,4,15766172,Chiemenam,0.744816,2,1,-0.548150,5,-0.881434,2,1.0,1.0,-1.943612,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165029,165029,15667085,Meng,0.128028,2,0,-0.548150,2,-0.881434,1,1.0,1.0,0.380240,0
165030,165030,15665521,Okechukwu,1.701467,0,1,-0.267382,3,-0.881434,1,0.0,0.0,0.380234,0
165031,165031,15664752,Hsia,-1.155899,0,1,-0.840504,5,-0.881434,1,1.0,1.0,0.292569,0
165032,165032,15689614,Hsiung,-1.294361,2,0,-0.991416,7,1.691026,1,0.0,1.0,-0.827038,0


In [85]:
train = train.drop(["id", "CustomerId", "Surname"], axis = 1)
test_id = test[["id"]]
test = test.drop(["id", "CustomerId", "Surname"], axis = 1)

train.head()
test.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,-0.891561,0,0,-2.155312,2,-0.881434,2,0.0,1.0,0.960218
1,0.329428,0,0,1.111364,2,-0.881434,1,1.0,0.0,-0.799648
2,-0.010435,0,0,-0.40639,7,-0.881434,2,1.0,0.0,0.520494
3,0.304253,0,1,-0.130998,8,-0.881434,1,1.0,0.0,0.023934
4,1.197966,1,1,0.134374,10,1.049725,1,1.0,0.0,0.531419


In [86]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score, roc_auc_score

def evaluate(y_test, y_pred):
    recall = recall_score(y_test, y_pred)
    prescision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"f1:{f1}, accuracy:{accuracy}")


In [87]:
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn import metrics
param_grid = {
  'n_estimators': [900, 750],
  'max_depth': [9],
  'learning_rate': [0.1]
}


X = train.drop(["Exited"], axis = 1)
y = train["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

xgb_have_sampling = make_pipeline(SMOTE(), 
                    TomekLinks(),
                    GridSearchCV(XGBClassifier(),
                                 param_grid=param_grid,
                                 scoring="f1",
                                 cv=3
                                ))

# xgb_have_sampling.fit(X_train, y_train)

In [88]:
param_grid = {
  'n_estimators': [900, 750],
  'max_depth': [9],
  'learning_rate': [0.1]
}


X = train.drop(["Exited"], axis = 1)
y = train["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

xgb_no_sampling = make_pipeline(
                    GridSearchCV(XGBClassifier(),
                                 param_grid=param_grid,
                                 scoring="f1",
                                 cv=3
                                ))

# xgb_no_sampling.fit(X_train, y_train)

In [89]:
# 設定超參數搜索空間
param_grid = {
    'n_estimators': [300, 500],
    'learning_rate': [0.1],
    'max_depth': [7, 9],
}


X = train.drop(["Exited"], axis = 1)
y = train["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

gbt_have_sampling = make_pipeline(SMOTE(), 
                    TomekLinks(),
                    GridSearchCV(GradientBoostingClassifier(),
                                 param_grid=param_grid,
                                 scoring="f1",
                                 cv=3
                                ))

# gbt_have_sampling.fit(X_train, y_train)

In [90]:
# 設定超參數搜索空間
param_grid = {
    'n_estimators': [300, 500],
    'learning_rate': [0.1],
    'max_depth': [7, 9],
}


X = train.drop(["Exited"], axis = 1)
y = train["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

gbt_no_sampling = make_pipeline(GridSearchCV(GradientBoostingClassifier(),
                                 param_grid=param_grid,
                                 scoring="f1",
                                 cv=3
                                ))

# gbt_no_sampling.fit(X_train, y_train)

In [91]:
from lightgbm import LGBMClassifier
# 設定超參數搜索空間
param_grid = {
    'n_estimators': [700, 500],
    'learning_rate': [0.1],
    'max_depth': [7, 9],
}


X = train.drop(["Exited"], axis = 1)
y = train["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

lgbm_have_sampling = make_pipeline(SMOTE(), 
                    TomekLinks(),
                    GridSearchCV(LGBMClassifier(),
                                 param_grid=param_grid,
                                 scoring="f1",
                                 cv=3
                                ))

# gbt_have_sampling.fit(X_train, y_train)

In [92]:

# 設定超參數搜索空間
param_grid = {
    'n_estimators': [700, 500],
    'learning_rate': [0.1],
    'max_depth': [7, 9],
}

X = train.drop(["Exited"], axis = 1)
y = train["Exited"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

lgbm_no_sampling = make_pipeline(GridSearchCV(LGBMClassifier(),
                                 param_grid=param_grid,
                                 scoring="f1",
                                 cv=3
                                ))


In [93]:
from sklearn.ensemble import VotingClassifier

# 建立基模型
base_models = [
    ('xgb_have_sampling', xgb_have_sampling),
    ('xgb_no_sampling', xgb_no_sampling),
    ('lgbm_have_sampling', lgbm_have_sampling),
    ('lgbm_no_sampling', lgbm_no_sampling),
]

# 建立 voting 模型
voting_clf = VotingClassifier(estimators=base_models, voting='soft', verbose=True)

# 訓練模型
voting_clf.fit(X_train, y_train)

# 預測結果
predictions = voting_clf.predict(X_test)

# 評估模型
print('準確率:', accuracy_score(y_test, predictions))

[Voting] ........ (1 of 4) Processing xgb_have_sampling, total=  30.5s
[Voting] .......... (2 of 4) Processing xgb_no_sampling, total=  16.9s
[LightGBM] [Info] Number of positive: 65675, number of negative: 67566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 133241, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492904 -> initscore=-0.028387
[LightGBM] [Info] Start training from score -0.028387
[LightGBM] [Info] Number of positive: 65674, number of negative: 67567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[L

[LightGBM] [Info] Number of positive: 65675, number of negative: 67566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 133241, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492904 -> initscore=-0.028387
[LightGBM] [Info] Start training from score -0.028387
[LightGBM] [Info] Number of positive: 65674, number of negative: 67567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 133241, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 17579, number of negative: 67567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 85146, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206457 -> initscore=-1.346415
[LightGBM] [Info] Start training from score -1.346415
[LightGBM] [Info] Number of positive: 17579, number of negative: 67566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000773 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 85145, number of used features: 10
[LightGBM] [Info] [b

In [94]:
# 評估模型
evaluate(y_test, predictions)

f1:0.6391202393466484, accuracy:0.8602254932665205


In [95]:
predictions = voting_clf.predict_proba(test)
ans = [i[1] for i in predictions]




In [96]:
test_id["Exited"] = ans
test_id.to_csv("submission_0323_bigger_model.csv", index=False)

In [97]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier

# 建立 stacking 模型
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=RandomForestClassifier(), verbose=1)
# 訓練模型
stacking_clf.fit(X_train, y_train)

# 預測結果
predictions = stacking_clf.predict(X_test)

# 評估模型
evaluate(y_test, predictions)

[LightGBM] [Info] Number of positive: 65649, number of negative: 67566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 133215, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492805 -> initscore=-0.028783
[LightGBM] [Info] Start training from score -0.028783
[LightGBM] [Info] Number of positive: 65648, number of negative: 67567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001144 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 133215, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 65649, number of negative: 67566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 133215, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492805 -> initscore=-0.028783
[LightGBM] [Info] Start training from score -0.028783
[LightGBM] [Info] Number of positive: 65648, number of negative: 67567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 133215, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 17579, number of negative: 67567
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 85146, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206457 -> initscore=-1.346415
[LightGBM] [Info] Start training from score -1.346415
[LightGBM] [Info] Number of positive: 17579, number of negative: 67566
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000835 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 85145, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 52534, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001027 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106587, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492874 -> initscore=-0.028504
[LightGBM] [Info] Start training from score -0.028504
[LightGBM] [Info] Number of positive: 52534, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000908 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 106588, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 52534, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000934 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106587, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492874 -> initscore=-0.028504
[LightGBM] [Info] Start training from score -0.028504
[LightGBM] [Info] Number of positive: 52534, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 106588, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 52551, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 106604, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492955 -> initscore=-0.028181
[LightGBM] [Info] Start training from score -0.028181
[LightGBM] [Info] Number of positive: 52552, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106605, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 52540, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000883 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106594, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492898 -> initscore=-0.028409
[LightGBM] [Info] Start training from score -0.028409
[LightGBM] [Info] Number of positive: 52541, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 106594, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 52540, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000930 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106594, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492898 -> initscore=-0.028409
[LightGBM] [Info] Start training from score -0.028409
[LightGBM] [Info] Number of positive: 78811, number of negative: 81080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1547
[LightGBM] [Info] Number of data points in the train set: 159891, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 52466, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000938 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106519, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492551 -> initscore=-0.029800
[LightGBM] [Info] Start training from score -0.029800
[LightGBM] [Info] Number of positive: 52465, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000986 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1550
[LightGBM] [Info] Number of data points in the train set: 106519, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 52499, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000992 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1549
[LightGBM] [Info] Number of data points in the train set: 106552, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492708 -> initscore=-0.029171
[LightGBM] [Info] Start training from score -0.029171
[LightGBM] [Info] Number of positive: 52499, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 106552, number of used features: 10
[LightGBM] [Info

[LightGBM] [Info] Number of positive: 78748, number of negative: 81080
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1548
[LightGBM] [Info] Number of data points in the train set: 159828, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.492705 -> initscore=-0.029183
[LightGBM] [Info] Start training from score -0.029183
[LightGBM] [Info] Number of positive: 14062, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] 

[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206457 -> initscore=-1.346418
[LightGBM] [Info] Start training from score -1.346418
[LightGBM] [Info] Number of positive: 14062, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 831
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206457 -> initscore=-1.346418
[LightGBM] [Info] Start training from score -1.346418
[LightGBM] [Info] Number of positive: 14062, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206457 -> initscore=-1.346418
[LightGBM] [Info] Start training from score -1.346418
[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000642 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206457 -> initscore=-1.346418
[LightGBM] [Info] Start training from score -1.346418
[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 14063, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68117, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206454 -> initscore=-1.346436
[LightGBM] [Info] Start training from score -1.346436
[LightGBM] [Info] Number of positive: 14063, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 829
[LightGBM] [Info] Number of data points in the train set: 68116, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 14064, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68117, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206468 -> initscore=-1.346347
[LightGBM] [Info] Start training from score -1.346347
[LightGBM] [Info] Number of positive: 14063, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000617 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68117, number of used features: 10
[LightGBM] [Info] [b

[LightGBM] [Info] Number of positive: 14064, number of negative: 54053
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68117, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.206468 -> initscore=-1.346347
[LightGBM] [Info] Start training from score -1.346347
[LightGBM] [Info] Number of positive: 14063, number of negative: 54054
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 830
[LightGBM] [Info] Number of data points in the train set: 68117, number of used features: 10
[LightGBM] [Info] [b

In [99]:
predictions = stacking_clf.predict_proba(test)
ans = [i[1] for i in predictions]
test_id["Exited"] = ans
test_id.to_csv("submission_0323_stacking.csv", index=False)



In [1]:
print("hello world")

hello world
