In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import lightgbm as lgb
import matplotlib.pyplot as plt

In [2]:
lgb.__version__

'2.3.2'

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
data = pd.concat([train, test], sort=False)

In [5]:
data['Sex'].replace(['male','female'], [0, 1], inplace=True)

data['Embarked'].fillna(('S'), inplace=True)
data['Embarked'] = data['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)

data['Fare'].fillna(np.mean(data['Fare']), inplace=True)

age_avg = data['Age'].mean()
age_std = data['Age'].std()

data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)

delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

In [6]:
train = data[:len(train)]
test = data[len(train):]


In [7]:
y_train = train['Survived']
X_train = train.drop('Survived', axis = 1)
X_test = test.drop('Survived', axis = 1)


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.3, random_state=0, stratify=y_train)

In [9]:
categorical_features = ['Embarked', 'Pclass', 'Sex']

In [10]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=categorical_features)
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train, categorical_feature=categorical_features)

params = {
    'objective': 'binary'
}

model = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_eval],
    verbose_eval=10,
    num_boost_round=1000,
    early_stopping_rounds=10
)

y_pred = model.predict(X_test, num_iteration=model.best_iteration)

Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.426569	valid_1's binary_logloss: 0.474937
[20]	training's binary_logloss: 0.355246	valid_1's binary_logloss: 0.450215
[30]	training's binary_logloss: 0.309672	valid_1's binary_logloss: 0.447568
[40]	training's binary_logloss: 0.272486	valid_1's binary_logloss: 0.453176
Early stopping, best iteration is:
[30]	training's binary_logloss: 0.309672	valid_1's binary_logloss: 0.447568


In [54]:
import gc
import lightgbm


class LGBM_Model():

    def __init__(
        self, params, train_param, train_data, valid_data,
        target_col, feature_cols, categorical_cols=None, train_weight=None, valid_weight=None
    ):
        self.target_col = target_col
        self.feature_cols = feature_cols
        self.categorical_cols = categorical_cols
               
        model = self.fit(params, train_param, train_data, valid_data)
    
    def _remove_bin_file(self, filepath):
        if os.path.exists(filepath):
            os.remove(filepath)

    def _convert_dataset(self, train_data, valid_data, save_binary=True):
        train_dataset = lgb.Dataset(
            train_data[self.feature_cols], train_data[self.target_col],
            feature_name=self.feature_cols, categorical_feature=self.categorical_cols
        )
        valid_dataset = lgb.Dataset(
            valid_data[self.feature_cols], valid_data[self.target_col], reference=train_dataset
        )
        
        if save_binary:
            # Define cache filepath.
            train_bin_path = 'tmp_train_set.bin'
            valid_bin_path = 'tmp_valid_set.bin'
            # Remove Cached File.
            self._remove_bin_file(train_bin_path)
            self._remove_bin_file(valid_bin_path)
            # Save Binary Cache.
            train_dataset.save_binary(train_bin_path)
            valid_dataset.save_binary(valid_bin_path)
            # Reload Binary Cache.
            train_dataset = lgb.Dataset(train_bin_path)
            valid_dataset = lgb.Dataset(valid_bin_path)

        return train_dataset, valid_dataset

    def fit(self, params, train_param, train_data, valid_data):
        train_dataset, valid_dataset = self._convert_dataset(train_data, valid_data)
        
        self.model = lgb.train(
            params, train_dataset, 
            valid_sets=[train_dataset, valid_dataset],
            **train_param
        )

    def predict(self, data):
        return self.model.predict(data, num_iteration=model.best_iteration)
           

    def get_importance(self):
        # Define Feature Importance DataFrame.
        imp_df = pd.DataFrame(
            [self.model.feature_importance()],
            columns=self.model.feature_name(),
            index=['Importance']
        ).T
        imp_df.sort_values(by='Importance', inplace=True)
        return imp_df

    def save_importance(self, filepath, max_num_features=50, figsize=(10, 8)):
        imp_df = self.get_importance()
        # Plot Importance DataFrame.
        plt.figure(figsize=figsize)
        imp_df[-max_num_features:].plot(
            kind='barh', title='Feature importance', figsize=figsize,
            y='Importance', align="center"
        )
        plt.savefig(filepath)
        plt.close('all')

    
params = {
    'objective': 'binary'
}
train_param = {
    'verbose_eval': 10,
    'num_boost_round': 100,
    'early_stopping_rounds': 10,
    # 'fobj': custom_fobj,
    # 'feval': custom_feval
}
target = 'Survived'
drop_cols = ['Survived']
features = [f for f in train.columns.tolist() if f not in drop_cols]
categoricals = ['Embarked', 'Pclass', 'Sex']

from sklearn.model_selection import train_test_split
train_data, valid_data  = train_test_split(train, test_size=0.3, random_state=42)

lgb_model = LGBM_Model(params, train_param, train_data, valid_data, target, features, categoricals)

from sklearn.metrics import roc_auc_score

pred = lgb_model.predict(valid_data[features])

print('\nAUC:', roc_auc_score(valid_data[target], pred))

Training until validation scores don't improve for 10 rounds
[10]	training's binary_logloss: 0.424818	valid_1's binary_logloss: 0.467657
[20]	training's binary_logloss: 0.352678	valid_1's binary_logloss: 0.438348
[30]	training's binary_logloss: 0.311784	valid_1's binary_logloss: 0.44407
Early stopping, best iteration is:
[20]	training's binary_logloss: 0.352678	valid_1's binary_logloss: 0.438348

AUC: 0.8729270671945831


In [55]:
lgb_model.save_importance(filepath='test.png')

In [15]:
lgb_model.predict(X_test[features])

array([0.10245555, 0.50346386, 0.16143407, 0.08835708, 0.3464338 ,
       0.16390117, 0.67245277, 0.22910887, 0.67569778, 0.09699357,
       0.07134228, 0.19136593, 0.89431227, 0.09346822, 0.90750857,
       0.8455184 , 0.17096313, 0.21431275, 0.36463652, 0.67245277,
       0.53808075, 0.29322006, 0.90352428, 0.4591375 , 0.84022806,
       0.05898227, 0.90306643, 0.21431275, 0.37593787, 0.27431665,
       0.09297708, 0.10652605, 0.51163464, 0.19935972, 0.49745931,
       0.19919661, 0.29795146, 0.29275027, 0.08740561, 0.33949013,
       0.17196534, 0.31840153, 0.104167  , 0.91795549, 0.90750857,
       0.13383814, 0.39512116, 0.19238485, 0.90750857, 0.60527686,
       0.18417261, 0.28129866, 0.85767095, 0.86173787, 0.28129866,
       0.21327232, 0.07607485, 0.15927382, 0.0853102 , 0.86173787,
       0.10487709, 0.23713838, 0.15208164, 0.67569778, 0.40420014,
       0.8455184 , 0.58842114, 0.1597121 , 0.36216734, 0.84022806,
       0.69425922, 0.0674152 , 0.34161526, 0.36216734, 0.86173