## 1. PREPROCESSING

## 2. FEATURE ENGINEERING & SELECTION based on EDA

## 3. LightGBM 


#### Learn the concepts first

- Gradient Boosting [GBM](https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/)
- XGBOOST [XGB](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)
- LightGBM [LGBM](https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/)

## 1.1 import libraries

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import gc

# Plotting setup for the notebook.
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation and dtype warnings from pandas/numpy — confirm intended.
warnings.filterwarnings('ignore')

In [14]:
## 1.2 Reduce memory

def reduce_mem_usage(df):
    """Downcast the columns of a dataframe to reduce its memory usage.

    Each plain NumPy signed-integer, unsigned-integer or float column is
    converted to the narrowest dtype of the same kind whose range contains
    the column's minimum and maximum (bounds inclusive). ``object`` columns
    are converted to ``category``. Every other column (bool, datetime,
    existing category, pandas extension dtypes, ...) is left untouched, as
    is any numeric column for which no candidate fits (e.g. an empty or
    all-NaN column, or a float column containing ``inf``).

    Floats may be narrowed to float16/float32: the value range is preserved
    but precision is reduced.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, after the conversions.

    Side effects
    ------------
    Prints the memory usage after optimization (in MB) and the relative
    reduction in percent.
    """
    # df.memory_usage() returns the bytes used by the index and each column;
    # dividing by 1024**2 converts bytes to megabytes.
    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate dtypes per NumPy kind code, ordered narrowest first, paired
    # with the function that reports their representable range.
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': ((np.float16, np.float32, np.float64), np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Strings (object columns) become categoricals.
        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only native NumPy numeric dtypes are downcast; bool, datetime,
        # category and extension dtypes have no meaningful numeric range
        # to test against and are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still
            # fits. Comparisons with NaN are False, so empty/all-NaN
            # columns match no candidate and keep their dtype.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))

    # Guard against a zero baseline so the percentage never divides by zero.
    if start_mem:
        saved_pct = 100 * (start_mem - end_mem) / start_mem
    else:
        saved_pct = 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df