https://www.kaggle.com/davidcairuz/feature-engineering-lightgbm

## 1. PREPROCESSING

## 2. FEATURE ENGINEERING & SELECTION based on EDA

## 3. LightGBM 


#### Learn the concepts first

- Gradient Boosting [GBM](https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/)
- XGBOOST [XGB](https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/)
- LightGBM [LGBM](https://www.analyticsvidhya.com/blog/2017/06/which-algorithm-takes-the-crown-light-gbm-vs-xgboost/)

------

_______

## 1. Preprocessing

In [1]:
## 1.1 Import libraries

import gc
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook

sns.set()
%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
## 1.2 Reduce memory

def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * NumPy signed / unsigned integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the
      column's minimum and maximum (bounds are inclusive).
    * NumPy float columns are cast to the narrowest of float16 / float32 /
      float64 whose finite range contains the column's minimum and maximum.
    * Every other dtype (bool, datetime, category, pandas extension
      dtypes, ...) is left untouched, so calling the function again on its
      own output is safe.

    Columns that are empty, entirely NaN, or whose extremes fit no candidate
    type (e.g. they contain ``inf``) keep their current dtype.

    NOTE: the float range check looks only at magnitude, not precision.
    float16 keeps roughly three significant decimal digits, so values may be
    rounded when a column is downcast to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Side effects
    ------------
    Prints the memory usage after optimization and the percentage saved.
    """
    # Candidate types, narrowest first, paired with the numpy helper that
    # reports their representable range.
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': ((np.float16, np.float32, np.float64), np.finfo),
    }

    # df.memory_usage() gives bytes per column (plus the index);
    # dividing by 1024**2 converts bytes to megabytes.
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        # Strings / mixed Python objects
        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast; anything
        # else has no meaningful numeric range to test against.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # min()/max() are NaN for empty or all-NaN columns: nothing to fit.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        for candidate in candidates:
            limits = limits_of(candidate)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
## 1.3 Load data sets

%%time

dir_path = '/Users/jkim/main/kaggle/ieee_cis_fraud/'
print('Loading data...')

train_identity = pd.read_csv(f'{dir_path}train_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_identity!')

train_transaction = pd.read_csv(f'{dir_path}train_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded train_transaction!')

test_identity = pd.read_csv(f'{dir_path}test_identity.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_identity!')

test_transaction = pd.read_csv(f'{dir_path}test_transaction.csv', index_col='TransactionID')
print('\tSuccessfully loaded test_transaction!')

sub = pd.read_csv(f'{dir_path}sample_submission.csv')
print('\tSuccessfully loaded sample_submission!')

print('Data was successfully loaded!\n')

Loading data...
	Successfully loaded train_identity!
	Successfully loaded train_transaction!
	Successfully loaded test_identity!
	Successfully loaded test_transaction!
	Successfully loaded sample_submission!
Data was successfully loaded!

CPU times: user 11.7 s, sys: 2.79 s, total: 14.5 s
Wall time: 14.9 s


In [41]:
# Lift the column-count limit pandas applies when displaying a DataFrame.
pd.set_option('display.max_columns', None)
# Number of distinct id_31 values; not shown, because only the cell's last
# expression is displayed.
train_identity['id_31'].nunique()

# Split id_31 on single spaces, one column per token. Assigning four labels
# requires the split to produce exactly four columns.
df_id_31 = train_identity.id_31.str.split(' ', expand=True)
df_id_31.columns = ['one', 'two', 'three', 'four']

# Among rows whose first token contains 'mobile' (missing values count as
# "no match"), count the distinct second tokens.
df_id_31.loc[
    lambda frame: frame['one'].str.contains('mobile', na=False),
    'two',
].nunique()

1

In [14]:
## 1.4 preprocessing for identity data

## id_23 ':'
## id_30 'OS  Version'
## id_31 'Browser' 'Version'
## id_34 ':'
## id_35 'x' [e.g. 2220x1080]  -- NOTE(review): WIDTHxHEIGHT values may actually be in id_33, not id_35; confirm against the data

Index(['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_07', 'id_08',
       'id_09', 'id_10', 'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16',
       'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24',
       'id_25', 'id_26', 'id_27', 'id_28', 'id_29', 'id_30', 'id_31', 'id_32',
       'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType',
       'DeviceInfo'],
      dtype='object')