In [None]:
# Upgrade the core scientific / ML stack in the running environment.
# NOTE(review): no versions are pinned, so a re-run picks up whatever is newest at that time.
!pip install --upgrade numpy pandas matplotlib seaborn scipy tensorflow keras lightgbm nltk torch torchvision

In [None]:
# Upgrade the remaining auxiliary packages (also unpinned).
!pip install --upgrade scikit-learn pillow opencv-python node2vec graphviz gensim fastai django cython 

In [None]:
# Remove the installed lightgbm package directory, then fetch the LightGBM
# sources (including submodules) so the library can be rebuilt below.
# NOTE(review): the site-packages path hard-codes python3.6 - confirm it matches the kernel's interpreter.
!rm -r /opt/conda/lib/python3.6/site-packages/lightgbm
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
# Install the Boost development libraries non-interactively (-y) and quietly (-qq).
!apt-get install -y -qq libboost-all-dev

In [None]:
%%bash
# Configure and compile LightGBM with GPU (OpenCL) support inside LightGBM/build.
# Abort on the first failing step: without this a failed `cd` would let the
# following rm/mkdir/cmake run in whatever directory the cell started in.
set -euo pipefail
cd LightGBM
# -f / -p keep the cell re-runnable: a fresh clone has no build/ directory yet.
rm -rf build
mkdir -p build
cd build
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ ..
make -j"$(nproc)"

In [None]:
!cd LightGBM/python-package/;python3 setup.py install --precompile

In [None]:
# Write an OpenCL vendor (.icd) entry naming libnvidia-opencl.so.1, then
# delete the LightGBM source checkout, which is no longer needed after installation.
!mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
!rm -r LightGBM

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sn
sn.set()
from sklearn import preprocessing
import gc, datetime, random
import lightgbm
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.
# Allow pandas to display up to 4000 rows before truncating.
pd.options.display.max_rows = 4000

SEED = 42
START_DATE = datetime.datetime.strptime('2017-11-30', '%Y-%m-%d')


def seed_everything(seed=0):
    """Seed the random sources this notebook relies on.

    Seeds NumPy's global generator and Python's ``random`` module, and
    exports ``PYTHONHASHSEED`` (as a string) into the process environment.

    Args:
        seed: Integer seed applied to all three.
    """
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)


seed_everything(SEED)

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of a dataframe to reduce its memory footprint.

    * ``object`` columns are converted to ``category``.
    * NumPy integer columns (signed or unsigned) get the smallest signed
      integer type whose range contains the column's min and max (inclusive).
    * NumPy float columns get the smallest float type whose finite range
      contains the column's min and max, falling back to ``float64``.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched; previously bool and unsigned columns fell into the
      float branch and were silently turned into ``float16``.

    Args:
        df: Dataframe to shrink. It is not modified; a converted copy is returned.
        allow_float16: When True (the default, matching the historical
            behaviour) floats may be stored as ``float16``. Note that the
            choice is made on range only, so ``float16`` can round values;
            pass False to use ``float32`` as the smallest float type.

    Returns:
        A new dataframe with the converted dtypes.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    convert_dict = {}

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            convert_dict[col] = "category"
            continue

        # Only plain NumPy int/uint/float columns can be range-checked and downcast safely.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    convert_dict[col] = candidate
                    break
            # No candidate fits (e.g. uint64 above the int64 range): keep the dtype.
        else:
            # All-NaN or infinite columns fail every comparison and stay float64.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            convert_dict[col] = target

    df = df.astype(convert_dict)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
%%time
# Load each transaction table (indexed by TransactionID) and left-join the
# matching identity table onto it, so every transaction row is kept.
train = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/train_transaction.csv', index_col='TransactionID'
).merge(
    pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv', index_col='TransactionID'),
    how='left', left_index=True, right_index=True,
)
test = pd.read_csv(
    '/kaggle/input/ieee-fraud-detection/test_transaction.csv', index_col='TransactionID'
).merge(
    pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv', index_col='TransactionID'),
    how='left', left_index=True, right_index=True,
)

sample_submission = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv', index_col='TransactionID')

print("Train shape: ", train.shape)
print("Test shape: ", test.shape)

# Separate the target from the features (no NaN filling happens in this cell).
y_train = train['isFraud'].copy()
train = train.drop(columns='isFraud')

In [None]:
%%time
# Downcast both frames with reduce_mem_usage; object columns become
# `category` dtype from this point on.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
def addNewFeatures(data):
    """Add composite id columns and turn ``D9`` into a presence flag, in place.

    New columns (all strings, parts joined with ``_``):
        uid  : card1, card2
        uid2 : uid, card3, card5
        uid3 : uid2, addr1, addr2
        uid4 : addr1, addr2

    ``D9`` is overwritten with 0 where it was missing and 1 elsewhere.

    Args:
        data: Dataframe holding card1, card2, card3, card5, addr1, addr2 and D9.

    Returns:
        The same dataframe object, mutated.
    """
    def _text(column):
        # String form of a column; missing values become the text 'nan'.
        return data[column].astype(str)

    address = _text('addr1') + '_' + _text('addr2')

    data['uid'] = _text('card1') + '_' + _text('card2')
    data['uid2'] = _text('uid') + '_' + _text('card3') + '_' + _text('card5')
    data['uid3'] = _text('uid2') + '_' + address
    data['uid4'] = address

    data['D9'] = np.where(data['D9'].isna(), 0, 1)

    return data

# Add the uid* columns and the D9 presence flag to both frames.
train = addNewFeatures(train)
test = addNewFeatures(test)

In [None]:
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3', 'uid4']

# For every key column, attach the mean and std of TransactionAmt computed
# over train and test pooled together.
for col in i_cols:
    pooled = pd.concat([train[[col, 'TransactionAmt']], test[[col, 'TransactionAmt']]])
    for agg_type in ['mean', 'std']:
        new_col_name = col + '_TransactionAmt_' + agg_type
        # {key value -> aggregated TransactionAmt}
        lookup = pooled.groupby([col])['TransactionAmt'].agg(agg_type).to_dict()

        train[new_col_name] = train[col].map(lookup)
        test[new_col_name] = test[col].map(lookup)

# Cap positive infinities at 999.
train = train.replace(np.inf, 999)
test = test.replace(np.inf, 999)

In [None]:
# Ratio of a value to the mean / std of that value within a group, computed
# separately for train and test. Column naming: <value>_to_<stat>_<group>.
ratio_specs = [
    ('id_02', ['card1', 'card4']),
    ('D15', ['card1', 'card4']),
    ('D15', ['addr1', 'addr2']),
]

for value_col, group_cols in ratio_specs:
    for stat in ['mean', 'std']:
        for group_col in group_cols:
            feature = value_col + '_to_' + stat + '_' + group_col
            for frame in (train, test):
                frame[feature] = frame[value_col] / frame.groupby([group_col])[value_col].transform(stat)

# Divisions by a zero statistic produce +/-inf; treat those as missing.
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [None]:
# Replace TransactionAmt with log(1 + amount) in both frames.
for frame in (train, test):
    frame['TransactionAmt'] = np.log1p(frame['TransactionAmt'])

In [None]:
# Lookup table from a raw e-mail domain to a coarse provider bucket.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other',
          'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft',
          'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 
          'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other',
          'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo',
          'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo',
          'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo',
          'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo',
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other',
          'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple',
          'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other',
          'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
# Suffixes that are collapsed into the single label 'us' below.
us_emails = ['gmail', 'net', 'edu']

for c in ['P_emaildomain', 'R_emaildomain']:
    # <col>_bin: provider bucket looked up in `emails`.
    train[c + '_bin'] = train[c].map(emails)
    test[c + '_bin'] = test[c].map(emails)
    
    # <col>_suffix: text after the last '.' of the domain's string form.
    train[c + '_suffix'] = train[c].map(lambda x: str(x).split('.')[-1])
    test[c + '_suffix'] = test[c].map(lambda x: str(x).split('.')[-1])
    
    # Replace any suffix listed in `us_emails` with 'us'; others are kept as-is.
    train[c + '_suffix'] = train[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    test[c + '_suffix'] = test[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

In [None]:
p = 'P_emaildomain'
r = 'R_emaildomain'
uknown = 'email_not_provided'

def setDomain(df):
    """Fill missing e-mail domains and derive match / prefix features, in place.

    Missing values in the purchaser (``P_emaildomain``) and recipient
    (``R_emaildomain``) columns are replaced with the ``uknown`` placeholder.
    Adds:
        email_check          : 1 when both domains are equal and provided, else 0.
        P_emaildomain_prefix : text before the first '.' of the purchaser domain.
        R_emaildomain_prefix : text before the first '.' of the recipient domain.

    Args:
        df: Dataframe holding both domain columns (object or category dtype).

    Returns:
        The same dataframe object, mutated. Both domain columns end up with
        object dtype.
    """
    for col in (p, r):
        # Cast to object first: on a category column (which reduce_mem_usage
        # produces) fillna with a label outside the categories raises, and two
        # categoricals with different categories cannot be compared with ==.
        df[col] = df[col].astype(object).fillna(uknown)

    # Check if P_emaildomain matches R_emaildomain
    df['email_check'] = np.where((df[p] == df[r]) & (df[p] != uknown), 1, 0)

    df[p + '_prefix'] = df[p].apply(lambda x: x.split('.')[0])
    df[r + '_prefix'] = df[r].apply(lambda x: x.split('.')[0])

    return df
    
# Fill missing e-mail domains and add email_check / *_prefix to both frames.
train=setDomain(train)
test=setDomain(test)

In [None]:
def datetime_trans(train,start_date='2017-11-30'):
    """Derive calendar features from the ``TransactionDT`` offsets, in place.

    ``TransactionDT`` is treated as a number of seconds after ``start_date``.
    Missing offsets are first replaced by the column mean. Adds the columns
    ``date``, ``weekday``, ``month``, ``hour``, ``day``, ``year_weekday`` and
    ``weekday_hour``.

    Args:
        train: Dataframe with a ``TransactionDT`` column (train or test).
        start_date: Reference date as ``YYYY-MM-DD``.

    Returns:
        None; the dataframe is modified in place.
    """
    origin = datetime.datetime.strptime(start_date, "%Y-%m-%d")

    offsets = train['TransactionDT']
    train['TransactionDT'] = offsets.fillna(offsets.mean())
    train['date'] = train['TransactionDT'].apply(lambda secs: origin + datetime.timedelta(seconds=secs))

    stamps = train['date']
    # Not suitable for use on its own.
    train['weekday'] = stamps.apply(lambda ts: ts.weekday())
    # Months counted from January 2017 (= 1).
    train['month'] = (stamps.dt.year - 2017) * 12 + stamps.dt.month
    # Usable as a feature.
    train['hour'] = stamps.apply(lambda ts: ts.hour)
    # Day counter: 365 per year since 2017 plus the day of the year.
    train['day'] = (stamps.dt.year - 2017) * 365 + stamps.dt.dayofyear
    # Somewhat skewed, but fairly flat.
    train['year_weekday'] = stamps.apply(lambda ts: str(ts.year) + '_' + str(ts.weekday()))
    # Shows good variability.
    train['weekday_hour'] = stamps.apply(lambda ts: str(ts.weekday()) + '_' + str(ts.hour))
# Names of the derived calendar columns (everything datetime_trans adds except 'date').
date_col=['weekday','month','day','hour','year_weekday','weekday_hour']
# datetime_trans mutates its argument and returns nothing, so no reassignment.
datetime_trans(train)
datetime_trans(test)

In [None]:
# Collapse the rare tail of a categorical column into the single label
# 'others'. Missing values are excluded from the counts and left untouched.
def add_others_mark(train,test,categ_col,threshold=0.95):
    """Replace low-frequency values of ``categ_col`` with ``'others'``, in place.

    Values are ranked by their frequency over train and test pooled; every
    value at which the cumulative share exceeds ``threshold`` (including the
    value that crosses it) is replaced in both frames.

    Args:
        train: Training dataframe (mutated).
        test: Test dataframe (mutated).
        categ_col: Name of the column to process.
        threshold: Cumulative share above which values are collapsed.

    Returns:
        None. Prints a one-line summary when at least one value was replaced.
    """
    temp_df = pd.concat([train[[categ_col]], test[[categ_col]]])
    series = temp_df[categ_col].value_counts(normalize=True).cumsum()
    # A set gives O(1) membership tests; the previous list lookup was
    # O(number of rare values) for every row.
    others = set(series[series > threshold].index)
    if others:
        for frame in (train, test):
            frame[categ_col] = frame[categ_col].apply(lambda x: 'others' if x in others else x)
        print(f'{categ_col}:{len(others)} of {len(series)} feature values has been replaced to \'others\'')
# High-cardinality columns whose rare values are collapsed into 'others'.
mail_col=['P_emaildomain', 'R_emaildomain',
          'DeviceInfo',
          'id_30','id_33']       

for c in mail_col:
    add_others_mark(train,test,c)
# NOTE(review): mail_col[3] is 'id_30', which the loop above already processed;
# this second pass recomputes the shares with 'others' as a value and can
# collapse further values. Confirm it is intentional and not a leftover.
add_others_mark(train,test,mail_col[3])

In [None]:
# Start the flag at 0.0 for every row; setBrowser below switches matching rows to 1.
train["lastest_browser"] = np.zeros(train.shape[0])
test["lastest_browser"] = np.zeros(test.shape[0])

def setBrowser(df):
    """Set ``lastest_browser`` to 1 on rows whose ``id_31`` is a listed browser.

    Rows whose ``id_31`` is not in the list keep whatever value
    ``lastest_browser`` already had.

    Args:
        df: Dataframe with an ``id_31`` column.

    Returns:
        The same dataframe object, mutated.
    """
    flagged_browsers = [
        "samsung browser 7.0",
        "opera 53.0",
        "mobile safari 10.0",
        "google search application 49.0",
        "firefox 60.0",
        "edge 17.0",
        "chrome 69.0",
        "chrome 67.0 for android",
        "chrome 63.0 for android",
        "chrome 63.0 for ios",
        "chrome 64.0",
        "chrome 64.0 for android",
        "chrome 64.0 for ios",
        "chrome 65.0",
        "chrome 65.0 for android",
        "chrome 65.0 for ios",
        "chrome 66.0",
        "chrome 66.0 for android",
        "chrome 66.0 for ios",
    ]
    df.loc[df["id_31"].isin(flagged_browsers), 'lastest_browser'] = 1
    return df

# Flag the listed browser versions in both frames.
train=setBrowser(train)
test=setBrowser(test)

In [None]:
def setDevice(df, min_count=200):
    """Normalise ``DeviceInfo`` and derive a coarse ``device_name``, in place.

    * Missing ``DeviceInfo`` becomes ``'unknown_device'``; the column is then
      lower-cased.
    * ``device_name`` is the part of the original-case ``DeviceInfo`` before
      the first ``/``, rewritten to a brand label by the substring rules
      below. The rules are applied in order to the current name, so a name
      already rewritten to a brand is tested as that brand.
    * Names occurring fewer than ``min_count`` times in ``df`` become
      ``'Others'``.
    * ``had_id`` is set to 1 on every row.

    The brand rules are case-sensitive ('Moto' vs 'moto', 'Blade' vs 'BLADE'),
    so they must see the original casing. Matching against the lower-cased
    text, as was done before, made every rule containing an upper-case letter
    unreachable. As a consequence, names that match no rule now keep their
    original casing in ``device_name``.

    Args:
        df: Dataframe with a ``DeviceInfo`` column (object or category dtype).
        min_count: Minimum number of rows a device name needs to keep its label.

    Returns:
        The same dataframe object, mutated.
    """
    brand_rules = [
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    ]

    # astype(object): fillna with a new label raises on a category column.
    raw_info = df['DeviceInfo'].astype(object).fillna('unknown_device')
    df['DeviceInfo'] = raw_info.str.lower()

    # .str[0] (rather than expand=True)[0]) also works on an empty frame.
    df['device_name'] = raw_info.str.split('/').str[0]

    for pattern, brand in brand_rules:
        df.loc[df['device_name'].str.contains(pattern, na=False), 'device_name'] = brand

    name_counts = df['device_name'].value_counts()
    rare_names = name_counts[name_counts < min_count].index
    df.loc[df['device_name'].isin(rare_names), 'device_name'] = "Others"

    df['had_id'] = 1
    gc.collect()

    return df

# Normalise DeviceInfo and add device_name / had_id to both frames.
train=setDevice(train)
test=setDevice(test)

In [None]:
i_cols = ['card1','card2','card3','card5',
          'C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14',
          'D1','D2','D3','D4','D5','D6','D7','D8',
          'addr1','addr2',
          'dist1','dist2',
          'P_emaildomain', 'R_emaildomain',
          'DeviceInfo','device_name',
          'id_30','id_33',
          'uid','uid2','uid3','uid4'
         ]+date_col

# Frequency encoding: <col>_fq_enc is the number of rows (train and test
# pooled, missing values counted too) that share the row's value of <col>.
for col in i_cols:
    pooled_values = pd.concat([train[[col]], test[[col]]])[col]
    fq_encode = pooled_values.value_counts(dropna=False).to_dict()
    for frame in (train, test):
        frame[col + '_fq_enc'] = frame[col].map(fq_encode)

def set_freq_col(train,test,col):
    """Add ``<col>_fq``: how often each value of ``col`` occurs in train and test pooled.

    Missing values are counted as a value of their own. Both frames are
    modified in place; nothing is returned.

    Args:
        train: Training dataframe (mutated).
        test: Test dataframe (mutated).
        col: Name of the column to count.
    """
    target = col + '_fq'
    counts = pd.concat([train[[col]], test[[col]]])[col].value_counts(dropna=False)
    for frame in (train, test):
        frame[target] = frame[col].map(counts)
    
# Add a <col>_fq count column for every column in i_cols.
# NOTE(review): these appear to hold the same pooled counts as the <col>_fq_enc
# columns built by the loop earlier in this cell (both map
# value_counts(dropna=False) of train+test) - confirm the duplication is
# wanted. The <period>_fq columns are read by set_uid_period below.
for c in i_cols:
    set_freq_col(train,test,c)

# Period columns and id columns that set_uid_period combines pairwise.
periods = ['month','year_weekday','weekday_hour']
uids = ['uid','uid2','uid3','uid4']
def set_uid_period(train,test,periods,uids):
    """Add ``<uid>_<period>``: share of a period's rows that belong to the uid.

    For every (period, uid) pair, the number of pooled train+test rows with
    that exact uid/period combination is divided by the row's
    ``<period>_fq`` value. Both frames are modified in place.

    Args:
        train: Training dataframe (mutated); needs the uid, period and
            ``<period>_fq`` columns.
        test: Test dataframe (mutated); same requirements.
        periods: Names of the period columns.
        uids: Names of the id columns.
    """
    def _pair_key(frame, uid_col, period_col):
        # "<uid>_<period>" text key for every row of `frame`.
        return frame[uid_col].astype(str) + '_' + frame[period_col].astype(str)

    for period in periods:
        period_total = period + '_fq'
        for col in uids:
            new_column = col + '_' + period

            # Count combinations on the pooled frame, exactly once per pair.
            pooled = pd.concat([train[[col, period]], test[[col, period]]])
            pooled[new_column] = _pair_key(pooled, col, period)
            pair_counts = pooled[new_column].value_counts()

            for frame in (train, test):
                frame[new_column] = _pair_key(frame, col, period).map(pair_counts)
                frame[new_column] = frame[new_column] / frame[period_total]
            
# Add the <uid>_<period> share columns to both frames (in place).
set_uid_period(train,test,periods,uids)

In [None]:
def get_too_many_null_attr(data):
    """Return the columns of ``data`` in which more than 85% of the rows are null."""
    many_null_cols = [col for col in data.columns if data[col].isnull().sum() / data.shape[0] > 0.85]
    return many_null_cols

def get_too_many_repeated_val(data):
    """Return the columns of ``data`` dominated by a single value.

    A column is listed when its most frequent value (missing values count as
    a value) covers more than 85% of the rows. Columns without any rows are
    never listed.
    """
    big_top_value_cols = []
    # Inspect the frame that was passed in; this previously read the global
    # `train` and ignored the argument.
    for col in data.columns:
        shares = data[col].value_counts(dropna=False, normalize=True)
        # Guard the empty case: there is no top value to index.
        if len(shares) > 0 and shares.values[0] > 0.85:
            big_top_value_cols.append(col)
    return big_top_value_cols

def get_useless_columns(data):
    """Return the de-duplicated union of mostly-null and single-value columns.

    Prints how many columns each of the two criteria selected. The order of
    the returned list is not defined (it is built from a set).
    """
    too_many_null = get_too_many_null_attr(data)
    print("More than 85% null: " + str(len(too_many_null)))
    too_many_repeated = get_too_many_repeated_val(data)
    print("More than 85% repeated value: " + str(len(too_many_repeated)))
    cols_to_drop = list(set(too_many_null + too_many_repeated))
    return cols_to_drop

In [None]:
# Select the columns to discard; the decision is based on `train` only.
cols_to_drop = get_useless_columns(train)

In [None]:
# Drop the same columns from both frames so their feature sets stay aligned.
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [None]:
# Sanity check of the shapes after dropping.
print(train.shape)
print(test.shape)
print(y_train.shape)

In [None]:
# Split column names by dtype: object columns vs. everything else.
numerical_cols = train.select_dtypes(exclude = 'object').columns
categorical_cols = train.select_dtypes(include = 'object').columns

In [None]:
# Number of object-dtype columns.
print(len(categorical_cols))

In [None]:
# Peek at the first few object-dtype column names.
categorical_cols[:5]

In [None]:
%%time
# Label Encoding
# Encode every text-like column with integer codes fitted on train and test
# pooled, so the same value receives the same code in both frames.
#  - Values are cast to str first: a column mixing NaN (a float) with strings
#    cannot be sorted by LabelEncoder; missing values become the label 'nan'.
#  - `category` columns (produced by reduce_mem_usage) are encoded as well;
#    left as-is they would later break `fillna(-999)`.
text_like = ('object', 'category')
for f in train.columns:
    if train[f].dtype.name in text_like or test[f].dtype.name in text_like:
        train_values = train[f].astype(str)
        test_values = test[f].astype(str)
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_values.values) + list(test_values.values))
        train[f] = lbl.transform(list(train_values.values))
        test[f] = lbl.transform(list(test_values.values))

In [None]:
# Remove the helper 'date' column created by datetime_trans.
train.drop(columns=["date"], inplace=True)
test.drop(columns=["date"], inplace=True)

In [None]:
# Replace every remaining missing value with the sentinel -999.
train = train.fillna(-999)
test = test.fillna(-999)

In [None]:
# Largest per-column null count; both should print 0 after the fill above.
print(train.isnull().sum().max())
print(test.isnull().sum().max())

In [None]:
# Preview the first rows of the prepared training frame.
train.head()

In [None]:
# Drop the raw TransactionDT offset column from both frames.
train.drop(['TransactionDT'],axis = 1, inplace = True)
test.drop(['TransactionDT'], axis = 1, inplace = True)

In [None]:
# Column dtypes and memory summary.
train.info()

In [None]:
# Check whether the target column is still among the features (it was dropped when loading).
print("isFraud" in train.columns)

In [None]:
# Number of majority-class chunks, and therefore of models, in the ensemble below.
NUM_SPLIT = 10

In [None]:
def split_n_set(n, df_x, df_y):
    """Build ``n`` training subsets that each contain all minority-class rows.

    Rows whose label equals 1 form the minority set. The remaining rows are
    cut, in their existing order (no shuffling), into ``n`` consecutive
    chunks whose sizes follow ``np.array_split``. Every returned subset is
    one chunk followed by the full minority set.

    Args:
        n: Number of subsets to create.
        df_x: Feature dataframe.
        df_y: Label series aligned with ``df_x``.

    Returns:
        ``(subsets_x, subsets_y)``: two lists of length ``n`` holding the
        feature frames and the matching label series.
    """
    minor_class = 1

    is_minor = df_y == minor_class
    minor_x = df_x.loc[is_minor]
    minor_y = df_y.loc[is_minor]
    major_x = df_x.loc[~is_minor]
    major_y = df_y.loc[~is_minor]

    chunk_sizes = [len(chunk) for chunk in np.array_split(range(len(major_x.index)), n)]

    major_split_x = []
    major_split_y = []
    start = 0
    for size in chunk_sizes:
        stop = start + size
        # pd.concat replaces DataFrame.append / Series.append, which were
        # removed in pandas 2.0; iloc makes the positional slicing explicit.
        major_split_x.append(pd.concat([major_x.iloc[start:stop], minor_x]))
        major_split_y.append(pd.concat([major_y.iloc[start:stop], minor_y]))
        start = stop
    return major_split_x, major_split_y

In [None]:
# Build NUM_SPLIT subsets: one majority-class chunk each, plus all fraud rows.
train_sub, y_train_sub = split_n_set(NUM_SPLIT, train, y_train)

In [None]:
# Free the full training data; only the subsets are used from here on.
# NOTE(review): a later cell calls train_test_split(train, y_train, ...) and
# cannot run after this deletion.
del train, y_train
gc.collect()

In [None]:
%%time
# Hyper-parameters for the per-subset models: shallow trees (max_depth 2,
# 3 leaves), a small learning rate, AUC as the validation metric, GPU training.
# NOTE(review): bagging_fraction is set but no bagging_freq is given; per the
# LightGBM parameter docs bagging is only active when bagging_freq > 0 - confirm.
params = {'num_leaves': 3,
          'min_child_weight': 0.03,
          'feature_fraction': 0.072,
          'bagging_fraction': 0.072,
          'min_data_in_leaf': 179,
          'objective': 'binary',
          'max_depth': 2,
          'learning_rate': 0.006,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.33*2,
          'reg_lambda': 0.39*2,
          'random_state': 42,
          'is_unbalance': True,
          'device': 'gpu',
          'gpu_platform_id': 0,
          'gpu_device_id': 0
}
# One booster per subset. Each subset gets its own stratified 80/20
# train/validation split, and the validation part drives early stopping.
# NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments
# depend on the installed LightGBM version, and the install cells are
# unpinned - confirm they are still accepted.
models = []
for i in range(NUM_SPLIT):
    print("training model " + str(i+1))
    X_tr, X_val, y_tr, y_val = train_test_split(train_sub[i], y_train_sub[i], test_size = 0.2, stratify = y_train_sub[i], random_state = 42)
    models.append(lightgbm.train(params,
                       lightgbm.Dataset(X_tr, label=y_tr),
                       valid_sets=lightgbm.Dataset(X_val, label=y_val),
                       num_boost_round=10000,
                       early_stopping_rounds=300,
                       verbose_eval = 1000,
                       )
                 )
    
# The subsets are no longer needed once every model is trained.
del train_sub, y_train_sub
gc.collect()

In [None]:
%%time
# Sum the test predictions of all models. Each model is removed from the
# list as soon as it has been used so its memory can be reclaimed; `models`
# is therefore empty afterwards and this cell cannot be re-run on its own.
y_pred = np.zeros(len(test.index))
for i in range(NUM_SPLIT):
    print(i+1)
    y_pred += models.pop(0).predict(test)
    gc.collect()


In [None]:
# Turn the summed predictions into the ensemble mean.
y_pred = y_pred / NUM_SPLIT

In [None]:
# Write the averaged probabilities to submission.csv.
# presumably the rows of `test` are in the same order as sample_submission.csv,
# since y_pred is assigned positionally - verify.
submission = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')
submission['isFraud'] = y_pred
# head() is not the last expression of the cell, so nothing is displayed here.
submission.head()
submission.to_csv('submission.csv', index=False)

In [None]:
from IPython.display import HTML
def create_download_link(title = "Download CSV file", filename = "data.csv"):
    """Return an HTML anchor that links to ``filename``.

    The href is quoted and both values are HTML-escaped, so file names or
    titles containing spaces, quotes, ``<`` or ``&`` still produce valid markup.

    Args:
        title: Text shown for the link.
        filename: Path of the file, relative to the notebook's working directory.

    Returns:
        An ``IPython.display.HTML`` object the notebook renders as a link.
    """
    from html import escape  # local import: only needed by this helper

    link = '<a href="{href}">{text}</a>'.format(href=escape(filename, quote=True),
                                                text=escape(title))
    return HTML(link)
create_download_link(filename='submission.csv')

In [None]:
# Turn probabilities into hard labels in place: values <= 0.5 become 0,
# everything else (including NaN) becomes 1.
y_pred[:] = np.where(y_pred <= 0.5, 0, 1)

In [None]:
# NOTE(review): this writes the thresholded 0/1 labels to the same
# 'submission.csv' as the probability submission above, overwriting it -
# confirm which of the two files is meant to be kept.
submission = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')
submission['isFraud'] = y_pred
submission.head()
submission.to_csv('submission.csv', index=False)

In [None]:
# Show a download link for the file that was just written.
create_download_link(filename='submission.csv')

In [None]:
# Second parameter set: one large model (546 leaves, unlimited depth, higher
# learning rate) and no GPU-specific settings.
# NOTE(review): bagging_fraction is set without bagging_freq here as well.
params = {'num_leaves': 546,
          'min_child_weight': 0.03,
          'feature_fraction': 0.18,
          'bagging_fraction': 0.22,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.06,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.33,
          'reg_lambda': 0.39,
          'random_state': 42,
          'is_unbalance': True
}

In [None]:
# NOTE(review): `train` and `y_train` were deleted right after split_n_set
# was called, so on a top-to-bottom run this cell raises NameError.
# Stratified 80/20 split, then a single booster with early stopping on the validation part.
X_tr, X_val, y_tr, y_val = train_test_split(train, y_train, test_size = 0.2, stratify = y_train, random_state = 42)
gc.collect()
model = lightgbm.train(params,
                       lightgbm.Dataset(X_tr, label=y_tr),
                       valid_sets=lightgbm.Dataset(X_val, label=y_val),
                       num_boost_round=10000,
                       early_stopping_rounds=300,
                       verbose_eval = 200,
                       )

In [None]:
# NOTE(review): the next cell still reads X_tr, X_val, y_tr and y_val;
# deleting them here makes that cell fail.
del X_tr, X_val, y_tr, y_val
gc.collect()

In [None]:
from sklearn.metrics import classification_report

# Per-split precision / recall report. Probabilities are thresholded at 0.5
# (values <= 0.5 become 0.0, everything else 1.0) before scoring.
# NOTE(review): X_tr / X_val / y_tr / y_val are deleted in the preceding
# cell, and X_test / y_test are not defined anywhere in the visible notebook.
print("Training Set:")
y_pred = np.where(model.predict(X_tr) <= 0.5, 0.0, 1.0)
print(classification_report(y_tr, y_pred))

print("Validation Set:")
y_pred = np.where(model.predict(X_val) <= 0.5, 0.0, 1.0)
print(classification_report(y_val, y_pred))

print("Test Set:")
y_pred = np.where(model.predict(X_test) <= 0.5, 0.0, 1.0)
print(classification_report(y_test, y_pred))

In [None]:
# NOTE(review): `intermediate_layer_model` is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh run - it looks
# like a leftover from another experiment; confirm before keeping it.
# Predict with that model on `test` and name the output columns NN_feature_<i>.
intermediate_output_test = intermediate_layer_model.predict(test)
columns = []
for i in range(intermediate_output_test.shape[1]):
    columns.append("NN_feature_" + str(i))
new_features_test = pd.DataFrame(intermediate_output_test, columns=columns, index=test.index)
del intermediate_output_test
gc.collect()
# Append the NN_feature_* columns to the existing test features.
result_test = pd.concat([test, new_features_test], axis=1, sort=False)
del new_features_test
gc.collect()

In [None]:
# NOTE(review): `model` is trained above on data without NN_feature_* columns;
# confirm it can score `result_test`, which has the extra columns.
y_pred = model.predict(result_test)

In [None]:
# Write these predictions to submission.csv (replacing any earlier file of that name).
submission = pd.read_csv('/kaggle/input/ieee-fraud-detection/sample_submission.csv')
submission['isFraud'] = y_pred
submission.head()
submission.to_csv('submission.csv', index=False)

In [None]:
from IPython.display import HTML
def create_download_link(title = "Download CSV file", filename = "data.csv"):
    """Return an HTML anchor that links to ``filename``.

    This repeats the helper of the same name defined earlier in the notebook.
    The href is quoted and both values are HTML-escaped, so file names or
    titles containing spaces, quotes, ``<`` or ``&`` still produce valid markup.

    Args:
        title: Text shown for the link.
        filename: Path of the file, relative to the notebook's working directory.

    Returns:
        An ``IPython.display.HTML`` object the notebook renders as a link.
    """
    from html import escape  # local import: only needed by this helper

    link = '<a href="{href}">{text}</a>'.format(href=escape(filename, quote=True),
                                                text=escape(title))
    return HTML(link)
create_download_link(filename='submission.csv')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Feature names as recorded by the booster itself. Zipping the importances
# with `test.columns` plus invented NN_feature_* names could silently pair
# an importance with the wrong name.
result_columns = model.feature_name()

# Keep the 400 most important features (previously the first 400 in column order).
feature_imp = (
    pd.DataFrame({'Value': model.feature_importance(), 'Feature': result_columns})
    .sort_values(by="Value", ascending=False)
    .head(400)
)

# One bar per feature; scale the height with the number of bars instead of
# allocating a fixed 200x100 inch canvas.
fig, ax = plt.subplots(figsize=(12, max(4, 0.25 * len(feature_imp))))
sns.barplot(x="Value", y="Feature", data=feature_imp, ax=ax)
ax.set_title('LightGBM Features (avg over folds)')
fig.tight_layout()
# Save before show(): once the figure has been shown, pyplot's savefig
# writes an empty canvas.
fig.savefig('lgbm_importances-01.png')
plt.show()