[] handle object features :- simplify the names   
[] find a way to fill null values   
[] set validation properly   
[] experiment with folds (increase and decrease the folds)   
[] handle data imbalance (upsampling or downsampling)   
[] get more useful features by checking other kernels   

[] try other models  

Importing required libraries

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from sklearn.model_selection import KFold,StratifiedKFold
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import os
from sklearn.metrics import roc_auc_score
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))



Function to reduce the memory usage

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are converted to the smallest of int8/int16/int32/int64 whose
    limits contain the column's min and max (limits inclusive). Float columns are
    converted to float16 or float32 when the min and max fit, otherwise float64.
    Columns whose dtype is not in the ``numerics`` list are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this same object.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] still fits int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE(review): only the value range is checked here, not precision;
            # float16 can round values that lie inside its range.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached when min/max are NaN (comparisons with NaN are False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

Loading train and test transaction files

In [None]:
# Read the train and test transaction tables (train first, then test).
train_transaction, test_transaction = map(pd.read_csv, (
    '../input/ieee-fraud-detection/train_transaction.csv',
    '../input/ieee-fraud-detection/test_transaction.csv',
))

Check the training transaction data

In [None]:
# First five transaction rows, transposed so each column name becomes a row.
train_transaction.head().transpose()

Loading train and test identity data

In [None]:
# Read the train and test identity tables (train first, then test).
train_identity, test_identity = map(pd.read_csv, (
    '../input/ieee-fraud-detection/train_identity.csv',
    '../input/ieee-fraud-detection/test_identity.csv',
))

Checking train identity data

In [None]:
# First five identity rows, transposed so each column name becomes a row.
train_identity.head().transpose()

Merging transaction and identity data

In [None]:
# Left-join the identity table onto the transactions by TransactionID, keeping
# every transaction row. The join key is given only through `on`: the frames were
# read with the default index, so joining on the index (left_index/right_index)
# would pair rows by position rather than by TransactionID.
train_data = train_transaction.merge(train_identity, on='TransactionID', how='left')
test_data = test_transaction.merge(test_identity, on='TransactionID', how='left')

Deleting data that is no longer required, to save memory

In [None]:
# The merged frames hold everything needed from here on; drop the source tables.
del train_transaction, train_identity
del test_transaction, test_identity

Checking the training data

In [None]:
# First five merged rows, transposed so each column name becomes a row.
train_data.head().transpose()

Getting number of columns having null values

In [None]:
# Number of training columns that contain at least one null value.
train_data.isna().any().sum()

In [None]:
# Columns in which more than 90% of the training values are null.
# The per-column null fraction is computed once and reused as both the values
# and the boolean filter.
null_fraction = (train_data.isnull().sum() / len(train_data)).sort_values(ascending = False)
null_features = null_fraction[null_fraction > .90].index

In [None]:
# Remove the mostly-null columns from both the train and the test frame.
train_data = train_data.drop(columns=null_features)
test_data = test_data.drop(columns=null_features)

In [None]:
# Downcast numeric columns of both frames (train first, then test) to save memory.
train_data, test_data = (
    reduce_mem_usage(frame) for frame in (train_data, test_data)
)

Data is very imbalanced

In [None]:
# Checking target balance: bar chart of the class counts, the counts themselves,
# and the share of rows whose isFraud label is 0.
class_counts = train_data['isFraud'].value_counts(dropna = True)
class_counts.plot.bar()
print(class_counts)
print(class_counts[0]/len(train_data))

In [None]:
# Categorical features: the columns stored with object dtype.
cat_features = train_data.select_dtypes(include=['object']).columns
print(cat_features)


In [None]:
# Frequency of each ProductCD value, with missing values counted as well.
train_data['ProductCD'].value_counts(dropna = False)

In [None]:
# Frequency of each DeviceInfo value, with missing values counted as well.
train_data['DeviceInfo'].value_counts(dropna = False)

Separating the target value from the features

In [None]:
# Keep the label as its own Series and leave only feature columns in train_data.
y_train = train_data.loc[:, 'isFraud']
train_data = train_data.drop(columns='isFraud')

Filling missing values with -99

In [None]:
# Replace every missing value in both frames with the sentinel -99.
train_data = train_data.fillna(-99)
test_data = test_data.fillna(-99)

Label encoding the categorical features

In [None]:
# Integer-encode every column that has object dtype in either frame.
# Each encoder is fitted on the train and test values together (as strings),
# so both frames share one mapping per column.
for col in train_data.columns:
    if not (train_data[col].dtype == 'object' or test_data[col].dtype == 'object'):
        continue
    train_as_str = train_data[col].astype('str')
    test_as_str = test_data[col].astype('str')
    encoder = LabelEncoder()
    encoder.fit(train_as_str.values.tolist() + test_as_str.values.tolist())
    train_data[col] = encoder.transform(train_as_str.values)
    test_data[col] = encoder.transform(test_as_str.values)
        

In [None]:
# Random seed; referenced by the 'seed' entry of the new `params` dictionary.
SEED = 42

In [None]:
#old params
# NOTE: the following cell assigns `params` again before it is used, so this
# dictionary is kept for reference only and is not what lgb.train receives.

params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type": "gbdt",
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47,
         }

In [None]:
#new parameters
# LightGBM settings passed to lgb.train in the training cell.
# NOTE(review): 'n_estimators' and 'early_stopping_rounds' are also supplied as
# arguments to lgb.train (10000 rounds / 500 rounds) — confirm which values are
# meant to apply.
params = {
                'objective':'binary',
                'boosting_type':'gbdt',
                'metric':'auc',
                'n_jobs':-1,
                'learning_rate':0.01,
                'num_leaves': 2**8,
                'max_depth':-1,
                'tree_learner':'serial',
                'colsample_bytree': 0.85,
                'subsample_freq':1,
                'subsample':0.85,
                'n_estimators':2**9,
                'max_bin':255,
                'verbose':-1,
                'seed': SEED,
                'early_stopping_rounds':100,
                'reg_alpha':0.3,
                # Key was misspelled 'reg_lamdba', so the L2 setting was never applied.
                'reg_lambda':0.243
            } 

In [None]:
# Useful features as indicated by other kernels.
# The training cell uses this list as its `columns` selection for both frames.
useful_features = ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                   'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13',
                   'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3',
                   'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V17',
                   'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V40', 'V44', 'V45', 'V46', 'V47', 'V48',
                   'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V69', 'V70', 'V71',
                   'V72', 'V73', 'V74', 'V75', 'V76', 'V78', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V87', 'V90', 'V91', 'V92',
                   'V93', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126', 'V127', 'V128', 'V130', 'V131', 'V138', 'V139', 'V140',
                   'V143', 'V145', 'V146', 'V147', 'V149', 'V150', 'V151', 'V152', 'V154', 'V156', 'V158', 'V159', 'V160', 'V161',
                   'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V169', 'V170', 'V171', 'V172', 'V173', 'V175', 'V176', 'V177',
                   'V178', 'V180', 'V182', 'V184', 'V187', 'V188', 'V189', 'V195', 'V197', 'V200', 'V201', 'V202', 'V203', 'V204',
                   'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V219', 'V220',
                   'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V231', 'V233', 'V234', 'V238', 'V239',
                   'V242', 'V243', 'V244', 'V245', 'V246', 'V247', 'V249', 'V251', 'V253', 'V256', 'V257', 'V258', 'V259', 'V261',
                   'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268', 'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                   'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285', 'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                   'V304', 'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314', 'V315', 'V317', 'V322', 'V323', 'V324', 'V326',
                   'V329', 'V331', 'V332', 'V333', 'V335', 'V336', 'V338', 'id_01', 'id_02', 'id_03', 'id_05', 'id_06', 'id_09',
                   'id_11', 'id_12', 'id_13', 'id_14', 'id_15', 'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32', 'id_33',
                   'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']

In [None]:
#training lightgbm with kfold validation and prediction on test data
%%time

NFOLDS = 5
#folds = KFold(n_splits=NFOLDS)
folds = StratifiedKFold(n_splits=NFOLDS) 
#columns = train_data.columns 
columns = useful_features 
splits = folds.split(train_data, y_train)
y_preds = np.zeros(test_data.shape[0])
y_oof = np.zeros(train_data.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns
  
for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train_, X_valid = train_data[columns].iloc[train_index], train_data[columns].iloc[valid_index]
    y_train_, y_valid = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    dtrain = lgb.Dataset(X_train_, label=y_train_)
    dvalid = lgb.Dataset(X_valid, label=y_valid) 

    clf = lgb.train(params, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
     
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(test_data[columns]) / NFOLDS
    
    del X_train_, X_valid, y_train_, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(y_train, y_oof)}")

In [None]:
# Load the sample submission file; its isFraud column is overwritten in the next cell.
sample_submission = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [None]:
# Write the fold-averaged test predictions into the submission frame.
# NOTE(review): assumes sample_submission rows are in the same order as test_data — confirm.
sample_submission['isFraud'] = y_preds

In [None]:
# Show the first and the last ten rows of the submission.
for preview in (sample_submission.head(10), sample_submission.tail(10)):
    print(preview)

In [None]:
# Save the submission to submission.csv without the index column.
sample_submission.to_csv('submission.csv',index = False)

In [None]:
from IPython.display import HTML
# Render an HTML link pointing at the submission.csv file written above.
html = "<a href='submission.csv'>d</a>"
HTML(html)