In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
from sklearn import preprocessing
import seaborn as sns
import pickle
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler, minmax_scale
from sklearn.feature_selection import RFECV

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (object, bool, int8, unsigned, ...) is
    left untouched.  Integer columns are cast to the first of
    int8/int16/int32/int64 whose range contains [min, max]; float columns are
    cast to float16 or float32 when their range fits, otherwise to float64.

    The range checks are inclusive, so a column whose extremes sit exactly on a
    dtype limit (e.g. -128 or 127 for int8) still gets the narrower type.

    Note: for floats only the *range* is checked, not the precision, so values
    can be rounded when a column is narrowed to float16/float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate targets, narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when neither narrower float fits
            # (this is also the path taken when min/max are NaN).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame with a zero-byte footprint does not produce nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

# Seen from https://www.kaggle.com/kabure/almost-complete-feature-engineering-ieee-data#V-Features
def PCA_change(df, cols, n_components, prefix='PCA_', rand_seed=4):
    """Replace the columns ``cols`` of ``df`` with their PCA projection.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. ``cols`` are dropped from it **in place**.
    cols : list
        Column labels fed to the PCA.
    n_components : int
        Passed straight through to ``PCA(n_components=...)``.
    prefix : str, default 'PCA_'
        New columns are named ``prefix + "0"``, ``prefix + "1"``, ...
    rand_seed : int, default 4
        ``random_state`` for the PCA.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining columns of ``df`` followed by the component
        columns.
    """
    pca = PCA(n_components=n_components, random_state=rand_seed)
    principalComponents = pca.fit_transform(df[cols])

    # Reuse df's index: with a fresh 0..n-1 index, concat(axis=1) would align on
    # labels and produce NaN-padded rows whenever df is not indexed 0..n-1
    # (e.g. after filtering or sampling).
    principalDf = pd.DataFrame(
        principalComponents,
        index=df.index,
        columns=[str(prefix) + str(i) for i in range(principalComponents.shape[1])],
    )

    df.drop(cols, axis=1, inplace=True)

    return pd.concat([df, principalDf], axis=1)

def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary table.

    The returned frame has the columns ``Name``, ``dtypes``, ``Missing``
    (null count), ``Uniques`` (``nunique``), ``First Value`` / ``Second Value``
    / ``Third Value`` (the first three rows of ``df`` by position) and
    ``Entropy`` (base-2 Shannon entropy of the column's value frequencies,
    rounded to two decimals; ``value_counts`` leaves nulls out by default).

    Rows are looked up by position, so the function works for any index, and
    the sample-value columns are NaN when ``df`` has fewer rows than requested.
    """
    print(f"Dataset Shape: {df.shape}")

    def _row_values(position):
        # Positional access: label-based df.loc[0] raises KeyError as soon as
        # the index is not the default 0..n-1 or the frame is too short.
        if position < len(df):
            return df.iloc[position].values
        return [np.nan] * df.shape[1]

    summary = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name', 'dtypes']]
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = _row_values(0)
    summary['Second Value'] = _row_values(1)
    summary['Third Value'] = _row_values(2)

    summary['Entropy'] = [
        round(stats.entropy(df[name].value_counts(normalize=True), base=2), 2)
        for name in summary['Name']
    ]

    return summary

In [None]:
# Read the four raw CSV files.
df_transaction_train = pd.read_csv("../Data/train_transaction.csv")
df_identity_train = pd.read_csv("../Data/train_identity.csv")
df_transaction_test = pd.read_csv("../Data/test_transaction.csv")
df_identity_test = pd.read_csv("../Data/test_identity.csv")

# Left join on TransactionID: every transaction row is kept, whether or not a
# matching identity row exists.
df_train = df_transaction_train.merge(df_identity_train, how="left", on="TransactionID")
df_test = df_transaction_test.merge(df_identity_test, how="left", on="TransactionID")

In [None]:
pd.set_option("display.max_rows", 500)

In [None]:
print("shape of df_identity_train: ", df_identity_train.shape)
print("shape of df_transaction_train: ", df_transaction_train.shape)
print("shape of df_train: ", df_train.shape)

In [None]:
del df_identity_train, df_transaction_train, df_identity_test, df_transaction_test

In [None]:
# Print min/max side by side for every non-object test column so that range
# differences between the two splits are easy to spot.
# Output order per line: column, train min, test min, train max, test max.
non_object_test_cols = [name for name in df_test.columns if df_test[name].dtype != 'object']
for col in non_object_test_cols:
    train_col, test_col = df_train[col], df_test[col]
    print(col, train_col.min(), test_col.min(), train_col.max(), test_col.max())

In [None]:
def _low_information_columns(df, threshold=0.9):
    """Return three lists of column names flagged as carrying little signal in ``df``.

    1. columns with at most one distinct non-null value,
    2. columns whose share of nulls exceeds ``threshold``,
    3. columns whose most frequent value (nulls counted as a value) has a
       share above ``threshold``.
    """
    n_rows = df.shape[0]
    one_val = [col for col in df.columns if df[col].nunique() <= 1]
    mostly_missing = [col for col in df.columns if df[col].isnull().sum() / n_rows > threshold]
    dominated = [
        col for col in df.columns
        if df[col].value_counts(dropna=False, normalize=True).values[0] > threshold
    ]
    return one_val, mostly_missing, dominated


# A column is dropped if it is flagged in EITHER split.
_train_flags = _low_information_columns(df_train)
_test_flags = _low_information_columns(df_test)
one_val_cols = _train_flags[0] + _test_flags[0]
missing_val_cols = _train_flags[1] + _test_flags[1]
same_val_cols = _train_flags[2] + _test_flags[2]

# sorted() rather than list(set(...)): set order for strings changes from one
# interpreter run to the next, which made the printed list non-reproducible.
cols_to_drop = sorted(set(one_val_cols + missing_val_cols + same_val_cols))
print(cols_to_drop)


In [None]:
# Never drop the target column. Filtering instead of list.remove() keeps this
# cell safe to re-run and avoids a ValueError when isFraud was not flagged.
cols_to_drop = [name for name in cols_to_drop if name != 'isFraud']

In [None]:
len(cols_to_drop)

In [None]:
print(df_train["id_30"].dtype)

In [None]:
# Remove the flagged columns from both splits (in place).
for frame in (df_train, df_test):
    frame.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Replace every remaining missing value, in all columns of both frames, with
# the sentinel -999. The `emails` mapping further down has a -999 key for
# exactly this sentinel.
df_train.fillna(-999, inplace = True)
df_test.fillna(-999, inplace = True)

In [None]:
resumetable(df_train)

In [None]:
def make_day_feature(df, offset=0.58, tname='TransactionDT'):
    """
    Build a day-of-week feature with values 0-6.

    ``df[tname]`` is divided by the number of seconds in a day, shifted by
    ``-1 + offset`` days, floored and wrapped modulo 7.
    """
    seconds_per_day = 3600 * 24
    shifted_days = df[tname] / seconds_per_day - 1 + offset
    return np.mod(np.floor(shifted_days), 7)

def make_hour_feature(df, tname='TransactionDT'):
    """
    Build an hour-of-day feature with values 0-23.

    ``df[tname]`` is divided by 3600, floored to whole hours and wrapped
    modulo 24.
    """
    whole_hours = np.floor(df[tname] / 3600)
    return np.mod(whole_hours, 24)

In [None]:
# Derive the calendar features on both splits with the same helpers.
for frame in (df_train, df_test):
    frame["Weekday"] = make_day_feature(frame)
    frame["Hour"] = make_hour_feature(frame)

In [None]:
df_train.dtypes

In [None]:

# Lookup from raw e-mail domain to a provider group. The -999 key covers the
# sentinel written by the earlier fillna step.
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 
          'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 
          'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 
          'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 
          'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other',
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 
          'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 
          'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 
          'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 
          'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 
          'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 
          'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple',
          -999:"undefined"}
# Suffixes that are collapsed into the single label 'us'.
us_emails = ['gmail', 'net', 'edu']


def _email_suffix(domain):
    """Return the text after the last '.' of ``domain``, or 'us' if that text is listed in ``us_emails``."""
    suffix = str(domain).split('.')[-1]
    return 'us' if suffix in us_emails else suffix


# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest_df-579654
for col in ['P_emaildomain', 'R_emaildomain']:
    for frame in (df_train, df_test):
        # Provider group; domains missing from `emails` come out as NaN.
        frame[col + '_pre'] = frame[col].map(emails)
        frame[col + '_suffix'] = frame[col].map(_email_suffix)


In [None]:
df_train["P_emaildomain_suffix"].unique()

In [None]:
for col in df_train.columns:
    if col.startswith("id"):
        print(col, df_train[col].nunique())
        if df_train[col].dtype!='object':
            print(df_train[col].nunique(), sorted(df_train[col].unique())[:10])

In [None]:
df_train.columns[-50:]

In [None]:
# Columns that the label-encoding cell below converts to integer codes:
# raw fields, the M1..M9 flags, the derived Weekday/Hour and e-mail features,
# and a subset of the id_* columns.
# NOTE(review): every name listed here must still be present after the
# cols_to_drop step, otherwise the encoding loop raises KeyError — confirm.
categorical = ["ProductCD", "card1", "card2", "card3", "card4", "card5", "card6", "addr1", "addr2",
               "P_emaildomain", "R_emaildomain"] +  ["M" + str(i) for i in range(1, 10)] +\
                    ['DeviceType', 'DeviceInfo', 'Weekday', 'Hour',
                     'P_emaildomain_pre', 'P_emaildomain_suffix', 'R_emaildomain_pre',
                     'R_emaildomain_suffix', 'id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_19', 'id_20', 
                     'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',]

In [None]:
# Label encoding
category_counts = {}
for col in categorical:
    train_values = list(df_train[col].values)
    test_values = list(df_test[col].values)

    # Fit on both splits together so every value seen in either one has a code.
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_values + test_values)

    df_train[col] = lbl.transform(train_values)
    df_test[col] = lbl.transform(test_values)

    # One more than the number of distinct classes
    # (presumably headroom for an extra/unknown index — TODO confirm).
    category_counts[col] = len(lbl.classes_) + 1

In [None]:
category_counts

In [None]:
for col in categorical:
    print(col, df_train[col].nunique(), df_train[col].min(), df_train[col].max())

In [None]:
print(df_train.columns[-50:])

In [None]:
# Remove the identifier and raw timestamp columns from both splits.
for frame in (df_train, df_test):
    for dropped in ("TransactionID", "TransactionDT"):
        del frame[dropped]

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
# Downcast numeric columns to cut memory. reduce_mem_usage picks
# float16/float32 from the value range alone, so float values may be rounded
# at this step.
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

In [None]:
y_train = df_train['isFraud']

In [None]:
del df_train['isFraud']
x_train = df_train

In [None]:
x_test = df_test

In [None]:
y_train.shape

In [None]:
# Standardise features with statistics computed over train and test stacked
# together. np.concatenate stacks by position, so both frames must have their
# columns in the same order.
scaler = StandardScaler().fit(np.concatenate([x_train, x_test]))
scaled_x_train, scaled_x_test = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
scaled_x_train.shape

In [None]:
scaled_x_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from bayes_opt import BayesianOptimization

In [None]:
# Hold out 30% of the labelled rows for validation.
# random_state pins the split so the pickled artifacts and every tuning score
# are reproducible across kernel restarts; stratify keeps the isFraud class
# ratio the same in both parts.
X_train, X_cv, Y_train, Y_cv = train_test_split(
    scaled_x_train, y_train, test_size=0.3, random_state=42, stratify=y_train
)

In [None]:
print(X_train.shape)
print(X_cv.shape)
print(scaled_x_test.shape)
print(Y_train.shape)
print(Y_cv.shape)

In [None]:
X_test = scaled_x_test

In [None]:
# Persist the prepared splits, one pickle file per array.
artifacts = {
    "../Data/X_train_v5.pkl": X_train,
    "../Data/X_cv_v5.pkl": X_cv,
    "../Data/X_test_v5.pkl": X_test,
    "../Data/Y_train_v5.pkl": Y_train,
    "../Data/Y_cv_v5.pkl": Y_cv,
}
for path, payload in artifacts.items():
    with open(path, "wb") as handle:
        pickle.dump(payload, handle)

In [None]:
# Learnt from https://www.kaggle.com/pavelvpster/ieee-fraud-eda-lightgbm-baseline/notebook

In [None]:
# Wrap the training and hold-out splits as LightGBM Datasets, casting both
# features and labels to float32. These two objects are reused by every
# lgb.train call below.
lgb_train = lgb.Dataset(data=X_train.astype('float32'), label=Y_train.astype('float32'))
lgb_valid = lgb.Dataset(data=X_cv.astype('float32'), label=Y_cv.astype('float32'))

In [None]:
def train_model(num_leaves, min_data_in_leaf, max_depth, bagging_fraction, feature_fraction, lambda_l1, lambda_l2):
    """Objective for the Bayesian optimiser: train one LightGBM model, return hold-out AUC.

    The arguments are the hyper-parameters being searched. The optimiser
    proposes floats, so the integer-valued ones (``num_leaves``,
    ``min_data_in_leaf``, ``max_depth``) are truncated with ``int()``.

    Reads the module-level ``lgb_train`` / ``lgb_valid`` datasets and the
    ``X_cv`` / ``Y_cv`` hold-out arrays.

    Returns
    -------
    float
        ROC AUC of the trained model on the hold-out split.
    """
    params = {
        'objective': 'binary',
        'metric': 'auc',
        'is_unbalance': False,
        'boost_from_average': True,
        'num_threads': 4,

        'num_leaves': int(num_leaves),
        'min_data_in_leaf': int(min_data_in_leaf),
        'max_depth': int(max_depth),
        'bagging_fraction': bagging_fraction,
        # LightGBM applies bagging_fraction only when bagging_freq is non-zero
        # (its default is 0). Without this the optimiser tunes a parameter
        # that has no effect.
        'bagging_freq': 1,
        'feature_fraction': feature_fraction,
        'lambda_l1': lambda_l1,
        'lambda_l2': lambda_l2
    }

    # NOTE(review): verbose_eval was removed from lgb.train in LightGBM 4.0
    # (replaced by callbacks=[lgb.log_evaluation(...)]) — confirm against the
    # installed version before upgrading.
    # NOTE(review): recent LightGBM releases may refuse to change
    # min_data_in_leaf on an already-constructed Dataset unless
    # feature_pre_filter=False is set — confirm.
    lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=1000)

    holdout_scores = lgb_model.predict(X_cv.astype('float32'), num_iteration=lgb_model.best_iteration)

    return roc_auc_score(Y_cv.astype('float32'), holdout_scores)

In [None]:
# Search space handed to the Bayesian optimiser: one (lower, upper) pair per
# train_model argument. The optimiser samples floats inside each range.
bounds = {
    'num_leaves': (31, 500),
    'min_data_in_leaf': (20, 200),
    'max_depth':(-1, 50),  # train_model truncates with int(), so any draw strictly between -1 and 1 becomes 0
    'bagging_fraction' : (0.1, 0.9),
    'feature_fraction' : (0.1, 0.9),
    'lambda_l1': (0, 2),
    'lambda_l2': (0, 2)
}

In [None]:
bo = BayesianOptimization(train_model, bounds, random_state=42)

In [None]:
bo.maximize(init_points=10, n_iter=15, acq='ucb', xi=0.0, alpha=1e-6)

In [None]:
bo.max

In [None]:
# Retrain once with the best hyper-parameters found by the optimiser.
# Keep this dict in sync with the one built inside train_model.
best_params = bo.max['params']

params = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': False,
    'boost_from_average': True,
    'num_threads': 4,

    # The optimiser returns floats; integer-valued parameters are truncated.
    'num_leaves': int(best_params['num_leaves']),
    'min_data_in_leaf': int(best_params['min_data_in_leaf']),
    'max_depth': int(best_params['max_depth']),
    'bagging_fraction': best_params['bagging_fraction'],
    # LightGBM applies bagging_fraction only when bagging_freq is non-zero
    # (its default is 0); without it the tuned fraction is silently ignored.
    'bagging_freq': 1,
    'feature_fraction': best_params['feature_fraction'],
    'lambda_l1': best_params['lambda_l1'],
    'lambda_l2': best_params['lambda_l2']
}

lgb_model = lgb.train(params, lgb_train, valid_sets=lgb_valid, verbose_eval=1000)

In [None]:
Y_pred = lgb_model.predict(X_test.astype('float32'), num_iteration=lgb_model.best_iteration)

In [None]:
Y_pred.sum()

In [None]:
# Load the sample submission (indexed by TransactionID), overwrite its isFraud
# column with the predicted scores and write the result to disk.
# NOTE(review): Y_pred is assigned by position, so this assumes the rows of
# df_test are in the same order as sample_submission.csv — confirm.
submission = pd.read_csv('../Data/sample_submission.csv', index_col='TransactionID')
submission['isFraud'] = Y_pred
submission.to_csv('../Data/Y_test_v5.csv')