### FDS (Fraud Detection System, 이상금융거래탐지시스템)

##### Module import

In [1]:
import os
import gc
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import multiprocessing
import missingno as msno
import tensorflow as tf

from sklearn import metrics, preprocessing
from sklearn.preprocessing import Imputer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics, preprocessing
from sklearn.decomposition import PCA
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

##### Reduce Memory Usage

In [2]:
%%time
# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of *df* in place to reduce memory usage.

    Signed-integer columns are cast to the narrowest of int8/int16/int32/int64
    that holds their observed min and max (bounds are inclusive, so a column
    whose maximum is exactly 127 still fits int8). Float columns are cast to
    float16/float32 when their range fits, otherwise float64. Object columns
    become ``category``. Every other dtype (bool, unsigned int, datetime,
    category, extension dtypes) is left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.
        use_float16: When True (default, the original behaviour) float columns
            may be narrowed to float16. Pass False to stop at float32 and
            avoid float16's very limited precision.

    Returns:
        The same DataFrame object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy dtypes carry a reliable one-letter kind code;
        # extension dtypes are skipped rather than guessed at.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None

        if kind == 'i':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive comparison: values equal to a limit are representable.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max is NaN/inf): keep full width.
                df[col] = df[col].astype(np.float64)
        # Any other kind (bool, unsigned, datetime, ...) is left as it is.

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

Wall time: 0 ns


##### Load CSV file

In [3]:
def _load_indexed_csv(path):
    """Read one CSV with TransactionID as the index and report completion."""
    frame = pd.read_csv(path, index_col='TransactionID')
    print("Completed load")
    return frame


# Transaction and identity tables for both splits, then the submission template.
train_tr = _load_indexed_csv('./input/train_transaction.csv')
train_id = _load_indexed_csv('./input/train_identity.csv')
test_tr = _load_indexed_csv('./input/test_transaction.csv')
test_id = _load_indexed_csv('./input/test_identity.csv')
sub = _load_indexed_csv('./input/sample_submission.csv')

Completed load
Completed load
Completed load
Completed load
Completed load


##### Left join on TransactionID

In [None]:
# Left-join the identity table onto the transaction table for each split,
# dropping the inputs as soon as the merged frame exists.
_join_opts = {'how': 'left', 'on': 'TransactionID'}

train = train_tr.merge(train_id, **_join_opts)
del train_tr, train_id

test = test_tr.merge(test_id, **_join_opts)
del test_tr, test_id

# Downcast both merged frames (train first, then test).
train, test = reduce_mem_usage(train), reduce_mem_usage(test)

##### EDA(Exploratory Data Analysis)

In [None]:
# Convert to object dtype for reuse: first the individually named columns...
for col in ('ProductCD', 'P_emaildomain', 'R_emaildomain', 'DeviceType', 'DeviceInfo'):
    train[col] = train[col].astype('object')

# ...then every column family, selected by a substring match on the column name.
card_cols, addres_cols, M_cols, C_cols, id_cols = (
    [c for c in train.columns if marker in c]
    for marker in ('card', 'addr', 'M', 'C', 'id')
)

for col in card_cols + addres_cols + M_cols + C_cols + id_cols:
    train[col] = train[col].astype('object')

In [None]:
# Three side-by-side density plots of the raw (pre-imputation) columns.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

_panels = (
    (ax1, 'dist1', 10, 'dist1 Distribution'),
    (ax2, 'dist2', 10, 'dist2 Distribution'),
    (ax3, 'TransactionAmt', 2, 'TransactionAmt Distribution'),
)
for _axis, _column, _bins, _title in _panels:
    # Missing values are dropped before plotting.
    ax = sns.distplot(train[_column].dropna(axis=0), bins=_bins, hist=False, ax=_axis)
    _axis.set_title(_title, fontsize=14)

##### Imputation of missing values

In [None]:
def missing_value_statistics(df):
    """Print per-column missing counts (first rows) and the overall missing percentage.

    Args:
        df: DataFrame to inspect.

    Returns:
        None. The statistics are written to stdout only.
    """
    missing_values_count = df.isnull().sum()
    print (missing_values_count.head())
    # df.size is rows * columns; np.product (used previously) no longer
    # exists in NumPy 2.0.
    total_cells = df.size
    total_missing = missing_values_count.sum()
    # An empty frame has no cells: report 0% instead of dividing by zero.
    missing_pct = (total_missing / total_cells) * 100 if total_cells else 0.0
    print ("% of missing data = ", missing_pct)

In [None]:
def missing_values_categorical(df, threshold=100.00):
    """Fill missing values of object-dtype columns with the string '-999'.

    Columns whose missing percentage is not below *threshold* are dropped
    first. The remaining object columns have their NaNs replaced, and the
    missing-value statistics of the result are printed.

    Args:
        df: DataFrame holding the categorical (object dtype) features.
        threshold: Upper bound (exclusive), in percent, on a column's missing
            share for the column to be kept.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    missing_values_count = df.isnull().sum()
    missing_percentage = (missing_values_count / df.shape[0]) * 100
    columns = missing_percentage[missing_percentage < threshold].index
    # Explicit copy: the assignment below must never write into (or warn
    # about writing into) a selection of the caller's frame.
    df = df[columns].copy()

    features_categorical = df.select_dtypes(include=['object']).columns
    if len(features_categorical) > 0:
        df[features_categorical] = df[features_categorical].fillna('-999')

    missing_value_statistics(df)

    return df

In [None]:
def missing_values_numerical(df, threshold=100.00):
    """Mean-impute the dist1, dist2 and TransactionAmt columns.

    Columns whose missing percentage is not below *threshold* are dropped
    first; the three target columns must survive that filter.

    The imputation is done with pandas rather than
    ``sklearn.preprocessing.Imputer``, which was removed in scikit-learn 0.22.

    Args:
        df: DataFrame containing at least dist1, dist2 and TransactionAmt.
        threshold: Upper bound (exclusive), in percent, on a column's missing
            share for the column to be kept.

    Returns:
        A new float64 DataFrame with columns ['dist1', 'dist2',
        'TransactionAmt'] (in that order), NaNs replaced by the column mean,
        carrying the same index as *df*.

    Raises:
        KeyError: if any of the three columns is absent after filtering.
    """
    missing_values_count = df.isnull().sum()
    missing_percentage = (missing_values_count / df.shape[0]) * 100
    columns = missing_percentage[missing_percentage < threshold].index
    df = df[columns]

    target_cols = ['dist1', 'dist2', 'TransactionAmt']

    # Work in float64 so the mean is not computed in a narrow dtype
    # (e.g. float16 produced by reduce_mem_usage), where the running sum
    # can overflow.
    values = df[target_cols].astype(np.float64)

    return values.fillna(values.mean())

In [None]:
# Share of missing cells over the whole merged training frame.
# train.size (rows * columns) replaces np.product(train.shape), which no
# longer exists in NumPy 2.0.
print ("% of train data missing = ", (train.isnull().sum().sum() / train.size) * 100)

In [None]:
threshold = 100.0
# Select every numeric width. The previous explicit dtype list left out
# int16 and float32, both of which reduce_mem_usage can produce, so such
# columns were silently dropped from the cleaned data.
features_numeric = train.select_dtypes(include=[np.number]).columns
col_1 = [c for c in features_numeric if c in ['dist1', 'dist2', 'TransactionAmt']]
col_2 = [c for c in features_numeric if c not in col_1]

# Keep the original index so both halves can be re-aligned on it.
s = train.index

# dist1 / dist2 / TransactionAmt: mean imputation.
train_clean_col_1 = missing_values_numerical(train[col_1], threshold)
train_clean_col_1 = train_clean_col_1.set_index([s])

# Every other numeric column: sentinel value -999.
train_clean_col_2 = train[col_2].fillna(-999)
train_clean_col_2 = train_clean_col_2.set_index([s])

train_clean_N = train_clean_col_1.merge(train_clean_col_2, on='TransactionID', how='left')
missing_value_statistics(train_clean_N)

In [None]:
threshold = 100.0

# Object-dtype columns are the ones handled as categorical features.
features_category = train.select_dtypes(include=['object']).columns
train_clean_C = missing_values_categorical(
    df=train.loc[:, features_category],
    threshold=threshold,
)

# Re-assert object dtype on card1, then report what is still missing.
train_clean_C['card1'] = train_clean_C['card1'].astype('object')
missing_value_statistics(train_clean_C)

In [None]:
# Recombine the cleaned numeric and categorical halves on TransactionID.
train_clean = pd.merge(train_clean_N, train_clean_C, how='left', on='TransactionID')

In [None]:
# Drop the names of the intermediates now that train_clean exists.
del train
del train_clean_N
del train_clean_C

### TransactionDT guess

In [None]:
# Assumed reference date: TransactionDT is treated as an offset in seconds
# from this day (a guess, as the section title says).
START_DATE = '2018-01-01'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# Vectorised conversion: one timedelta array added to the start date,
# instead of calling a Python lambda once per row.
train_clean['TransactionDT'] = startdate + pd.to_timedelta(train_clean['TransactionDT'], unit='s')
print(train_clean.head())
print(train_clean.tail())

##### Feature Scaling

In [None]:
# Same three density plots as before, now drawn from the cleaned frame.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

_clean_panels = (
    (ax1, 'dist1', 10, 'dist1 Distribution'),
    (ax2, 'dist2', 10, 'dist2 Distribution'),
    (ax3, 'TransactionAmt', 2, 'TransactionAmt Distribution'),
)
for _axis, _column, _bins, _title in _clean_panels:
    ax = sns.distplot(train_clean[_column].dropna(axis=0), bins=_bins, hist=False, ax=_axis)
    _axis.set_title(_title, fontsize=14)

In [None]:
# Replace each of the three columns with log(x + 1).
for _column in ('TransactionAmt', 'dist1', 'dist2'):
    train_clean[_column] = np.log(train_clean[_column] + 1)

In [None]:
# The overall scaling results are similar, but the distribution of the
# remaining data (once outliers are set aside) is better with robust scaling.
# Since the goal is outlier detection, this can be considered the best
# scaling method here.
from sklearn.preprocessing import RobustScaler

cols_1 = [c for c in train_clean.columns if c in ['dist1', 'dist2', 'TransactionAmt']]
# Built from cols_1 defined just above (previously this read the unrelated
# global col_1 left over from the imputation cell).
cols_2 = [c for c in train_clean.columns if c not in cols_1]

# RobustScaler is less prone to outliers.
rob_scaler = RobustScaler(with_scaling=True, with_centering=False)

# Label the scaled array with the very columns that were passed in, and give
# it the same index as the original dataset, so names and rows cannot drift
# out of step with the data.
s = train_clean.index
train_clean_rob = pd.DataFrame(
    data=rob_scaler.fit_transform(train_clean[cols_1]),
    columns=cols_1,
    index=s,
)

# Merge the scaled columns with all remaining columns (categorical features,
# "isFraud", "TransactionDT", ...) to get back the cleaned dataset with
# scaled numerical columns.
train_clean_rob = pd.merge(train_clean_rob, train_clean[cols_2], left_index=True, right_index=True)

# Just a check of the dimensions.
print(train_clean.shape)
print(train_clean_rob.shape)

In [None]:
# Density plots of the three columns after robust scaling.
f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

_scaled_panels = (
    (ax1, 'dist1', 10, 'dist1 Distribution After Scaling'),
    (ax2, 'dist2', 10, 'dist2 Distribution After Scaling'),
    (ax3, 'TransactionAmt', 2, 'TransactionAmt Distribution After Scaling'),
)
for _axis, _column, _bins, _title in _scaled_panels:
    ax = sns.distplot(train_clean_rob[_column], bins=_bins, hist=False, ax=_axis)
    _axis.set_title(_title, fontsize=14)

In [None]:
# train_clean_rob.head()

In [None]:
# train_clean.columns

In [None]:
# train_clean_rob['TransactionDT']

### Transaction amount comparison: fraud vs. non-fraud

In [None]:
# Stacked histograms of the scaled transaction amount, split by the isFraud flag.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 6))
bins = 30

for _axis, _flag, _title in ((ax1, 1, 'Fraud'), (ax2, 0, 'Non-Fraud')):
    _amounts = train_clean_rob.loc[train_clean_rob['isFraud'] == _flag, 'TransactionAmt']
    _axis.hist(_amounts, bins=bins)
    _axis.set_title(_title)

# NOTE(review): these pyplot-level calls act on the current axes only
# (presumably the lower panel) - confirm that is intended.
plt.xlabel('TransactionAmt($)')
plt.ylabel('Number of transaction')
plt.yscale('log')
plt.show()

In [None]:
import matplotlib.gridspec as gridspec

# Columns at positions 51..99. DataFrame.ix was removed in pandas 1.0;
# iloc is the positional equivalent.
features = train_clean.iloc[:, 51:100].columns

# Make sure the target directory exists before the first savefig call.
os.makedirs('./output/pics', exist_ok=True)

for i, cn in enumerate(features):
    print("i: ",i)
    print("cn: ",cn)
    fig = plt.figure(figsize=(20,12))
    ax = fig.add_subplot(111)
    # Fraud in red, non-fraud in blue, drawn on the same axes.
    sns.distplot(train_clean[cn][train_clean.isFraud == 1], bins=50, color='r', ax=ax)
    sns.distplot(train_clean[cn][train_clean.isFraud == 0], bins=50, color='b', ax=ax)
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(cn))
    fig.savefig('./output/pics/histgram of {}.png'.format(str(cn)))
    # Close the figure (not just clear it) so one figure per feature does not
    # stay alive in memory for the rest of the session.
    plt.close(fig)
#plt.show()

In [None]:
# #Min-Max Normalization
# from sklearn.preprocessing import minmax_scale, StandardScaler, MinMaxScaler
# df['A'].apply(minmax_scale)

# minmax_scale = MinMaxScaler(feature_range=[0,1]).fit(df[['A', 'B']]) # A, B 컬럼 각각 standard_scaler가 만들어짐
# df_minmax = minmax_scale.transform(df[['A', 'B']])

In [None]:
# df['A'].apply(lambda x: StandardScaler(x))
# std_scale = StandardScaler().fit(df[['A', 'B']]) # A, B 컬럼 각각 standard_scaler가 만들어짐
# df_std = std_scale.transform(df[['A', 'B']])

In [None]:
# NOTE(review): looks like a leftover debug/progress marker - confirm before removing.
print('sas')

In [None]:
# #train.isnull().sum(axis=0)
# #train.isnull().mean(axis=0)
# #train['dist1'].isnull().mean(axis=0)
# # train.dropna(axis=0, thresh= 3)
# # train.fillna(value=0)

# def Preprocess_rate(targetdf, cols, drop_rate):
#     df = targetdf.copy()
#     for col in cols:
#         if(df[col].isnull().mean(axis=0) > drop_rate):
#             df = df.drop(columns = col)
#     return df
# newdf = Preprocess_rate(train, train.columns, 0.4)

In [None]:
# sns.distplot(newdf.C1.dropna())
# plt.show()

In [None]:
# #fill NaN by most_frequency (have to adapt with categorical columns)
# newdf2 = pd.get_dummies(newdf)
# def Most_freq(targetdf, cols):
#     df = targetdf.copy()
#     for col in cols:
#         if(df[col].isnull().mean(axis=0) > 0):
#             print(col)
#             most_frequency = df[col].value_counts(dropna = True).idxmax()
#             print(most_frequency)
#             df = df[col].fillna(most_frequency, inplace = True)
#     return df

# newdf3 = Most_freq(newdf2, newdf2.columns)
# newdf3.isnull().mean(axis=0)

In [None]:
# most_frequency = newdf2['TransactionDT'].value_counts(dropna = True).idxmax()
# print(most_frequency)

In [None]:
# print(frauddf.mean())

In [None]:
# corr_with_Fraud = train.corrwith(train['isFraud'])
# for d in corr_with_Fraud:
#     if(d > 0.01):
#         print((corr_with_Fraud==d).argmax(), d)


In [None]:
# corr_with_Fraud = train.corrwith(train['isFraud'])
# print(corr_with_Fraud)

In [None]:
# msno.heatmap(train.iloc[:, :5])

In [None]:
# msno.matrix(df=train.iloc[:, 61:80], figsize=(20, 20), color=(0.2, 0.3, 0.8))

In [None]:
# ###Fraud ratio pie chart
# f, ax = plt.subplots(1, 2, figsize=(18, 10))

# train['isFraud'].value_counts().plot.pie(explode=[0, 0.1], autopct='%1.1f%%', ax=ax[0], shadow=True)
# ax[0].set_title('Pie plot - isFraud')
# ax[0].set_ylabel('')
# sns.countplot('isFraud', data=train, ax=ax[1])
# ax[1].set_title('Count plot - isFraud')

# plt.show()

In [None]:
# train['isFraud'].value_counts()

In [None]:
# print(train2.columns)

In [None]:
#train_tr['isFraud'].count()

In [None]:
# #without preprocessing

# # del train
# # del train_tr
# # del train_id

# # train = train.drop(columns=['dist2','R_emaildomain','D2','D3','D4','D5',
# #                              'D7','D8','D9','D12','D13','D14'
# #                             ])
# # train = train.drop(columns=['V1','V2'])
# train = train.loc[:,['isFraud','TransactionDT','TransactionAmt','ProductCD','card1',
#                      'card2','card3','C1','C2','C3','C4',
#                      'V12'
#                     ]]
# #print(train)
# train2 = pd.get_dummies(train)
# train2=train2.dropna(axis=0)
# #train2.isnull().sum(axis=0)

# #y= pd.get_dummies(train2.isFraud)
# y =train2.isFraud
# X = train2.drop(columns=['isFraud'])

# #headerX = train2.drop(columns=['isFraud']).columns

In [None]:
# X2= np.array(X)
# y = np.array(y)

# X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3,shuffle = False )
# print(X2.shape)
# print(y.shape)


In [None]:
# print(X_train)

In [None]:
# print(y_train[:])

In [None]:
# glm = LogisticRegression()
# glm.fit(X_train, y_train[:])
# y_pred = glm.predict(X_test)
# acc = np.mean(y_test[:] == y_pred )
# print("SKLEARN Logistic Regression Accuracy = {:3.3f}".format(acc))