In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta 

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
%matplotlib inline

import scipy.stats as stats

from geopy.distance import geodesic
import geopy.distance

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer

from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

In [9]:
# Read the two transaction extracts from the Data_Detailed folder.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for the
# second path (it ends in ".csv.csv") - confirm the real file name on disk.
# Only the final expression of a cell is rendered, so the bare `train` below
# produces no output.
train = pd.read_csv(filepath_or_buffer='Data_Detailed/frauddata_detailed.csv')
train
test = pd.read_csv(filepath_or_buffer='Data_Detailed/fraudDetailed.csv.csv')
test

FileNotFoundError: [Errno 2] No such file or directory: 'Data_Detailed/fraudDetailed.csv.csv'

In [None]:
def details(df):
    """Summarise every column of ``df`` in a single table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns 'Missing Values'
        (null count), '% of Total Missing Values' (null count as a
        percentage of ``len(df)``), 'Data_Type' (dtype) and
        'Unique values' (``nunique`` of the column).
    """
    null_counts = df.isnull().sum()

    # The four unnamed Series become columns 0..3 after the concat; they are
    # given readable headers in the rename below.
    summary = pd.concat(
        [
            null_counts,
            100 * (null_counts / len(df)),
            df.dtypes,
            df.nunique(),
        ],
        axis=1,
    )

    headers = {
        0: 'Missing Values',
        1: '% of Total Missing Values',
        2: 'Data_Type',
        3: 'Unique values',
    }
    return summary.rename(columns=headers)

In [None]:
def _prepare_transactions(raw):
    """Return a cleaned copy of a raw transactions frame.

    Steps, applied identically to the train and test extracts:
      * drop the first column by position,
      * combine 'first' and 'last' into 'Full_Name' and drop both,
      * parse 'trans_date_trans_time', 'dob' and 'unix_time' as datetimes
        (unparseable values become NaT),
      * add 'Transaction_Date' (timestamp truncated to midnight) and 'age'
        (transaction year minus birth year), then drop 'dob'.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame as returned by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw`` is not modified.
    """
    # .copy() makes the result independent of `raw`, so the column assignments
    # below do not operate on a slice (avoids SettingWithCopyWarning).
    df = raw.iloc[:, 1:].copy()

    df['Full_Name'] = df['first'] + ' ' + df['last']
    # Keyword form: the positional `axis` argument of DataFrame.drop is not
    # accepted by pandas 2.x.
    df = df.drop(columns=['first', 'last'])

    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
    birth_date = pd.to_datetime(df['dob'], errors='coerce')
    # Without a unit, numeric input is read as nanoseconds since the epoch.
    # NOTE(review): assumes 'unix_time' holds epoch *seconds* - confirm against the data.
    df['unix_time'] = pd.to_datetime(df['unix_time'], unit='s', errors='coerce')

    # Midnight of the transaction day, kept as datetime64.
    df['Transaction_Date'] = df['trans_date_trans_time'].dt.normalize()
    # Difference of calendar years only; whether the birthday has already
    # passed in the transaction year is not taken into account.
    df['age'] = df['Transaction_Date'].dt.year - birth_date.dt.year

    return df.drop(columns=['dob'])


df_train = _prepare_transactions(train)
df_test = _prepare_transactions(test)

In [None]:
def _add_calendar_features(df):
    """Add calendar-part columns derived from 'trans_date_trans_time' to ``df``.

    The frame is modified in place (same as the original cell) and nothing is
    returned. Columns added, in order: 'weekday_no', 'week_day', 'week_no',
    'day_no', 'min_day', 'hr_day', 'month_name', 'month', 'year', 'year_dayno'.
    """
    ts = df['trans_date_trans_time']

    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is its replacement. It returns a nullable UInt32
    # column, so convert back to the plain dtype .dt.week used to give:
    # float64 when timestamps are missing, int64 otherwise.
    iso_week = ts.dt.isocalendar().week
    if iso_week.isna().any():
        week_no = iso_week.astype('float64')
    else:
        week_no = iso_week.astype('int64')

    df['weekday_no'] = ts.dt.dayofweek
    df['week_day'] = ts.dt.day_name()
    df['week_no'] = week_no
    df['day_no'] = ts.dt.day
    df['min_day'] = ts.dt.minute      # minute within the hour
    df['hr_day'] = ts.dt.hour
    df['month_name'] = ts.dt.month_name()
    df['month'] = ts.dt.month
    df['year'] = ts.dt.year
    df['year_dayno'] = ts.dt.dayofyear


_add_calendar_features(df_train)
_add_calendar_features(df_test)

In [None]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in kilometres between two points
    given in decimal degrees.

    Arguments are passed longitude first, latitude second, for each point.
    Scalars and array-likes are both accepted; array-like arguments must be
    of equal length.
    """
    # Degrees -> radians for all four inputs.
    lon_a, lat_a, lon_b, lat_b = (np.radians(v) for v in (lon1, lat1, lon2, lat2))

    half_dlat = (lat_b - lat_a) / 2.0
    half_dlon = (lon_b - lon_a) / 2.0

    # Haversine term.
    hav = np.sin(half_dlat)**2 + np.cos(lat_a) * np.cos(lat_b) * np.sin(half_dlon)**2

    # Central angle scaled by the Earth radius used here (6371 km).
    return 6371 * (2 * np.arcsin(np.sqrt(hav)))

In [None]:
# haversine_np takes longitude first, latitude second, for every point.

# Distance (km) from the cardholder coordinates stored under index label 0 to
# the cardholder coordinates of each later row. Label 0 is absent from the
# .loc[1:] operands, so that row is left as NaN by index alignment.
df_train['dist_same_origin_co-ord(kms)'] = haversine_np(
    df_train['long'][0], df_train['lat'][0],
    df_train['long'].loc[1:], df_train['lat'].loc[1:],
)

# Distance (km) between the cardholder coordinates of each row and those of
# the row directly before it (shift() moves every value down by one row).
df_train['distance_successive_co-ord(kms)'] = haversine_np(
    df_train['long'].shift(), df_train['lat'].shift(),
    df_train['long'].loc[1:], df_train['lat'].loc[1:],
)

# Distance (km) between the cardholder and merchant coordinates of the same row.
df_train['distance_people_to_merchant_co-ord(kms)'] = haversine_np(
    df_train['long'], df_train['lat'],
    df_train['merch_long'], df_train['merch_lat'],
)

In [None]:
# haversine_np takes longitude first, latitude second, for every point.

# Distance (km) from the cardholder coordinates stored under index label 0 to
# the cardholder coordinates of each later row. Label 0 is absent from the
# .loc[1:] operands, so that row is left as NaN by index alignment.
df_test['dist_same_origin_co-ord(kms)'] = haversine_np(
    df_test['long'][0], df_test['lat'][0],
    df_test['long'].loc[1:], df_test['lat'].loc[1:],
)

# Distance (km) between the cardholder coordinates of each row and those of
# the row directly before it (shift() moves every value down by one row).
df_test['distance_successive_co-ord(kms)'] = haversine_np(
    df_test['long'].shift(), df_test['lat'].shift(),
    df_test['long'].loc[1:], df_test['lat'].loc[1:],
)

# Distance (km) between the cardholder and merchant coordinates of the same row.
df_test['distance_people_to_merchant_co-ord(kms)'] = haversine_np(
    df_test['long'], df_test['lat'],
    df_test['merch_long'], df_test['merch_lat'],
)

In [None]:
# Time gap between each transaction and the one on the previous row
# (the first row has no predecessor, so it is NaT).
df_train['delta_time'] = df_train['trans_date_trans_time'].diff()

# The same gap expressed in seconds.
df_train['delta_time(sec)'] = df_train['delta_time'].dt.total_seconds()

# Time elapsed since the timestamp of the first row.
dt = pd.to_datetime(df_train['trans_date_trans_time'].iat[0])
df_train['delta_time_elapsed'] = df_train['trans_date_trans_time'].sub(dt)

# Whole-day component of the elapsed time.
df_train['delta_time_elapsed(days)'] = df_train['delta_time_elapsed'].dt.days

In [None]:
# Time gap between each transaction and the one on the previous row
# (the first row has no predecessor, so it is NaT).
df_test['delta_time'] = df_test['trans_date_trans_time'].diff()

# The same gap expressed in seconds.
df_test['delta_time(sec)'] = df_test['delta_time'].dt.total_seconds()

# Time elapsed since the timestamp of the first row.
dt = pd.to_datetime(df_test['trans_date_trans_time'].iat[0])
df_test['delta_time_elapsed'] = df_test['trans_date_trans_time'].sub(dt)

# Whole-day component of the elapsed time.
df_test['delta_time_elapsed(days)'] = df_test['delta_time_elapsed'].dt.days

In [None]:
# Replace the missing values in the row-to-row features with zero
# (a zero-length timedelta for the 'delta_time' column).
df_train['dist_same_origin_co-ord(kms)'] = df_train['dist_same_origin_co-ord(kms)'].fillna(0.0)
df_train['distance_successive_co-ord(kms)'] = df_train['distance_successive_co-ord(kms)'].fillna(0.0)
df_train['delta_time'] = df_train['delta_time'].fillna(pd.to_timedelta('0 days 00:00:00'))
df_train['delta_time(sec)'] = df_train['delta_time(sec)'].fillna(0.0)

In [None]:
# Replace the missing values in the row-to-row features with zero
# (a zero-length timedelta for the 'delta_time' column).
df_test['dist_same_origin_co-ord(kms)'] = df_test['dist_same_origin_co-ord(kms)'].fillna(0.0)
df_test['distance_successive_co-ord(kms)'] = df_test['distance_successive_co-ord(kms)'].fillna(0.0)
df_test['delta_time'] = df_test['delta_time'].fillna(pd.to_timedelta('0 days 00:00:00'))
df_test['delta_time(sec)'] = df_test['delta_time(sec)'].fillna(0.0)

In [None]:
# Split each dataset by the is_fraud label (0 = non-fraud rows, 1 = fraud rows).
non_fraud_train = df_train.loc[df_train['is_fraud'].eq(0)]
fraud_train = df_train.loc[df_train['is_fraud'].eq(1)]

non_fraud_test = df_test.loc[df_test['is_fraud'].eq(0)]
fraud_test = df_test.loc[df_test['is_fraud'].eq(1)]

In [None]:
# Mean transaction amount per month name among the non-fraud training rows.
non_fraud_train['amt'].groupby(non_fraud_train['month_name']).mean()

In [None]:
# Mean transaction amount per month name among the fraud training rows.
fraud_train['amt'].groupby(fraud_train['month_name']).mean()

In [None]:
# Ten largest amounts among the fraud training rows.
fraud_train.loc[fraud_train['is_fraud'] == 1, 'amt'].sort_values(ascending=False).head(10)

In [None]:
# Total fraud amount for every (year, category) pair in the training data.
(
    fraud_train
    .groupby(by=["year", 'category'])['amt']
    .agg(['sum'])
)

In [None]:
# Ten card numbers with the highest total fraud amount.
(
    fraud_train
    .groupby(['cc_num'])['amt']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten cardholder names with the highest mean 'dist_same_origin_co-ord(kms)'
# among the fraud training rows.
(
    fraud_train
    .groupby(['Full_Name'])['dist_same_origin_co-ord(kms)']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten merchants with the highest mean 'dist_same_origin_co-ord(kms)'
# among the fraud training rows.
(
    fraud_train
    .groupby(['merchant'])['dist_same_origin_co-ord(kms)']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten cities with the highest mean fraud amount.
(
    fraud_train
    .groupby(['city'])['amt']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Ten jobs with the highest mean fraud amount.
(
    fraud_train
    .groupby(['job'])['amt']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

In [None]:
# Share of each is_fraud class in the training data, in percent (2 decimals).
df_train['is_fraud'].value_counts(normalize=True).mul(100).round(2).plot.bar(figsize=(6, 4))


In [None]:
# Share of each is_fraud class in the test data, in percent (2 decimals).
df_test['is_fraud'].value_counts(normalize=True).mul(100).round(2).plot.bar(figsize=(6, 4))


In [None]:
# Bar plot of transaction amount for each is_fraud class (training data).
sns.barplot(x='is_fraud', y='amt', data=df_train)


In [None]:
# Individual transaction amounts for each is_fraud class (training data).
sns.stripplot(x='is_fraud', y='amt', data=df_train)

In [None]:
# Number of fraud transactions per gender, largest first.
(
    df_train
    .groupby('gender')['is_fraud']
    .sum()
    .sort_values(ascending=False)
    .plot.bar()
)

In [None]:
# Number of fraud transactions per day of the week, largest first.
(
    fraud_train
    .groupby('week_day')['is_fraud']
    .sum()
    .sort_values(ascending=False)
    .plot.bar(figsize=(20, 5))
)

In [None]:
# Number of fraud transactions per month name, largest first.
(
    fraud_train
    .groupby('month_name')['is_fraud']
    .sum()
    .sort_values(ascending=False)
    .plot.bar(figsize=(20, 5))
)

In [None]:
# Hour-of-day distribution of non-fraud (green) versus fraud (red) transactions.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version still provides it.
plt.figure(figsize=(12, 5))
sns.distplot(df_train.loc[df_train['is_fraud'] == 0, "hr_day"], color='g')
sns.distplot(df_train.loc[df_train['is_fraud'] == 1, "hr_day"], color='r')
plt.title('Fraud x Normal Transactions by Hours', fontsize=17)


In [None]:
# Amount versus time-of-transaction for fraud and normal transactions.
# The titles now name the variable that is actually on the x axis
# (week_no = week of the year, min_day = minute within the hour).
# NOTE(review): if day-of-month / hour-of-day plots were intended instead,
# switch x to 'day_no' / 'hr_day' and adjust the titles accordingly.

ax = sns.lmplot(y="amt", x="week_no", fit_reg=False, aspect=1.8, data=df_train, hue='is_fraud')
plt.title("Amounts by week of Frauds and Normal Transactions", fontsize=16)

ax = sns.lmplot(y="amt", x="min_day", fit_reg=False, aspect=1.8, data=df_train, hue='is_fraud')
plt.title("Amounts by minute of Frauds and Normal Transactions", fontsize=16)

In [None]:
# Independent copies used for the outlier inspection below, so the engineered
# frames themselves are not touched.
df_train_outlier, df_test_outlier = df_train.copy(), df_test.copy()

In [None]:
# Descriptive statistics (with extra percentiles) for the numeric features,
# every value rendered as a string with five decimals.
(
    df_train_outlier.loc[:, [
        'amt', 'city_pop', 'age',
        'weekday_no', 'week_no', 'day_no', 'min_day', 'hr_day',
        'month', 'year', 'year_dayno',
        'dist_same_origin_co-ord(kms)',
        'distance_successive_co-ord(kms)',
        'distance_people_to_merchant_co-ord(kms)',
        'delta_time(sec)',
        'delta_time_elapsed(days)',
    ]]
    .describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
    .apply(lambda column: column.map('{0:.5f}'.format))
)

In [None]:
# Numeric columns inspected for outliers.
col_data1 = ['amt','city_pop','age','weekday_no','week_no','day_no','min_day','hr_day','month','year','year_dayno','dist_same_origin_co-ord(kms)','distance_successive_co-ord(kms)','distance_people_to_merchant_co-ord(kms)','delta_time(sec)','delta_time_elapsed(days)']

# One box plot per column on a 4x4 grid (16 columns -> 16 panels).
plt.figure(figsize=(30,20))
for panel_no, column in enumerate(col_data1, start=1):
    plt.subplot(4, 4, panel_no)
    # Pass the values as x= explicitly: the meaning of seaborn's first
    # positional argument differs between releases (x in older versions,
    # data in newer ones), which changes the orientation of the plot.
    sns.boxplot(x=df_train_outlier[column])

In [None]:
# Stratified split: 60% of the rows for training, the rest for validation.
# NOTE(review): X and y are not assigned in any cell visible here - confirm
# where the feature matrix and target are built before this cell runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.6,
    stratify=y,
    random_state=23,
)

In [None]:
def _transformed_like(values, template):
    """Return ``values`` wrapped like ``template``.

    When ``template`` is a DataFrame the transformed array gets its index and
    column labels back; otherwise the array is returned unchanged.
    """
    if isinstance(template, pd.DataFrame):
        return pd.DataFrame(values, index=template.index, columns=template.columns)
    return values


# Fit the power transform on the training split only, then apply the fitted
# transform to the other splits.
# The results are assigned back explicitly: copy=False does not guarantee that
# the inputs are modified in place (e.g. for DataFrame input), so discarding
# the return values could leave the data untransformed.
# NOTE(review): X_test is not assigned in any cell visible here - confirm.
pt = PowerTransformer(copy=False)
X_train = _transformed_like(pt.fit_transform(X_train), X_train)
X_valid = _transformed_like(pt.transform(X_valid), X_valid)
X_test = _transformed_like(pt.transform(X_test), X_test)

In [None]:
# Empty result collectors, one independent list each:
#   model     - models
#   resample  - balancing of the imbalanced data
#   precision - precision
#   recall    - recall
#   F1score   - F1-score
#   AUCROC    - area under the ROC curve
model, resample, precision, recall, F1score, AUCROC = ([] for _ in range(6))

In [None]:
# Logistic Regression
# The penalty grid searched for this model (params_LR) contains both 'l1' and
# 'l2'. The default 'lbfgs' solver rejects 'l1', which makes those search
# candidates fail; 'liblinear' supports both penalties.
model_LR = LogisticRegression(solver='liblinear')
# Random Forest (out-of-bag scoring enabled, fixed seed for reproducibility)
model_RF = RandomForestClassifier(oob_score = True, random_state=23)

In [None]:
# Search space for logistic regression: 10 values of C spaced logarithmically
# between 1e-1 and 1e5, with and without class balancing, both penalties.
params_LR = {'C':np.logspace(-1, 5, 10), 'class_weight':[None,'balanced'], 'penalty':['l1','l2']}


# Search space for a decision tree.
params_DT = {
    'max_depth': [10, 20, 50, 100, 200],
    'min_samples_leaf': [10, 20, 50, 100, 200],
    'min_samples_split' : [10, 20, 50, 100, 200],
    'criterion': ["gini", "entropy"]
} 


# Not able to run hyperparameters for Random Forest because of crashing, hence choosing limited parameters
params_RF = {    
    'n_estimators': [50],
    'max_depth': [50],
    'min_samples_leaf': [200],
    'min_samples_split' : [100],
    'criterion': ["gini"]
}

In [None]:
def model_fit_evaluation(model_model, params, X_train, y_train, X_test, y_test, algo=None, sampling=None):
    """Tune a classifier with a randomized search and print evaluation metrics.

    A ``RandomizedSearchCV`` (5-fold, scored by ROC AUC, seed 23) is fitted on
    the training data; the best estimator is then evaluated on the evaluation
    data and the results are printed.

    Parameters
    ----------
    model_model : estimator
        Unfitted scikit-learn classifier to tune.
    params : dict
        Parameter distributions/lists handed to ``RandomizedSearchCV``.
    X_train, y_train
        Data the search is fitted on.
    X_test, y_test
        Data the best estimator is evaluated on.
    algo, sampling : optional
        Accepted for labelling purposes but not used inside this function.

    Returns
    -------
    None
        All results are printed: best estimator / parameters / CV score,
        confusion matrix, classification report and ROC AUC.
    """
    rcv = RandomizedSearchCV(model_model, params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=23)
    rcv.fit(X_train, y_train)
    best_model = rcv.best_estimator_

    print('\n')
    print('best estimator : ', best_model)
    print('best parameters: ', rcv.best_params_)
    print('best score: ', rcv.best_score_)
    print('\n')

    # Predictions of the tuned model on the evaluation data.
    y_test_prob = best_model.predict_proba(X_test)
    y_test_pred = best_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test, y_test_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test, y_test_pred),"\n")
    print('ROC AUC')
    print('='*60)
    # Second probability column = second entry of the estimator's classes_.
    # NOTE(review): assumes a binary target whose positive class is listed last - confirm.
    print(roc_auc_score(y_test, y_test_prob[:, 1]),"\n")

In [None]:
# Tune and evaluate logistic regression; the validation split is used as the
# evaluation set.
model_fit_evaluation(
    model_model=model_LR,
    params=params_LR,
    X_train=X_train,
    y_train=y_train,
    X_test=X_valid,
    y_test=y_valid,
    algo='Logistic Regression',
    sampling='actual',
)

In [None]:
# Rebuild the per-class subsets of both datasets from the is_fraud label.
non_fraud_train = df_train.query('is_fraud == 0')
fraud_train = df_train.query('is_fraud == 1')

non_fraud_test = df_test.query('is_fraud == 0')
fraud_test = df_test.query('is_fraud == 1')

In [None]:
# Average number of transactions per month.
# The divisor is the number of distinct calendar months actually present in
# each dataset rather than a fixed 12, so the figure stays correct when a
# dataset does not span exactly one year. max(..., 1) guards an empty frame.
n_months_train = max(df_train['trans_date_trans_time'].dt.to_period('M').nunique(), 1)
n_months_test = max(df_test['trans_date_trans_time'].dt.to_period('M').nunique(), 1)

Avg_no_transactions_per_month_train = len(df_train) / n_months_train
Avg_no_transactions_per_month_test = len(df_test) / n_months_test

# Only the last expression of a cell is rendered, so show both values together.
Avg_no_transactions_per_month_train, Avg_no_transactions_per_month_test

In [None]:
# Average number of fraud transactions per month.
# The divisor is the number of distinct calendar months covered by the *full*
# dataset (months without any fraud still count) rather than a fixed 12.
# max(..., 1) guards an empty frame.
n_months_train = max(df_train['trans_date_trans_time'].dt.to_period('M').nunique(), 1)
n_months_test = max(df_test['trans_date_trans_time'].dt.to_period('M').nunique(), 1)

Avg_no_fraud_transactions_per_month_train = len(fraud_train) / n_months_train
Avg_no_fraud_transactions_per_month_test = len(fraud_test) / n_months_test

# Only the last expression of a cell is rendered, so show both values together.
Avg_no_fraud_transactions_per_month_train, Avg_no_fraud_transactions_per_month_test

In [None]:
# Mean amount of a fraud transaction in each dataset.
# Only the final expression (the test value) is rendered as the cell output.
Avg_amount_per_fraud_transactions_train = fraud_train.loc[:, 'amt'].mean()
Avg_amount_per_fraud_transactions_train
Avg_amount_per_fraud_transactions_test = fraud_test.loc[:, 'amt'].mean()
Avg_amount_per_fraud_transactions_test