In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import display, HTML
from sklearn.externals import joblib
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle
import os
from joblib import dump, load
from google.colab import drive
from sklearn.model_selection import train_test_split
from tqdm import tqdm
pd.set_option('display.width', 500)
pd.options.display.max_rows
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def final_model(X):

  '''
    Function to Train a model, Save trained model and predict the values on the model.
    All the Data preprocessing, cleaning, prediction is at one place.
    Expected input is the raw data.
    Expected output is the list of predicted values.  
  '''

  clean_data(X)
  preprocess_data(X)
  return do_predict(X)


def clean_data(X):

  '''
  Function to clean the raw data.
  Any missing values will be filled, less/high importance values are removed or grouped as others.
  This function is subjective to modify depend onthe observations on EDA.

  X : DataFrame containing raw or cleaned data. 
  '''

  print('## Data Cleaning - filling missing values for categorical data')
  for cat in cat_feat :
    if cat in X:
      X[cat].fillna(MISSING, inplace=True)
      X[cat] = X[cat].astype(str)

  # Numerical features : fill missing values with median.
  print('## Data Cleaning - filling missing values for numerical data')
  for cat in [col for col in X.columns if col not in cat_feat]:
    X[cat].fillna(X[cat].median(), inplace=True)

  # All the versions of the same OS categories are grouped together.
  # Here we are not taking advantage of the versions specified.
  X.loc[X['id_30'].str.contains('Windows', na=False), 'id_30'] = 'Windows'
  X.loc[X['id_30'].str.contains('iOS', na=False), 'id_30'] = 'iOS'
  X.loc[X['id_30'].str.contains('Mac OS', na=False), 'id_30'] = 'Mac'
  X.loc[X['id_30'].str.contains('Android', na=False), 'id_30'] = 'Android'

  #All the versions of the same Web browser categories are grouped together.
  #Here we are not taking advantage of the versions specified.
  X.loc[X['id_31'].str.contains('chrome', na=False), 'id_31'] = 'Chrome'
  X.loc[X['id_31'].str.contains('firefox', na=False), 'id_31'] = 'Firefox'
  X.loc[X['id_31'].str.contains('safari', na=False), 'id_31'] = 'Safari'
  X.loc[X['id_31'].str.contains('edge', na=False), 'id_31'] = 'Edge'
  X.loc[X['id_31'].str.contains('ie', na=False), 'id_31'] = 'IE'
  X.loc[X['id_31'].str.contains('samsung', na=False), 'id_31'] = 'Samsung'
  X.loc[X['id_31'].str.contains('opera', na=False), 'id_31'] = 'Opera'


def preprocess_data(X):

  '''
  Function to convert feature specific values to machine understandable language.
  Categorical features are converted to labels by encoding through LabelEncoder.

  X : DataFrame.
  '''
  print('## Data Preprocessing - loading labelencoder')
  for col in [col for col in cat_feat if col in X.columns]:
    file = open(data_path+col+enc_file, 'rb')
    label_encoder = pickle.load(file)
    file.close()
    X[col] = label_encoder.transform(list(X[col].values))
    del label_encoder

  print('## Data Preprocessing - loading minmaxscalar')
  file = open(data_path+scal_file, 'rb')
  scaler = pickle.load(file)
  file.close()
  numerical_columns = [col for col in X.columns if col not in cat_feat]
  X[numerical_columns] = scaler.transform(X[numerical_columns])

  return


def do_predict(X):
  '''
  Function to predict and return the values as list.
  if there is no model trained and saved to predict, train and save the model.

  X : Data Frame containing values in model understandable data to predict.
  '''

  from sklearn.externals import joblib
  import lightgbm as lgb
  # load model
  print('## Predicting - Load model')
  clf = joblib.load(data_path+model_file)
  print('## Predicting - predicting values')
  return clf.predict(X)


def train_model(X, Y) :

  '''
  Function to train the model. 
  X, Y are expected to be raw data for training. 
  cleaning the data, preprocessing, training and saving the final model.
  '''
  
  clean_data(X)
  # if there is no encoder file, create file.
  
  if (not os.path.exists(data_path+enc_file)):
    print('## Preprocessing - labelencoding categorical features')
    for col in [c for c in cat_feat if c in X.columns] : 
      label_encoder = LabelEncoder()
      val = list(X[col].values)
      val.append(MISSING)
      val.append(Others)
      print('## In Training the model : categorical column type : ', type(val[0]))
      print(list(set(val)))
      label_encoder.fit(list(set(val)))
      X[col] = label_encoder.transform(list(X[col].values))
      output = open(data_path+col+enc_file, 'wb')
      pickle.dump(label_encoder, output)
      output.close()

  
  # if there is no scaling file, create file.
  if (not os.path.exists(data_path+scal_file)):
    print('## Preprocessing - Scaling numerical features')
    scaler = MinMaxScaler()
    numerical_columns = [col for col in X.columns if col not in cat_feat]
    X[numerical_columns] = scaler.fit_transform(X[numerical_columns])
    
    output = open(data_path+scal_file, 'wb')
    pickle.dump(scaler, output)
    output.close()
  
  if (not os.path.exists(data_path+model_file)):
    print('## Training - training the model')
    # train model if model is not saved yet.
    d_train = lgb.Dataset(X, label=Y)
    params = {}
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'binary'
    params['metric'] = 'auc'
    params['sub_feature'] = 0.5
    params['min_data'] = 50
    params['max_depth'] = 10
    clf = lgb.train(params, d_train, 100)
    # save model
    print('Saving the model')
    joblib.dump(clf, data_path+model_file)


def metric(y_pred, y_actual):
  '''
    Function calculate the roc auc score and return the same
    Input : predicted values and actual values 
    output : roc auc score

  '''
  from sklearn.metrics import roc_auc_score
  return roc_auc_score(y_actual, y_pred)
  



## Load Data From Google Drive

In [2]:
drive.mount('/content/drive')
data_path_ = 'drive/My Drive/Case study/Fraud-Detection/data/'
train_df = pd.read_csv(f'{data_path_}train_df.csv', dtype={'card3': float,'card5': float})
test_df = pd.read_csv(f'{data_path_}test_df.csv', dtype={'card3': float,'card5': float})

# make datatypes 

# Constants
cat_feat = ['ProductCD', 'card1', 'card2', 'card3','card4', 'card5', 'card6', 'addr1', 'addr2', 'P_emaildomain',
              'R_emaildomain', 'M1', 'M2', 'M3','M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'id_12', 'id_13', 'id_14', 'id_15', 
              'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 
              'id_28', 'id_29', 'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 
              'DeviceType', 'DeviceInfo',]
# path to the case study folder
data_path= 'drive/My Drive/Case study/Fraud-Detection/'
# Trained Model save file name 
model_file = 'save_model.pkl'
# LabelEncoder save base file name after fitting on trained data.
enc_file = 'encoder_file.pkl'
# MinmaxScalar save base file name after fitting on trained data.
scal_file = 'scaler_file.pkl'
# Constant used to label nan values in categorical features.
MISSING = 'missing'
Others = "Others"

# Dividing train data into train and cv with 80:10 ratio
y = train_df['isFraud']
train_df.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1, inplace=True)
test_df.drop(columns=['TransactionID', 'TransactionDT'], inplace=True)
train_df, cv_df, y_train, y_cv = train_test_split(train_df, y, test_size=0.10, random_state=42, stratify=y)

# We found unknown categories apart from train data.
# We need to find and make those categories as a missing category 
# because after training the model doesn't know the category.
def makeMissingCatIfNotPresent(train_val, val):
  if val in train_val:
    return str(val)
  else :
      return MISSING
# check if category values are there in train data else make it as missing category.
for col in tqdm(cat_feat) :
  uniques = train_df[col].unique()
  train_df[col] = train_df[col].apply(lambda x : makeMissingCatIfNotPresent(uniques, x))
  test_df[col] = test_df[col].apply(lambda x : makeMissingCatIfNotPresent(uniques, x))
  cv_df[col] = cv_df[col].apply(lambda x : makeMissingCatIfNotPresent(uniques, x))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|██████████| 49/49 [05:43<00:00, 20.12s/it]


In [3]:
# Train model
if not os.path.exists(data_path+model_file):
  train_model(train_df.copy(deep=True), y_train)

# Metrics using final model function.

print('\n' + '*'*25 + 'Predicting for Train data' + '*'*25)
train_metric =  metric(final_model(train_df.copy(deep=True)), y_train)

print('\n' + '*'*25 + 'Predicting for CV data' + '*'*25)
cv_metric =  metric(final_model(cv_df.copy(deep=True)), y_cv)

print('\n')
print('Train metric : ', train_metric)
print('CV metric : ',cv_metric)


## Data Cleaning - filling missing values for categorical data
## Data Cleaning - filling missing values for numerical data
## Preprocessing - labelencoding categorical features
## In Training the model : categorical column type :  <class 'str'>
['missing', 'H', 'Others', 'C', 'W', 'S', 'R']
## In Training the model : categorical column type :  <class 'str'>
['18005', '17066', '11667', '3984', '10271', '6505', '6964', '3132', '15229', '2459', '1659', '2906', '6306', '2295', '10513', '8751', '12984', '9856', '16228', '15423', '1090', '13497', '15439', '9505', '16448', '8926', '10511', '16436', '11816', '11653', '9066', '10474', '10022', '15097', '9527', '1608', '1790', '1488', '11664', '13729', '4556', '13765', '14691', '7185', '13509', '9613', '1337', '16869', '5347', '4619', '16205', '2194', '2268', '16009', '8963', '5128', '6805', '16879', '14973', '3640', '12648', '7576', '14310', '4023', '11405', '6297', '3808', '16025', '11827', '2472', '9227', '7077', '11030', '17439', '13302', '

In [4]:
# Predict Test values.
print('\n' + '*'*25 + 'Predicting for Test data' + '*'*25)
test_predict = final_model(test_df.copy(deep=True))

# update sample submission file
sample_df = pd.read_csv(data_path_ + 'sample_submission.csv')
sample_df['isFraud'] = test_predict
print('Sample test predictions')
sample_df.to_csv(data_path_ + 'sample_submission.csv', index=False)

# Display Sample test results
display(sample_df.head(50))


*************************Predicting for Test data*************************
## Data Cleaning - filling missing values for categorical data
## Data Cleaning - filling missing values for numerical data
## Data Preprocessing - loading labelencoder
## Data Preprocessing - loading minmaxscalar
## Predicting - Load model
## Predicting - predicting values
Sample test predictions


Unnamed: 0,TransactionID,isFraud
0,3663549,0.004229
1,3663550,0.007167
2,3663551,0.014773
3,3663552,0.003354
4,3663553,0.004792
5,3663554,0.005201
6,3663555,0.029364
7,3663556,0.031288
8,3663557,0.001784
9,3663558,0.011448


In [6]:
print('*'*50 + 'The End' + '*'*50)

**************************************************The End**************************************************
