<a href="https://colab.research.google.com/github/lferncastro/Credit-Card-Fraud-Detection/blob/master/Fraud_prediction_using_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up the environment

In [0]:
!pip install -q kaggle
!pip install -q kaggle-cli
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!mkdir -p /content/drive/'My Drive'/datasets;cd /content/drive/'My Drive'/datasets; ls 
from pathlib import Path
kaggle_key = Path('/content/drive/My Drive/datasets/kaggle.json')
if kaggle_key.exists():
    !mkdir -p ~/.kaggle
    !cp /content/drive/'My Drive'/datasets/kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    !ls ~/.kaggle
else:
    print('You need to upload the file kaggle.json')
    from google.colab import files
    uploaded = files.upload() 
    for fn in uploaded.keys():
        print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(uploaded[fn])))
    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !mv kaggle.json /content/drive/'My Drive'/datasets/
    !chmod 600 ~/.kaggle/kaggle.json
    !ls ~/.kaggle

In [0]:
!kaggle competitions download -c ieee-fraud-detection -p /content/drive/'My Drive'/datasets/
!cd /content/drive/'My Drive'/datasets/; unzip \*.zip
!rm /content/drive/'My Drive'/datasets/*.zip
!rm /content/drive/'My Drive'/datasets/*.7z
!ls /content/drive/'My Drive'/datasets/*.csv

This notebook is based on the work found on https://github.com/kapilnchauhan77/Credit-card-fraud-detector. All credits to the original creator.

# Importing Libraries

In [0]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os

# Data Processing For Visualization

In [0]:
# Set mode 644 (owner read/write, others read) on the four competition CSVs.
!chmod 644 /content/drive/'My Drive'/datasets/train_transaction.csv
!chmod 644 /content/drive/'My Drive'/datasets/train_identity.csv
!chmod 644 /content/drive/'My Drive'/datasets/test_transaction.csv
!chmod 644 /content/drive/'My Drive'/datasets/test_identity.csv
# Load the transaction and identity tables for the train and test splits.
# Every read passes low_memory=False.
train_trans = pd.read_csv('/content/drive/My Drive/datasets/train_transaction.csv', low_memory=False)
train_ident = pd.read_csv('/content/drive/My Drive/datasets/train_identity.csv', low_memory=False)
test_trans = pd.read_csv('/content/drive/My Drive/datasets/test_transaction.csv' , low_memory=False)
test_ident = pd.read_csv('/content/drive/My Drive/datasets/test_identity.csv', low_memory=False)

In [0]:
# Left-join the identity table onto the transaction table for each split, so
# every transaction row is kept whether or not it has identity data.
train = train_trans.merge(train_ident, how='left', on='TransactionID')
test = test_trans.merge(test_ident, how='left', on='TransactionID')

In [0]:
# Drop the four per-file frames now that they have been merged into train/test.
del train_trans, train_ident, test_trans, test_ident

In [0]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16, float32 and float64. The first type whose
    [min, max] limits contain the column's observed minimum and maximum
    (bounds inclusive) is used. Columns of any other dtype are left alone, as
    are columns whose min/max are NaN or infinite (no candidate range
    contains them).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When False, float columns are never narrowed below float32. float16
        keeps only about three significant decimal digits, so downcasting to
        it rounds the stored values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting. A one-line summary of the
        new memory footprint is printed as a side effect.
    """
    numv = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_types = (np.int8, np.int16, np.int32, np.int64)
    if use_float16:
        float_types = (np.float16, np.float32, np.float64)
    else:
        float_types = (np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        vt = df[col].dtypes
        if vt not in numv:
            continue
        max_c = df[col].max()
        min_c = df[col].min()
        if str(vt)[:3] == 'int':
            candidates, limits_of = int_types, np.iinfo
        else:
            candidates, limits_of = float_types, np.finfo
        for target in candidates:
            limits = limits_of(target)
            # Inclusive comparison: a column spanning exactly -128..127 fits int8.
            if min_c >= limits.min and max_c <= limits.max:
                df[col] = df[col].astype(target)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting footprint.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [9]:
# Downcast the numeric columns of both merged frames; reduce_mem_usage prints
# the resulting memory footprint for each.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Mem. usage decreased to 650.48 Mb (66.8% reduction)
Mem. usage decreased to 565.37 Mb (66.3% reduction)


In [0]:
# Per-column null counts for train, largest first, and the same as a percentage of rows.
miss_data = train.isnull().sum().sort_values(ascending=False)
miss_per = miss_data / len(train) * 100
# Note: the field named 'Columns' holds the null COUNT; the column names
# themselves are the index of missing_data.
missing_data = pd.concat(
    [miss_data, miss_per],
    axis=1,
    keys=['Columns', 'Missing values percentage'],
)

In [0]:
def delnullcol(dt):
    """Return the names of the columns of ``dt`` in which at least 70% of the rows are null."""
    n_rows = dt.shape[0]
    nullcol = []
    for name in dt.columns:
        null_share = dt[name].isnull().sum() / n_rows
        if null_share >= 0.7:
            nullcol.append(name)
    return nullcol

In [0]:
# Columns of train whose most frequent value (NaN counted as a value) covers
# at least 90% of the rows.
rep_vals = []
for name in train.columns:
    top_share = train[name].value_counts(dropna=False, normalize=True).values[0]
    if top_share >= 0.9:
        rep_vals.append(name)
# For each of those columns, the number of rows holding that most frequent value.
cols = [train[name].value_counts(dropna=False).values[0] for name in rep_vals]

In [0]:
def repcols(dt):
    """Return the columns of ``dt`` whose most frequent value fills at least 90% of the rows.

    NaN is counted as a value (``dropna=False``), so a column that is 90% null
    is reported as well.
    """
    rep_vals = []
    for name in dt.columns:
        shares = dt[name].value_counts(dropna=False, normalize=True)
        # value_counts sorts by frequency, so the first entry is the top share.
        if shares.values[0] >= 0.9:
            rep_vals.append(name)
    return rep_vals

In [0]:
def useless_cols(dt, exep):
    """List the columns of ``dt`` that are mostly null or dominated by one value.

    Combines the results of ``delnullcol`` and ``repcols`` and prints how many
    columns each check flagged.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame to inspect.
    exep : column label
        Column that must never be returned (e.g. the target). It is simply
        left out; it does not have to be among the flagged columns.

    Returns
    -------
    list
        Flagged column labels without ``exep``, each once, in the order they
        appear in ``dt.columns``.
    """
    null_cols = delnullcol(dt)
    print("More than 70% null: " + str(len(null_cols)))
    too_many_repeated = repcols(dt)
    print("More than 90% repeated value: " + str(len(too_many_repeated)))
    flagged = set(null_cols) | set(too_many_repeated)
    # discard() is a no-op when exep was not flagged; list.remove() would raise ValueError.
    flagged.discard(exep)
    # Walk dt.columns so the result order is deterministic (set iteration order is not).
    return [col for col in dt.columns if col in flagged]

In [21]:
# Columns of train flagged as mostly null or near-constant; the label
# 'isFraud' is excluded from the drop list.
cols_to_drop = useless_cols(train, 'isFraud')

More than 70% null: 208
More than 90% repeated value: 67


In [0]:
def find_Major_values(dt, threshold):
    """Return the relative frequencies in ``dt`` that are at least ``threshold``.

    Frequencies are computed over non-null entries and come back in
    descending order, as produced by ``value_counts``. Only the frequencies
    are returned, not the values they belong to.
    """
    shares = dt.value_counts(dropna=True, normalize=True).values
    return [share for share in shares if share >= threshold]

In [0]:
def find_Major_Devices(Major_values,dt):
    """Return the values of ``dt`` whose relative frequency appears in ``Major_values``.

    Parameters
    ----------
    Major_values : iterable of float
        Relative frequencies, typically the output of ``find_Major_values``
        for the same series.
    dt : pandas.Series
        Series whose non-null values are counted.

    Returns
    -------
    list
        Matching values in descending-frequency order, each listed once.
        Values tied on the same frequency are all returned, but no longer
        once per tied frequency, which previously duplicated rows in any
        frame built from the result.
    """
    wanted = set(Major_values)
    t = dt.value_counts(dropna = True, normalize = True)
    return [label for label, share in t.items() if share in wanted]

In [0]:
def find_plot(Major_Devices,d,dt):
    """Collect the rows of ``d`` where ``dt`` equals one of ``Major_Devices``.

    Rows are grouped by value, in the order the values are listed. When
    ``Major_Devices`` is empty the integer 10 is returned as a "nothing to
    plot" sentinel, which callers detect by type.
    """
    if len(Major_Devices) == 0:
        return 10
    matches = [d.loc[dt == device] for device in Major_Devices]
    return pd.concat(objs = matches, axis = 0)

## Visualisation

In [0]:
# Apply seaborn's "whitegrid" style to the plots drawn after this cell.
sns.set(style = "whitegrid")

## Plot for missing values in the columns in the training dataset 

In [0]:
# One bar per training column, ordered as in missing_data (most nulls first).
# The column names are the INDEX of missing_data; its 'Columns' field holds
# null counts, so plotting it on x and relabelling the ticks with
# train.columns attached the wrong names to the bars.
plt.figure(figsize=(32,8))
p = sns.barplot(
    x = list(missing_data.index),
    y = missing_data['Missing values percentage'].values,
)
plt.xticks(rotation=90)
p.set(xlabel='Columns', ylabel='Missing values percentage')
p;

## For showing the columns with at least 90% repetitive data


In [0]:
# Bar height = number of rows holding the most frequent value of each
# near-constant column (rep_vals / cols are computed in an earlier cell).
plt.figure(figsize=(32,8))
p2 = sns.barplot(x = rep_vals, y = cols)
# Spelling fixed in the rendered title and axis label.
plt.title("Columns with most repetitive data")
plt.xticks(rotation=90)
p2.set(xlabel='Columns', ylabel='Number of repetitions')
p2;

In [0]:
# Preview the first rows of the merged training frame.
train.head()

## Amount V Fraud 

In [0]:
# Bar plot of TransactionAmt for each value of isFraud.
amnt = sns.barplot(x = 'isFraud', y = 'TransactionAmt', data = train)
plt.title("Amount V Fraud")
amnt.set_xticklabels(['Not Fraud','Fraud'])
# The amount is on the y axis; the original put this label on the x axis.
amnt.set(xlabel='', ylabel='Transaction Amount')
amnt;

## ProductCD V Fraud count

In [0]:
# Name the x column by keyword: recent seaborn treats the first positional
# argument as `data`, which clashes with the explicit data=train.
sns.countplot(x='ProductCD', hue='isFraud', data=train);

## Major types of used Cards V  Number of frauds and not frauds

In [0]:
# For card1..card6: keep only the values that make up at least 5% of the
# non-null entries and plot their fraud / non-fraud counts.
for i in range(1,7):
    column = 'card'+str(i)
    mv1 = find_Major_values(train[column], 0.05)
    md1 = find_Major_Devices(mv1, train[column])
    plothis1 = find_plot(md1, train, train[column])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is rendered.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = column, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of card number "+str(i))
    p4.set(xlabel='card data of card number '+str(i), ylabel='Count')

## Major used C cases V count of fraud 

In [0]:
# For C1..C14: keep only the values that make up at least 5% of the non-null
# entries and plot their fraud / non-fraud counts.
for i in range(1, 15):
    column = 'C' + str(i)
    mv1 = find_Major_values(train[column], 0.05)
    md1 = find_Major_Devices(mv1, train[column])
    plothis1 = find_plot(md1, train, train[column])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is rendered.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = column, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of C" + str(i))
    p4.set(xlabel='C data of C' + str(i), ylabel='Count')

## Major used D cases V count of fraud 

In [0]:
# For D1..D15: keep only the values that make up at least 5% of the non-null
# entries and plot their fraud / non-fraud counts.
for i in range(1, 16):
    column = 'D' + str(i)
    mv1 = find_Major_values(train[column], 0.05)
    md1 = find_Major_Devices(mv1, train[column])
    plothis1 = find_plot(md1, train, train[column])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is rendered.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = column, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of D" + str(i))
    p4.set(xlabel='D data of D' + str(i), ylabel='Count')

## Major used M cases V count of fraud 

In [0]:
# For M1..M9: keep only the values that make up at least 5% of the non-null
# entries and plot their fraud / non-fraud counts.
for i in range(1, 10):
    column = 'M' + str(i)
    mv1 = find_Major_values(train[column], 0.05)
    md1 = find_Major_Devices(mv1, train[column])
    plothis1 = find_plot(md1, train, train[column])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is rendered.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = column, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of M" + str(i))
    p4.set(xlabel='M data of M' + str(i), ylabel='Count')

## Most used IDs V fraud count

In [0]:
# For id_01..id_09: keep only the values that make up at least 5% of the
# non-null entries and plot their fraud / non-fraud counts.
for i in range(1,10):
    column = 'id_0'+str(i)
    mv1 = find_Major_values(train[column], 0.05)
    md1 = find_Major_Devices(mv1, train[column])
    plothis1 = find_plot(md1, train, train[column])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is rendered.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = column, hue = 'isFraud', data = plothis1)
    # Use the real, zero-padded column name (id_01, not id_1) in the labels.
    plt.title("Data analysis of "+column)
    p4.set(xlabel='id data of '+column, ylabel='Count')

In [0]:
# For id_10..id_38: keep only the values that make up at least 5% of the
# non-null entries and plot their fraud / non-fraud counts.
for i in range(10,39):
    column = 'id_'+str(i)
    mv1 = find_Major_values(train[column], 0.05)
    md1 = find_Major_Devices(mv1, train[column])
    plothis1 = find_plot(md1, train, train[column])
    # find_plot returns an int sentinel when no value reaches the threshold;
    # skip before creating a figure so no blank canvas is rendered.
    if isinstance(plothis1, int):
        continue
    plt.figure(figsize=(12,5))
    p4 = sns.countplot(x = column, hue = 'isFraud', data = plothis1)
    plt.title("Data analysis of id_"+str(i))
    p4.set(xlabel='id data of id_'+str(i), ylabel='Count')

## Device type V fraud count 

In [0]:
# Name the x column by keyword: recent seaborn treats the first positional
# argument as `data`, which clashes with the explicit data=train.
sns.countplot(x='DeviceType', hue='isFraud', data=train);

## Majorly used Devices V Fraud count 

In [0]:
# Devices that account for at least 10% of the non-null DeviceInfo entries.
mv = find_Major_values(train['DeviceInfo'], 0.1)
md = find_Major_Devices(mv, train['DeviceInfo'])
plothis = find_plot(md, train, train['DeviceInfo'])

# find_plot returns an int sentinel when no device reaches the threshold;
# indexing it as a frame would raise, so handle that case like the loops above.
if isinstance(plothis, int):
    print('No DeviceInfo value reaches the 10% threshold; nothing to plot')
else:
    p3 = sns.countplot(x = 'DeviceInfo', hue = 'isFraud', data = plothis)
    plt.title("Data analysis of majorly used devices")
    p3.set(xlabel='Devices', ylabel='Count')

In [0]:
# Remove the mostly-null / near-constant columns identified earlier.
train = train.drop(columns=cols_to_drop)

In [0]:
# Replace +inf with 999, then log1p-transform the transaction amount; the same
# two steps are applied to each split. (-inf is not replaced.)
# Re-running this cell applies log1p a second time.
train = train.replace(np.inf,999)
train['TransactionAmt'] = np.log1p(train['TransactionAmt'])

test = test.replace(np.inf,999)
test['TransactionAmt'] = np.log1p(test['TransactionAmt'])

In [27]:
# Preview the first rows after dropping columns and transforming TransactionAmt.
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,P_emaildomain,C1,C2,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D10,D11,D15,M1,M2,M3,M4,...,V88,V89,V90,V91,V92,V93,V94,V95,V96,V97,V99,V100,V126,V127,V128,V130,V131,V279,V280,V282,V283,V285,V287,V288,V289,V291,V292,V294,V302,V303,V304,V306,V307,V308,V310,V312,V313,V314,V315,V317
0,2987000,0,86400,4.242188,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,13.0,13.0,0.0,T,T,T,M2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0,117.0
1,2987001,0,86401,3.400391,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,gmail.com,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,,0.0,,,,M0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,4.09375,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,outlook.com,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,0.0,315.0,315.0,T,T,T,M0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,3.931641,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,yahoo.com,2.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,84.0,,111.0,,,,M0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,48.0,28.0,10.0,4.0,50.0,1758.0,925.0,354.0,135.0,1.0,28.0,0.0,0.0,10.0,4.0,0.0,0.0,1.0,1.0,38.0,0.0,0.0,0.0,50.0,1758.0,925.0,354.0,135.0,0.0,0.0,0.0,1404.0
4,2987004,0,86506,3.931641,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,gmail.com,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,,,...,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# Separate the label, one-hot encode with get_dummies, and fill the remaining
# NaNs with 0. `train` is rebound to the encoded frame, as before.
y_train = train['isFraud']
train = pd.get_dummies(train)
X_train = train.drop(columns=['isFraud']).fillna(0)

In [0]:
# Remove the name `train`; X_train and y_train were derived from it in the previous cell.
del train

In [0]:
# Min-max scale feature columns one at a time, refitting the scaler per column.
# NOTE(review): the loop stops after the column at position 100, so only the
# first 101 columns of X_train are scaled and the rest keep their raw range.
# Confirm whether this cap is intentional.
scaler = MinMaxScaler()
q = 0
for col in X_train.columns:
    # MinMaxScaler expects a 2-D array, so reshape to (n_rows, 1).
    a = np.array(X_train[col]).reshape(-1, 1)
    X_train[col] = scaler.fit_transform(a)
    if q >= 100:
        break
    q += 1

In [0]:
# Drop the scratch array left over from the scaling loop.
del a

In [32]:
# NOTE(review): `vectorizer` is not used anywhere in the visible notebook.
vectorizer = CountVectorizer()
# sklearn.externals.joblib was removed from scikit-learn; import the standalone
# joblib package and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Write the feature matrix to disk so the in-memory copy can be released.
joblib.dump(X_train, 'dataset.joblib')



['dataset.joblib']

In [33]:
# Write the labels to disk too; uses the joblib name imported in the cell that dumped X_train.
joblib.dump(y_train, 'datasety.joblib')

['datasety.joblib']

In [0]:
# Release the in-memory copies; both were just written to disk with joblib.
del X_train, y_train

In [0]:
# sklearn.externals.joblib was removed from scikit-learn; import the standalone
# joblib package and fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Reload the feature matrix written by this notebook. joblib files are
# pickle-based, so only load files you created yourself.
x_train = joblib.load('dataset.joblib')

In [0]:
# Reload the labels written to 'datasety.joblib' earlier in this notebook.
y_train = joblib.load('datasety.joblib')

In [37]:
# Sanity check: (rows, feature columns) of the reloaded matrix.
x_train.shape

(590540, 249)

## Logistic Regression

In [38]:
# Fit a liblinear logistic regression on the full training matrix.
lr = LogisticRegression(random_state=42, solver="liblinear")
# The labels are sliced to the number of feature rows before fitting.
n_samples = x_train.shape[0]
lr.fit(x_train, y_train[:n_samples])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
# 3-fold cross-validation of the same estimator; the cell displays the mean fold score.
# NOTE(review): no `scoring` is passed, so the estimator's default score is
# used (presumably accuracy). 'isFraud' appears to be heavily imbalanced, so a
# metric such as ROC AUC may be more informative. Confirm before relying on
# this number.
n_samples = x_train.shape[0]
score = cross_val_score(lr, x_train, y_train[:n_samples], cv=3, verbose=3)
score.mean()

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  ................................................................




[CV] .................................... , score=0.964, total= 1.1min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] .................................... , score=0.964, total= 1.2min
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.3min remaining:    0.0s


[CV] .................................... , score=0.964, total= 1.6min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  3.9min finished


0.9642039491114138