<a href="https://colab.research.google.com/github/haalcala/NYP-SDAI/blob/master/ITI105/project/Fraud_detection_harold_lr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" align="left"/></a>


# Fraud detection using Logistic Regression

In [None]:
import requests
import io
import os

# Create the download directory. exist_ok tolerates reruns of this cell
# without hiding unrelated failures (e.g. permission errors), which the
# previous bare `except: pass` silently swallowed.
os.makedirs("./tmp", exist_ok=True)

# Remote resources the notebook depends on: the project helper module and
# the dataset. Each entry is saved under ./tmp using its "local_file" name.
external_files = [
    {
        "url":"https://raw.githubusercontent.com/haalcala/NYP-SDAI/master/ITI105/project/ds_util.py",
        "local_file":"ds_util.py"
    },
    {
        "url":"https://raw.githubusercontent.com/haalcala/NYP-SDAI/master/ITI105/project/files/bs140513_032310.csv",
        "local_file":"bs140513_032310.csv"
    },
]

for ext_file in external_files:
    req = requests.get(ext_file["url"])
    # Fail loudly on an HTTP error instead of saving the error page as the
    # helper module / dataset and breaking later cells in confusing ways.
    req.raise_for_status()

    print(ext_file["local_file"], len(req.content))

    # The context manager flushes and closes the file before the next cell
    # imports or reads it.
    with open("./tmp/" + ext_file["local_file"], "wb") as f:
        f.write(req.content)


## Load and prepare data


In [None]:
# ds_util.py was saved into ./tmp by the download step, so it is importable
# as the module tmp.ds_util.
import tmp.ds_util as util

# Project helper object; the remaining cells load, transform and time their
# work through it.
ds_util = util.DSUtil()
ds_util.load_csv("./tmp/bs140513_032310.csv")

# analyse data composition
# NOTE(review): blow_my_mind() is defined in ds_util.py (not shown here) —
# presumably prints an overview of the loaded data; confirm.
ds_util.blow_my_mind()

### Drop columns not needed for calculations and get dummy variables

In [None]:
# Remove the columns that are not needed for the calculations.
ds_util.drop_columns(["customer", "zipMerchant", "zipcodeOri"])

# NOTE(review): get_dummies() is defined in ds_util.py (not shown here) —
# presumably one-hot encodes the categorical columns and returns a pandas
# DataFrame; confirm.
df = ds_util.get_dummies()

# Bare expression so the notebook displays the resulting frame.
df

## Logistic Regression model

### Environment initialisation

In [None]:
from sklearn.model_selection import train_test_split

# Import the logistic regression classifier and the evaluation helpers from scikit-learn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
import numpy as np

# Name of the label column in the encoded frame.
target_column = "fraud"

# Separate the label vector from the feature matrix.
y = df[target_column]

X = df.drop(target_column, axis=1)

# Oversample with SMOTE to balance the classes. `fit_resample` is the
# supported entry point; the older `fit_sample` alias was deprecated and
# later removed from imbalanced-learn, so calling it fails on current
# releases.
sm = SMOTE(random_state=12)
x_res, y_res = sm.fit_resample(X, y)

# Class counts before and after resampling.
print(y.value_counts(), np.bincount(y_res))

# NOTE(review): resampling happens before the split, so synthetic samples
# derived from the full data set also end up in X_test and the reported test
# scores are likely optimistic. Consider splitting first and applying SMOTE
# to the training part only.
X_train, X_test, y_train, y_test = train_test_split(x_res, y_res, train_size=0.8, shuffle=True, random_state=42)


In [None]:
# Show how many samples of each label value are in the test split.
# NOTE(review): value_counts() needs y_test to be a pandas Series — confirm
# the resampler returns pandas objects in the installed version.
y_test.value_counts()

### Training

In [None]:

# Logistic regression classifier using the liblinear solver; the fixed seed
# keeps runs repeatable.
lr_clf = LogisticRegression(random_state=42, solver='liblinear')

# Placeholder for the test-split predictions; assigned by the testing step.
y_pred = None


def train_fn():
    """Fit the module-level ``lr_clf`` on ``X_train`` / ``y_train``."""
    lr_clf.fit(X_train, y_train)


# Run the fit through the project's activity wrapper under the "Training" label.
ds_util.activity_wrapper("Training", train_fn)

### Testing


In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold.

    Draws on the current matplotlib figure; the caller creates and shows it.

    Args:
        precisions: precision values; the last element is dropped so the
            series lines up with ``thresholds``.
        recalls: recall values; the last element is dropped likewise.
        thresholds: threshold values used as the x axis.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, legend_label in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_label, linewidth=2)

    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)

def test_fn():
    """Evaluate ``lr_clf`` and print/plot the resulting metrics.

    Stores the test-split predictions in the module-level ``y_pred``, prints
    the hold-out accuracy, then prints 3-fold cross-validated metrics computed
    on the training split and shows a precision/recall-vs-threshold plot.
    """
    global y_pred

    # Hold-out accuracy on the test split.
    y_pred = lr_clf.predict(X_test)
    print("accuracy_score:", accuracy_score(y_test, y_pred))

    # 3-fold cross-validated accuracy on the training split.
    fold_accuracies = cross_val_score(lr_clf, X_train, y_train, cv=3, scoring="accuracy")
    print("cross_val_score:", fold_accuracies)

    # Out-of-fold predictions for every training sample.
    y_train_pred = cross_val_predict(lr_clf, X_train, y_train, cv=3)
    print("confusion_matrix", confusion_matrix(y_train, y_train_pred))

    # NOTE(review): pos_label=0 treats label 0 as the positive class here,
    # while precision_recall_curve below uses its default positive label —
    # confirm this difference is intended.
    scorers = (
        ("recall_score:", recall_score),
        ("precision_score:", precision_score),
        ("f1_score:", f1_score),
    )
    for caption, scorer in scorers:
        print(caption, scorer(y_train, y_train_pred, pos_label=0))

    print("classification_report:", classification_report(y_train, y_train_pred))

    # Out-of-fold decision-function scores feed the precision/recall curve.
    y_scores = cross_val_predict(lr_clf, X_train, y_train, cv=3, method="decision_function")
    print(y_scores)

    precisions, recalls, thresholds = precision_recall_curve(y_train, y_scores)

    plt.figure(figsize=(8, 4))
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plt.show()


# Run the evaluation through the project's activity wrapper under the "Testing" label.
ds_util.activity_wrapper("Testing", test_fn)

print("All done.")

In [None]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Draw a classifier's decision boundaries over two features and scatter the samples.

    Draws on the current matplotlib axes; the caller creates and shows the
    figure.

    Args:
        X: 2-D NumPy array of samples; only columns 0 and 1 are used.
        y: 1-D NumPy array of class labels aligned with the rows of ``X``.
            At most five distinct classes can be drawn, because only five
            markers are defined.
        classifier: fitted estimator whose ``predict`` accepts a two-column
            array.
        test_idx: kept for backward compatibility; not used.
        resolution: step of the grid on which the classifier is evaluated.
    """
    # Marker and colour for each class, in the order of the sorted labels.
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    classes = np.unique(y)
    # Use only as many colours as there are classes in the target data.
    color_map = ListedColormap(colors[:len(classes)])

    # Bounds of the evaluation grid, padded by one unit on every side.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Predict a class for every grid point, then restore the grid shape.
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    plt.contour(xx1, xx2, Z, alpha=0.4, cmap=color_map)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Plot samples, one scatter call per class.
    for idx, cl in enumerate(classes):
        in_class = y == cl
        # `color=` rather than `c=`: a single RGBA tuple passed as `c` is
        # ambiguous with an array of values to colour-map.
        plt.scatter(x=X[in_class, 0], y=X[in_class, 1],
                    alpha=0.8, color=color_map(idx),
                    marker=markers[idx], label=cl)
        
import pandas as pd

# Sweep the logistic-regression C parameter and record, for every value, the
# training time, the test-split accuracy and the 3-fold cross-validation
# accuracy on the training split.
plt.figure(figsize=(10, 10))

C_param_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

# One row per C value; the remaining columns are filled in by the loop below.
sepal_acc_table = pd.DataFrame(columns = ["C_param", "Training Time", "Accuracy", "X-Val"])
sepal_acc_table["C_param"] = C_param_range

# Stacked train + test arrays; only the disabled decision-region plot uses them.
X_combined = np.vstack((X_train,X_test))
Y_combined = np.hstack((y_train, y_test))

# j is the 1-based position of the current C value (it also matches the
# subplot slot used by the disabled plotting code).
for j, i in enumerate(C_param_range, start=1):
    print("i:",i)

    # Logistic regression with the saga solver for the current C value.
    lr = LogisticRegression(C=i, solver='saga', max_iter=1000, n_jobs=4, random_state=42)

    y_pred = None

    def train_fn():
        """Fit the current module-level ``lr`` on the training split."""
        lr.fit(X_train, y_train)

    # The wrapper's return value is used as the elapsed training time.
    elapsed = ds_util.activity_wrapper("Training", train_fn)

    y_pred = lr.predict(X_test)

    print("elapsed:", elapsed)

    accuracy_result = accuracy_score(y_test, y_pred)
    cross_val_result = cross_val_score(lr, X_train, y_train, cv=3, scoring="accuracy")

    print("accuracy_score:", accuracy_result)
    print("cross_val_score:", cross_val_result)

    # Store the results as text in columns 1..3 of this C value's row.
    row_values = (
        f"{elapsed.total_seconds()} secs",
        str(accuracy_result),
        str(cross_val_result),
    )
    for column, value in enumerate(row_values, start=1):
        sepal_acc_table.iloc[j - 1, column] = value

    # Decision-region plotting is disabled:
    # plt.subplot(3,2,j)
    # plt.subplots_adjust(hspace = 0.4)
    # plot_decision_regions(X = X_combined
    #                   , y = Y_combined
    #                   , classifier = lr
    #                   , test_idx = range(105,150))
    # plt.xlabel('Sepal length')
    # plt.ylabel('Sepal width')
    # plt.title('C = %s'%i)

    print("------------------------------------")

# Bare expression so the notebook displays the results table.
sepal_acc_table