<a href="https://colab.research.google.com/github/jwang44/Try-colabing-in-colab/blob/main/credit_card_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%cd "/content/drive/My Drive/"

In [None]:
import numpy as np
import pandas as pd
import scipy.stats
import statistics

## Credit Card Dataset

In [None]:
# generate new feature by multiplication and normalize
def newfeature(x,y):
  z=x*y
  norz=scipy.stats.zscore(z, axis=0, ddof=0, nan_policy='propagate')
  return norz

In [19]:
# convert csv to dataframe
df = pd.read_csv('creditcard.csv')
# data = df.to_numpy()

In [None]:
# normalize feature
NorData = scipy.stats.zscore(df.iloc[:,:-1], axis=0, ddof=0, nan_policy='propagate')
# normalized dataset
NorDataset = np.column_stack((NorData,df.iloc[:,-1]))

New feature

In [None]:
NewF1 = newfeature(df.V3, df.V7)
NewF2 = newfeature(df.V11, df.V12)
NewF3 = newfeature(df.V12, df.V16)
NewF4 = newfeature(df.V16, df.V17)
NewF5 = newfeature(df.V16, df.V18)
NewF6 = newfeature(df.V17, df.V18)
# new feature
NewF = np.column_stack((NewF1,NewF2,NewF3,NewF4,NewF5,NewF6))

In [None]:
# normalized dataset with new feature
NorDatasetNew = np.column_stack((NorData,NewF,df.iloc[:,-1]))

## The model

In [None]:
# sigmoid function
def sigmoid(a):
  return 1/(1+np.exp(-a))

class Logistic_regression():
  def __init__(self,X_train,y_train,learning_rate,X_test,y_test):
    self.X_train = X_train
    self.y_train = y_train
    self.learning_rate = learning_rate
    self.X_test = X_test
    self.y_test = y_test
  
  # training
  def fit(self):
    n,m = np.shape(self.X_train)
    itrnum = 500       # max number of iterations 
    W = np.ones([m+1,itrnum+1]) # weights, initialized with 1 ---> change to 1 column
    e = 0.01        # iteration stop criteria
    der = 0        # derivative
    for k in range(0,itrnum):
      for i in range(0,n):
        xi = self.X_train[i].T
        x0 = np.array([1])
        xi = np.concatenate((xi, x0),axis = 0)
        yi = self.y_train[i]
        der = der-xi*(yi-sigmoid(np.matmul(W[:,k].T,xi))) # take derivative w.r.t W
      W[:,k+1] = W[:,k]-self.learning_rate*der       # update rule
      if (np.linalg.norm(W[:,k+1]-W[:,k]))**2<e:         
        break 
    return W[:,k+1]
  
  # validation
  def predict(self):
    w = self.fit()
    n,m = np.shape(self.X_test)   
    y_predict = np.zeros([n,1])
    for i in range(0,n):
      xi = self.X_test[i].T
      x0 = np.array([1])
      xi = np.concatenate((xi, x0),axis = 0)
      p1 = sigmoid(np.matmul(w.T,xi)) # calculate probabilities p(y=1|x)
      #p0=1-sigmoid(np.matmul(w.T,xi))
      # covert probabilities to 0 or 1 by thresholding at 0.5
      if p1>=0.5:
        y_predict[i] = 1
      else:
        y_predict[i] = 0
    return y_predict

  # evaluate accuracy
  def Accu_eval(self):
    y_predict = self.predict()
    n,j = np.shape(y_predict)
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    # count TP,TN,FP,FN in validation set
    for i in range(n):
      if  self.y_test[i]==1 and y_predict[i]==1:
        TP = TP+1
      elif self.y_test[i]==0 and y_predict[i]==0:
        TN = TN+1
      elif self.y_test[i]==0 and y_predict[i]==1:
        FP = FP+1
      elif self.y_test[i]==1 and y_predict[i]==0:
        FN = FN+1
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    F = 2*precision*recall/(precision+recall)
    specificity = TN/(FP+TN)
    FPR = FP/(FP+TN)
    # print("accuracy:",accuracy)
    # print("precision:",precision)
    # print("recall:",recall)
    # print("F:",F)
    # print("specificity:",specificity)
    # print("False Positive Rate:",FPR)
    print("")
    return accuracy
    

## Cross validation

In [None]:
class Cross_validation():
  def __init__(self, k):
    # k: k-fold
    self.k = k

  def prepare_data(self, data):
    # data: np array converted from csv
    np.random.shuffle(data)
    X = data[:, :-1]  # features
    y = data[:, -1]   # labels

    # split data into k equal segments, assign them to train and test later
    Xs = np.array_split(X, self.k, axis=0)
    ys = np.array_split(y, self.k, axis=0)
    return Xs, ys

  def get_accuracy(self, Xs, ys, lr):
    accuracies = []
    for i in range(self.k):
      X_cv = Xs[:] # X_cross_validation
      y_cv = ys[:] # y_cross_validation

      X_test = X_cv.pop(i)
      y_test = y_cv.pop(i)

      X_train = np.concatenate(X_cv)
      y_train = np.concatenate(y_cv)
    
      train_only = Logistic_regression(X_train, y_train, lr, X_test, y_test)
      train_and_test = Logistic_regression(X_train, y_train, lr, X_test, y_test)

      print("----------FOLD ", i+1, "----------")
      accuracy = logistic_regression.Accu_eval()
      accuracies.append(accuracy)
    return np.mean(accuracies)
      # this will print the evaluation results

## Experiment with different learning rates

In [18]:
lrs = np.logspace(-6, -1, 6) # different learning rates to try
# or we can also try other hyperparameters here
cv = Cross_validation(5) # 5-fold cross-validation
Xs, ys = cv.prepare_data(original_data)
for lr in lrs:
  print("----------LEARNING RATE: ", lr, "----------")
  accu_avg = cv.get_accuracy(Xs, ys, lr)
  print("----------AVERAGE ACCURACY", accu_avg, "----------")
  print("\n---------------------------------------------------------------------\n")


----------LEARNING RATE:  1e-06 ----------
----------FOLD  1 ----------
accuracy: 0.26515151515151514
precision: 0.34551495016611294
recall: 0.5252525252525253
F: 0.4168336673346693

accuracy: 0.23115577889447236
precision: 0.30344827586206896
recall: 0.4583333333333333
F: 0.3651452282157676

----------FOLD  2 ----------
accuracy: 0.2572509457755359
precision: 0.33668341708542715
recall: 0.5101522842639594
F: 0.4056508577194753

accuracy: 0.26262626262626265
precision: 0.34
recall: 0.5204081632653061
F: 0.41129032258064513

----------FOLD  3 ----------
accuracy: 0.2572509457755359
precision: 0.33444816053511706
recall: 0.5115089514066496
F: 0.4044489383215369

accuracy: 0.26262626262626265
precision: 0.348993288590604
recall: 0.5148514851485149
F: 0.41600000000000004

----------FOLD  4 ----------
accuracy: 0.2597730138713745
precision: 0.3383333333333333
recall: 0.5165394402035624
F: 0.40886203423967776

accuracy: 0.25252525252525254
precision: 0.3333333333333333
recall: 0.494949494949

  This is separate from the ipykernel package so we can avoid doing imports until


accuracy: 0.9318181818181818
precision: 0.9776536312849162
recall: 0.8838383838383839
F: 0.9283819628647215

accuracy: 0.9195979899497487
precision: 0.9651162790697675
recall: 0.8645833333333334
F: 0.9120879120879122

----------FOLD  2 ----------
accuracy: 0.9382093316519546
precision: 0.9116945107398569
recall: 0.9695431472081218
F: 0.9397293972939729

accuracy: 0.9242424242424242
precision: 0.9108910891089109
recall: 0.9387755102040817
F: 0.9246231155778896

----------FOLD  3 ----------
accuracy: 0.9432534678436317
precision: 0.9752747252747253
recall: 0.907928388746803
F: 0.9403973509933775

accuracy: 0.9343434343434344
precision: 0.9583333333333334
recall: 0.9108910891089109
F: 0.934010152284264

----------FOLD  4 ----------
accuracy: 0.9104665825977302
precision: 0.865909090909091
recall: 0.9694656488549618
F: 0.9147659063625451

accuracy: 0.8939393939393939
precision: 0.8421052631578947
recall: 0.9696969696969697
F: 0.9014084507042254

----------FOLD  5 ----------
accuracy: 0.934

## Experiment with different features

In [None]:
lr = 0.001
cv = Cross_validation(5) # 5-fold cross-validation
Xs, ys = cv.prepare_data(NorDataset)
print("----------Using normalized features, without new features----------")
accu_avg = cv.get_accuracy(Xs, ys, lr)
print("----------AVERAGE ACCURACY", accu_avg, "----------")
print("\n---------------------------------------------------------------------")


----------Using normalized features, without new features----------
----------FOLD  1 ----------


  


accuracy: 0.8548387096774194
precision: 0.75
recall: 0.7894736842105263
F: 0.7692307692307692
specificity: 0.8837209302325582
False Positive Rate: 0.11627906976744186

----------FOLD  2 ----------
accuracy: 0.8548387096774194
precision: 0.7368421052631579
recall: 0.7777777777777778
F: 0.7567567567567567
specificity: 0.8863636363636364
False Positive Rate: 0.11363636363636363

----------FOLD  3 ----------
accuracy: 0.8064516129032258
precision: 0.8
recall: 0.5714285714285714
F: 0.6666666666666666
specificity: 0.926829268292683
False Positive Rate: 0.07317073170731707

----------FOLD  4 ----------
accuracy: 0.8709677419354839
precision: 0.9047619047619048
recall: 0.76
F: 0.8260869565217391
specificity: 0.9459459459459459
False Positive Rate: 0.05405405405405406

----------FOLD  5 ----------
accuracy: 0.9516129032258065
precision: 0.8421052631578947
recall: 1.0
F: 0.9142857142857143
specificity: 0.9347826086956522
False Positive Rate: 0.06521739130434782

----------AVERAGE ACCURACY 0.8677

In [None]:
lr = 0.001
cv = Cross_validation(5) # 5-fold cross-validation
Xs, ys = cv.prepare_data(NorDatasetNew)
print("----------Using normalized features, with new features----------")
accu_avg = cv.get_accuracy(Xs, ys, lr)
print("----------AVERAGE ACCURACY", accu_avg, "----------")
print("\n---------------------------------------------------------------------")


----------Using normalized features, with new features----------
----------FOLD  1 ----------


  


accuracy: 0.8548387096774194
precision: 0.9285714285714286
recall: 0.6190476190476191
F: 0.742857142857143
specificity: 0.975609756097561
False Positive Rate: 0.024390243902439025

----------FOLD  2 ----------
accuracy: 0.8387096774193549
precision: 0.5555555555555556
recall: 0.8333333333333334
F: 0.6666666666666667
specificity: 0.84
False Positive Rate: 0.16

----------FOLD  3 ----------
accuracy: 0.8387096774193549
precision: 0.7666666666666667
recall: 0.8846153846153846
F: 0.8214285714285715
specificity: 0.8055555555555556
False Positive Rate: 0.19444444444444445

----------FOLD  4 ----------
accuracy: 0.8225806451612904
precision: 0.7
recall: 0.7368421052631579
F: 0.717948717948718
specificity: 0.8604651162790697
False Positive Rate: 0.13953488372093023

----------FOLD  5 ----------
accuracy: 0.7419354838709677
precision: 0.6470588235294118
recall: 0.5238095238095238
F: 0.5789473684210527
specificity: 0.8536585365853658
False Positive Rate: 0.14634146341463414

----------AVERAGE AC