In [19]:
# Import packages
import warnings
warnings.filterwarnings('ignore')
from statistics import mean

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.model_selection import StratifiedKFold

In [11]:
# Load the payments table from the CSV next to the notebook and preview it.
# Note: the file carries a saved row index, which pandas loads as the
# 'Unnamed: 0' column visible in the preview.
dataset = pd.read_csv(filepath_or_buffer="payments.csv")
dataset.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [12]:
# Split the table into predictors (x) and the fraud label (y).
# Removed from the predictors:
#   - 'isFraud'              : the target itself
#   - 'type', 'nameDest',
#     'nameOrig'             : string-valued columns, not numerically encoded here
#   - 'Unnamed: 0'           : the row index that was saved into the CSV; it is a
#                              row counter, not a property of the transaction, so
#                              it must not be fed to the model as a feature
# The index column is dropped with errors='ignore' so the cell also works when
# the CSV was loaded without that column; the other four must exist.
x = (
    dataset
    .drop(columns=['isFraud', 'type', 'nameDest', 'nameOrig'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
y = dataset['isFraud']

In [13]:
# Preview the first rows of the predictor frame to confirm which columns remain
x.head()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
0,1,9839.64,170136.0,160296.36,0.0,0.0,0
1,1,1864.28,21249.0,19384.72,0.0,0.0,0
2,1,181.0,181.0,0.0,0.0,0.0,0
3,1,181.0,181.0,0.0,21182.0,0.0,0
4,1,11668.14,41554.0,29885.86,0.0,0.0,0


In [14]:
# Preview the first values of the 'isFraud' target series
y.head()

0    0
1    0
2    1
3    1
4    0
Name: isFraud, dtype: int64

## Using Logistic Regression Classification Algorithm
### Using different solvers to generate different models
#### The choice of solver depends on the penalty chosen. Supported penalties by solver:
##### ‘newton-cg’ - [‘l2’, ‘none’]
##### ‘lbfgs’ - [‘l2’, ‘none’]
##### ‘liblinear’ - [‘l1’, ‘l2’]
##### ‘sag’ - [‘l2’, ‘none’]
##### ‘saga’ - [‘elasticnet’, ‘l1’, ‘l2’, ‘none’]

In [15]:
# Two index-aligned containers: the estimator objects and their display labels
logistic_regression, logistic_regression_names = [], []

In [16]:
# Build one LogisticRegression per (solver, penalty) combination and register
# each of them, with a readable label, in the two index-aligned lists
# `logistic_regression` / `logistic_regression_names`.
#
# NOTE(review): the string penalty='none' is, per the scikit-learn docs,
# deprecated in favour of penalty=None in newer releases — confirm the installed
# version before upgrading, and switch the literal when you do.

# Using newton-cg with supported penalties
newton_l2 = LogisticRegression(penalty='l2', solver='newton-cg')
newton_none = LogisticRegression(penalty='none', solver='newton-cg')

# Using lbfgs with supported penalties
lbfgs_l2 = LogisticRegression(penalty='l2', solver='lbfgs')
lbfgs_none = LogisticRegression(penalty='none', solver='lbfgs')

# Using liblinear with supported penalties
liblinear_l1 = LogisticRegression(penalty='l1', solver='liblinear')
liblinear_l2 = LogisticRegression(penalty='l2', solver='liblinear')

# Using sag with supported penalties
sag_l2 = LogisticRegression(penalty='l2', solver='sag')
sag_none = LogisticRegression(penalty='none', solver='sag')

# Using saga with supported penalties
saga_l2 = LogisticRegression(penalty='l2', solver='saga')
saga_l1 = LogisticRegression(penalty='l1', solver='saga')
saga_none = LogisticRegression(penalty='none', solver='saga')

# Label -> estimator, in the order the models are evaluated.
_named_models = {
    'newton_l2': newton_l2,
    'newton_none': newton_none,
    'lbfgs_l2': lbfgs_l2,
    'lbfgs_none': lbfgs_none,
    'liblinear_l1': liblinear_l1,
    'liblinear_l2': liblinear_l2,
    'sag_l2': sag_l2,
    'sag_none': sag_none,
    'saga_l2': saga_l2,
    'saga_l1': saga_l1,
    'saga_none': saga_none,
}

# Replace the list contents in place instead of appending: re-running this cell
# then yields the same 11 entries rather than piling up duplicates.
logistic_regression[:] = _named_models.values()
logistic_regression_names[:] = _named_models.keys()

In [17]:
# Evaluate every registered model twice:
#   1. a single 70/30 hold-out split (fit on x_train, accuracy on x_test)
#   2. a 2-fold stratified cross-validation over the full x / y
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101)
skfold = StratifiedKFold(n_splits=2, shuffle=False, random_state=None)

# NOTE(review): every number below is plain accuracy. If fraud rows are rare in
# this data, a model predicting "not fraud" everywhere would also score very
# high — consider reporting precision/recall as well. TODO confirm class balance.
for name, model in zip(logistic_regression_names, logistic_regression):
  print("Training: " + name)
  model.fit(x_train, y_train)
  # Not used below; kept in case a later cell reads `pred` — TODO confirm
  pred = model.predict(x_test)
  print("Testing: " + name)
  accuracy = model.score(x_test, y_test)
  print(name, "Accuracy: ", accuracy)

  print("\nCross Validating: " + name)
  # cross_val_score fits *clones* of the estimator on each fold, so `model`
  # stays fitted on x_train (the model whose hold-out accuracy was just
  # printed) instead of being silently overwritten by the last CV fold.
  # With no `scoring` given it uses the estimator's own .score (accuracy).
  # error_score='raise' keeps a failing fit loud rather than recording NaN.
  accuracies = cross_val_score(model, x, y, cv=skfold, error_score='raise').tolist()
  print('List of possible accuracy:', accuracies)
  print('Maximum Accuracy That can be obtained from this model is:', max(accuracies)*100, '%')
  print('Minimum Accuracy:', min(accuracies)*100, '%')
  print('Overall Accuracy:', mean(accuracies)*100, '%')
  print("-----------------------------------------------------")

Training: newton_l2
Testing: newton_l2
newton_l2 Accuracy:  0.9989082065773743

Cross Validating: newton_l2
List of possible accuracy: [0.9815245920705621, 0.9989749505706769]

Maximum Accuracy That can be obtained from this model is: 99.89749505706769 %

Minimum Accuracy: 98.15245920705621 %

Overall Accuracy: 99.02497713206195 %
-----------------------------------------------------
Training: newton_none
Testing: newton_none
newton_none Accuracy:  0.9988243836658484

Cross Validating: newton_none
List of possible accuracy: [0.982888495619729, 0.998975264906595]

Maximum Accuracy That can be obtained from this model is: 99.8975264906595 %

Minimum Accuracy: 98.2888495619729 %

Overall Accuracy: 99.0931880263162 %
-----------------------------------------------------
Training: lbfgs_l2
Testing: lbfgs_l2
lbfgs_l2 Accuracy:  0.9983544514681059

Cross Validating: lbfgs_l2
List of possible accuracy: [0.9803558911266114, 0.9989664635008848]

Maximum Accuracy That can be obtained from this mo