<a href="https://colab.research.google.com/github/hitanshi08/codsoft/blob/main/CREDIT_CARD_FRAUD_DETECTION_CODSOFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTING DEPENDENCIES

In [None]:
import numpy as np
import pandas as pd
import time

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns



In [1]:
from scipy import stats
from scipy.stats import norm , skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import roc_curve , auc , roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier


import xgboost as xgo
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.ensemble import AdaBoostClassifier

#To ignore warnings
import warnings
warnings.filterwarnings("ignore")






EXPLORATORY DATA ANALYSIS

In [None]:
# Check the shape of the dataset as (rows, columns).
# NOTE(review): `df` is not created in any cell visible here — presumably it is
# loaded from the credit-card CSV in a cell that is missing; confirm before Run All.
df.shape

In [None]:
# Show each column's dtype together with its non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# Count how many records fall into each value of the target column 'Class'.
df['Class'].value_counts()

In [None]:
# Percentage of records per value of 'Class', printed and then drawn as a pie chart.
class_pct = df.groupby('Class')['Class'].count() / df['Class'].count() * 100
print(class_pct)
class_pct.plot.pie()


In [None]:
# Percentage share of Class 0 (normal) and Class 1 (fraud) among all labelled records.
classes = df['Class'].value_counts()
total_records = df['Class'].count()
normal_share = classes[0] / total_records * 100
fraud_share = classes[1] / total_records * 100

for share in (normal_share, fraud_share):
  print(share)

In [None]:
# Bar plot of the number of fraudulent vs non-fraudulent transactions.
plt.figure(figsize=(7,5))
# Name the column explicitly: a Series passed positionally is not interpreted
# as the x variable by current seaborn releases.
sns.countplot(x='Class', data=df)
plt.title("Class Count" , fontsize=18)
plt.xlabel("Record counts by Class", fontsize=15)
plt.ylabel("Count", fontsize=15)
plt.show()

In [None]:
# Pairwise correlation matrix of the columns of df; kept in `corr` because the
# heatmap cell below reads it. The bare `corr` on the last line displays the table.
corr = df.corr()
corr

In [None]:
# Draw the correlation matrix as an annotated heatmap on a large canvas.
fig, ax = plt.subplots(figsize=(24, 18))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
plt.show()

OBSERVATION OF DISTRIBUTION OF OUR CLASS

In [None]:
# 'Time' is converted to a pandas Timedelta (unit: seconds) so that its
# day / hour / minute components can be extracted.
Delta_Time = pd.to_timedelta(df['Time'], unit='s')

# Derive one integer column per time component.
time_parts = Delta_Time.dt.components
for new_column, part in (('Time_Day', 'days'), ('Time_Hour', 'hours'), ('Time_Min', 'minutes')):
  df[new_column] = time_parts[part].astype(int)


In [None]:
# Remove columns that are no longer needed (both drops mutate df in place).
# 'Time' has already been decomposed into day / hour / minute columns.
df.drop(columns='Time', inplace=True)
# Of the derived columns only the hour is kept; day and minute are discarded.
df.drop(columns=['Time_Day', 'Time_Min'], inplace=True)

SPLITTING THE DATA INTO TRAIN AND TEST DATA

In [None]:
# Separate the feature matrix (every column except 'Class') from the target.
x = df.drop(columns=['Class'])
y = df['Class']

In [None]:
# Preview the first rows of the feature matrix.
x.head()

In [None]:
# Preview the first values of the target series.
y.head()

In [None]:
# Hold out 20% of the rows as a test set.
# stratify=y keeps the fraud / non-fraud ratio the same in both parts; without
# it the share of the rare class in each part is left to chance.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=100, stratify=y)

In [None]:
# Sum of the labels in the full target, the training part and the test part
# (i.e. the number of Class == 1 records when labels are 0/1).
for labels in (y, y_train, y_test):
  print(np.sum(labels))

PLOTTING THE DISTRIBUTION OF EACH VARIABLE

In [None]:
# Collect the names of all feature columns in a plain list.
cols = x.columns.values.tolist()

In [None]:
# Overlay, for every feature, its distribution among normal (green) and fraud
# (red) records to make skewness and class separation visible.
normal_records = df.Class == 0
fraud_records = df.Class == 1

# Size the subplot grid from the number of features instead of assuming
# exactly 30 of them (a fixed 10x3 grid fails for the 31st subplot).
n_plot_cols = 3
n_plot_rows = max(1, int(np.ceil(len(cols) / n_plot_cols)))

plt.figure(figsize=(20, 6 * n_plot_rows))
for n, col in enumerate(cols):
  plt.subplot(n_plot_rows, n_plot_cols, n + 1)
  # NOTE(review): sns.distplot is deprecated in seaborn; consider
  # sns.histplot(..., kde=True, stat="density") when upgrading.
  sns.distplot(x.loc[normal_records, col], color='green')
  sns.distplot(x.loc[fraud_records, col], color='red')
  plt.title(col, fontsize=17)
plt.show()

MODEL BUILDING

In [None]:
# Empty results table that every model-building function adds rows to.
# 'Model' is declared up front because each result row carries it; leaving it
# out would make it appear as a trailing column only after the first row is added.
df_Results = pd.DataFrame(columns=['Methodology', 'Model', 'Accuracy', 'roc_value', 'threshold'])

In [None]:
#CREATED A COMMON FUNCTION TO PLOT CONFUSION MATRIX
def Plot_confusion_matrix(y_test, pred_test):
  """Draw the 2x2 confusion matrix of a binary classifier on the test data.

  Args:
    y_test: True labels (0 = non-fraudulent, 1 = fraudulent).
    pred_test: Predicted labels for the same records.
  """
  # labels=[0, 1] forces a 2x2 matrix even if one class is absent from the
  # inputs, so the cell annotations below never index out of range.
  cm = confusion_matrix(y_test, pred_test, labels=[0, 1])
  plt.clf()
  plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Accent)
  categoryNames = ['Non-Fraudulent', 'Fraudulent']
  plt.title('Confusion Matrix - Test Data')
  plt.ylabel('True Label')
  plt.xlabel('Predicted Label')
  ticks = np.arange(len(categoryNames))
  plt.xticks(ticks, categoryNames, rotation=45)
  plt.yticks(ticks, categoryNames)
  s = [['TN', 'FP'], ['FN', 'TP']]

  # Annotate each cell with its role and count; imshow puts columns on the
  # x axis, hence the (j, i) coordinate order.
  for i in range(2):
    for j in range(2):
      plt.text(j, i, str(s[i][j])+"="+str(cm[i][j]), fontsize=12)
  plt.show()


In [None]:
#CREATING A COMMON FUNCTION TO FIT AND PREDICT ON A LR MODEL FOR BOTH L1 L2
def buildAndRunLogisticModels(df_Results, Methodology, x_train, y_train, x_test, y_test):
  """Fit cross-validated L1 and L2 logistic regressions and record their test metrics.

  Args:
    df_Results: DataFrame of results collected so far (not modified in place).
    Methodology: Label for the sampling / validation scheme, stored in each row.
    x_train, y_train: Training features and labels.
    x_test, y_test: Held-out features and labels used for all reported metrics.

  Returns:
    A new DataFrame: df_Results followed by one row for the l2 model and one
    for the l1 model.
  """
  #LOGISTIC REGRESSION
  from sklearn import linear_model
  from sklearn.model_selection import KFold

  num_C = list(np.power(10.0, np.arange(-10, 10)))
  cv_num = KFold(n_splits=10, shuffle=True, random_state=42)

  # NOTE(review): tol=10 is an extremely loose stopping tolerance and is kept
  # from the original; confirm it is intentional (sklearn's default is 1e-4).
  searchCV_l2 = linear_model.LogisticRegressionCV(
      Cs=num_C
      ,penalty='l2'
      ,scoring='roc_auc'
      ,cv=cv_num
      ,random_state=42
      ,max_iter=10000
      ,fit_intercept=True
      ,solver='newton-cg'
      ,tol=10
  )

  # newton-cg cannot optimise an L1 penalty, so the L1 model uses liblinear.
  searchCV_l1 = linear_model.LogisticRegressionCV(
      Cs=num_C
      ,penalty='l1'
      ,scoring='roc_auc'
      ,cv=cv_num
      ,random_state=42
      ,max_iter=10000
      ,fit_intercept=True
      ,solver='liblinear'
      ,tol=10
  )

  searchCV_l1.fit(x_train, y_train)
  searchCV_l2.fit(x_train, y_train)
  # scores_ is keyed by class label; 1 is the positive (fraud) class.
  print('Max auc_roc for l1:', searchCV_l1.scores_[1].mean(axis=0).max())
  print('Max auc_roc for l2:', searchCV_l2.scores_[1].mean(axis=0).max())

  for name, model in (('l1', searchCV_l1), ('l2', searchCV_l2)):
    print("Parameters for {0} regularizations".format(name))
    print(model.coef_)
    print(model.intercept_)
    print(model.scores_)

  models = {'l2': searchCV_l2, 'l1': searchCV_l1}
  report_order = ('l2', 'l1')

  # Hard predictions, fraud probabilities and accuracy on the test set.
  predictions = {name: models[name].predict(x_test) for name in report_order}
  probabilities = {name: models[name].predict_proba(x_test)[:, 1] for name in report_order}
  accuracies = {name: metrics.accuracy_score(y_pred=predictions[name], y_true=y_test)
                for name in report_order}

  for name in report_order:
    print("Accuracy of Logistic model with {0} regularisation : {1}".format(name, accuracies[name]))
    print("Confusion Matrix")
    Plot_confusion_matrix(y_test, predictions[name])
    print("classification Report")
    print(classification_report(y_test, predictions[name]))

  result_rows = []
  for name in report_order:
    roc_value = roc_auc_score(y_test, probabilities[name])
    print("{0}_roc_value: {1}".format(name, roc_value))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, probabilities[name])
    # Threshold that maximises TPR - FPR (Youden's J statistic).
    threshold = thresholds[np.argmax(tpr - fpr)]
    print("{0} threshold: {1}".format(name, threshold))

    roc_auc = metrics.auc(fpr, tpr)
    print("ROC for the test dataset", '{:.1%}'.format(roc_auc))
    plt.plot(fpr, tpr, label="Test, auc="+str(roc_auc))
    plt.legend(loc=4)
    plt.show()

    result_rows.append({
        'Methodology': Methodology,
        'Model': 'Logistic Regression with {0} Regularisation'.format(name),
        'Accuracy': accuracies[name],
        'roc_value': roc_value,
        'threshold': threshold,
    })

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  df_Results = pd.concat([df_Results, pd.DataFrame(result_rows)], ignore_index=True)
  return df_Results










In [None]:
#CREATED A COMMON FUNCTION TO FIT AND PREDICT ON KNN MODEL
def buildAndRunKNNModels(df_Results,Methodology,x_train,y_train,x_test,y_test):
  """Fit a 5-nearest-neighbour classifier and record its test metrics.

  Args:
    df_Results: DataFrame of results collected so far (not modified in place).
    Methodology: Label for the sampling / validation scheme, stored in the row.
    x_train, y_train: Training features and labels.
    x_test, y_test: Held-out features and labels used for all reported metrics.

  Returns:
    A new DataFrame: df_Results followed by one 'KNN' row.
  """
  # n_jobs=-1 uses whatever cores are available instead of assuming 16.
  knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
  knn.fit(x_train, y_train)

  # Predict once and derive the accuracy from it rather than also calling
  # knn.score, which would run the neighbour search a second time.
  y_pred = knn.predict(x_test)
  KNN_Accuracy = metrics.accuracy_score(y_pred=y_pred, y_true=y_test)
  print("model score")
  print(KNN_Accuracy)

  print("Confusion Matrix")
  Plot_confusion_matrix(y_test, y_pred)
  print("classification report")
  print(classification_report(y_test, y_pred))

  knn_probs = knn.predict_proba(x_test)[:, 1]

  #calculate roc auc
  knn_roc_value = roc_auc_score(y_test, knn_probs)
  print("KNN roc_value:{0}".format(knn_roc_value))
  fpr, tpr, thresholds = metrics.roc_curve(y_test, knn_probs)
  # Threshold that maximises TPR - FPR (Youden's J statistic).
  threshold = thresholds[np.argmax(tpr - fpr)]
  print("KNN threshold:{0}".format(threshold))

  roc_auc = metrics.auc(fpr, tpr)
  print("ROC for the test dataset", '{:.1%}'.format(roc_auc))
  plt.plot(fpr, tpr, label="Test,auc="+str(roc_auc))
  plt.legend(loc=4)
  plt.show()

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  new_row = pd.DataFrame({'Methodology': Methodology, 'Model': 'KNN', 'Accuracy': KNN_Accuracy,
                          'roc_value': knn_roc_value, 'threshold': threshold}, index=[0])
  df_Results = pd.concat([df_Results, new_row], ignore_index=True)
  return df_Results


In [None]:
#Created a common function to fit and predict on Tree Models for both gini and entropy criteria
def buildingAndRunTreeModels(df_Results,Methodology,x_train,y_train,x_test,y_test):
  """Fit one decision tree per split criterion and record their test metrics.

  Args:
    df_Results: DataFrame of results collected so far (not modified in place).
    Methodology: Label for the sampling / validation scheme, stored in each row.
    x_train, y_train: Training features and labels.
    x_test, y_test: Held-out features and labels used for all reported metrics.

  Returns:
    A new DataFrame: df_Results followed by one row for the 'gini' tree and
    one for the 'entropy' tree.
  """
  #Evaluate Decision Tree model with 'gini' & 'entropy'
  criteria = ['gini', 'entropy']
  result_rows = []
  for c in criteria:
    dt = DecisionTreeClassifier(criterion=c, random_state=42)
    dt.fit(x_train, y_train)
    y_pred = dt.predict(x_test)
    test_score = dt.score(x_test, y_test)
    tree_preds = dt.predict_proba(x_test)[:, 1]
    tree_roc_value = roc_auc_score(y_test, tree_preds)
    print(c + "score: {0}".format(test_score))
    print("Confusion Matrix")
    Plot_confusion_matrix(y_test, y_pred)
    print("classification report")
    print(classification_report(y_test, y_pred))
    print(c + "tree_roc_value: {0}".format(tree_roc_value))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, tree_preds)
    # Threshold that maximises TPR - FPR (Youden's J statistic).
    threshold = thresholds[np.argmax(tpr - fpr)]
    print("Tree threshold: {0}".format(threshold))
    roc_auc = metrics.auc(fpr, tpr)
    print("ROC for the test dataset", '{:.1%}'.format(roc_auc))
    # Draw the curve before asking for a legend; otherwise there is nothing to label.
    plt.plot(fpr, tpr, label="Test, auc="+str(roc_auc))
    plt.legend(loc=4)
    plt.show()

    result_rows.append({'Methodology': Methodology,
                        'Model': 'Tree Model with {0} criteria'.format(c),
                        'Accuracy': test_score,
                        'roc_value': tree_roc_value,
                        'threshold': threshold})

  # Return only after BOTH criteria have been evaluated.
  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  df_Results = pd.concat([df_Results, pd.DataFrame(result_rows)], ignore_index=True)
  return df_Results





In [None]:
#Created a common function to fit and predict on random forest model
def buildingAndRunRandomForestModels(df_Results,Methodology,x_train,y_train,x_test,y_test):
  """Fit a 100-tree random forest and record its test metrics.

  Args:
    df_Results: DataFrame of results collected so far (not modified in place).
    Methodology: Label for the sampling / validation scheme, stored in the row.
    x_train, y_train: Training features and labels.
    x_test, y_test: Held-out features and labels used for all reported metrics.

  Returns:
    A new DataFrame: df_Results followed by one 'Random Forest' row.
  """
  #Create model with 100 trees
  RF_Model = RandomForestClassifier(n_estimators=100,
                                    bootstrap=True,
                                    max_features='sqrt',random_state=42)
  #fit on training data
  RF_Model.fit(x_train, y_train)
  RF_test_score = RF_Model.score(x_test, y_test)

  print('Model Accuracy:{0}'.format(RF_test_score))

  #Actual class predictions
  rf_predictions = RF_Model.predict(x_test)
  print("Confusion Matrix")
  Plot_confusion_matrix(y_test, rf_predictions)
  print("classification report")
  print(classification_report(y_test, rf_predictions))

  #Probability of the positive class
  rf_probs = RF_Model.predict_proba(x_test)[:, 1]

  #calculate roc auc
  roc_value = roc_auc_score(y_test, rf_probs)
  print("Random Forest roc_value:{0}".format(roc_value))
  fpr, tpr, thresholds = metrics.roc_curve(y_test, rf_probs)
  # Threshold that maximises TPR - FPR (Youden's J statistic).
  threshold = thresholds[np.argmax(tpr - fpr)]
  print("Random Forest threshold:{0}".format(threshold))

  roc_auc = metrics.auc(fpr, tpr)
  print("ROC for the test dataset", '{:.1%}'.format(roc_auc))
  plt.plot(fpr, tpr, label="Test,auc="+str(roc_auc))
  plt.legend(loc=4)

  plt.show()

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  new_row = pd.DataFrame({'Methodology': Methodology, 'Model': 'Random Forest', 'Accuracy': RF_test_score,
                          'roc_value': roc_value, 'threshold': threshold}, index=[0])
  df_Results = pd.concat([df_Results, new_row], ignore_index=True)
  return df_Results


In [None]:
#Created a common function to fit and predict on a XGBoost model
def buildingAndRunXGBoostModels(df_Results,Methodology,x_train,y_train,x_test,y_test):
  """Fit an XGBoost classifier with default hyper-parameters and record its test metrics.

  Args:
    df_Results: DataFrame of results collected so far (not modified in place).
    Methodology: Label for the sampling / validation scheme, stored in the row.
    x_train, y_train: Training features and labels.
    x_test, y_test: Held-out features and labels used for all reported metrics.

  Returns:
    A new DataFrame: df_Results followed by one 'XGBoost' row.
  """
  #Evaluate XGBoost Model
  XGBmodel = XGBClassifier(random_state=42)
  XGBmodel.fit(x_train, y_train)
  y_pred = XGBmodel.predict(x_test)
  XGB_test_score = XGBmodel.score(x_test, y_test)
  print('Model_Accuracy:{0}'.format(XGB_test_score))

  print("Confusion Matrix")
  Plot_confusion_matrix(y_test, y_pred)
  print("classification report")
  print(classification_report(y_test, y_pred))
  XGB_probs = XGBmodel.predict_proba(x_test)[:, 1]

  XGB_roc_value = roc_auc_score(y_test, XGB_probs)
  print("XGBoost roc_value:{0}".format(XGB_roc_value))
  fpr, tpr, thresholds = metrics.roc_curve(y_test, XGB_probs)
  # Threshold that maximises TPR - FPR (Youden's J statistic).
  threshold = thresholds[np.argmax(tpr - fpr)]
  print("XGBoost threshold:{0}".format(threshold))

  roc_auc = metrics.auc(fpr, tpr)
  print("ROC for the test dataset", '{:.1%}'.format(roc_auc))
  plt.plot(fpr, tpr, label="Test,auc="+str(roc_auc))
  plt.legend(loc=4)

  plt.show()

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  new_row = pd.DataFrame({'Methodology': Methodology, 'Model': 'XGBoost', 'Accuracy': XGB_test_score,
                          'roc_value': XGB_roc_value, 'threshold': threshold}, index=[0])
  df_Results = pd.concat([df_Results, new_row], ignore_index=True)
  return df_Results







In [None]:
#created a common function to fit and predict on SVM model
def buildingAndRunSVMModels(df_Results,Methodology,x_train,y_train,x_test,y_test):
  """Fit a sigmoid-kernel SVM and record its test metrics.

  Args:
    df_Results: DataFrame of results collected so far (not modified in place).
    Methodology: Label for the sampling / validation scheme, stored in the row.
    x_train, y_train: Training features and labels.
    x_test, y_test: Held-out features and labels used for all reported metrics.

  Returns:
    A new DataFrame: df_Results followed by one 'SVM' row.
  """
  from sklearn.svm import SVC
  from sklearn.metrics import accuracy_score
  from sklearn.metrics import roc_auc_score

  # A single model with probability=True serves both the class predictions
  # and the probabilities needed for the ROC curve, so the SVM is trained
  # once instead of twice.
  classifier = SVC(kernel='sigmoid', probability=True, random_state=42)
  classifier.fit(x_train, y_train)
  y_pred_SVM = classifier.predict(x_test)
  SVM_score = accuracy_score(y_test, y_pred_SVM)
  print('Accuracy_score:{0}'.format(SVM_score))

  print("Confusion Matrix")
  Plot_confusion_matrix(y_test, y_pred_SVM)
  print("classification report")
  print(classification_report(y_test, y_pred_SVM))

  svm_probs = classifier.predict_proba(x_test)[:, 1]
  roc_value = roc_auc_score(y_test, svm_probs)
  print("SVM roc_value:{0}".format(roc_value))
  fpr, tpr, thresholds = metrics.roc_curve(y_test, svm_probs)
  # Threshold that maximises TPR - FPR (Youden's J statistic).
  threshold = thresholds[np.argmax(tpr - fpr)]
  print("SVM threshold:{0}".format(threshold))

  roc_auc = metrics.auc(fpr, tpr)
  print("ROC for the test dataset", '{:.1%}'.format(roc_auc))
  plt.plot(fpr, tpr, label="Test,auc="+str(roc_auc))
  plt.legend(loc=4)

  plt.show()

  # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
  new_row = pd.DataFrame({'Methodology': Methodology, 'Model': 'SVM', 'Accuracy': SVM_score,
                          'roc_value': roc_value, 'threshold': threshold}, index=[0])
  df_Results = pd.concat([df_Results, new_row], ignore_index=True)
  return df_Results




PERFORM CROSS VALIDATION WITH REPEATEDKFOLD

In [None]:
from sklearn.model_selection import RepeatedKFold

# 5 folds repeated 10 times. A fixed random_state makes the splits (and every
# metric computed from them) reproducible across kernel restarts.
rkf=RepeatedKFold(n_splits=5,n_repeats=10,random_state=42)
# Each iteration overwrites the *_cv variables, so only the split produced by
# the final iteration is left for the cells below.
for train_index,test_index in rkf.split(x,y):
  print("TRAIN:",train_index,"TEST:",test_index)
  x_train_cv,x_test_cv=x.iloc[train_index],x.iloc[test_index]
  y_train_cv,y_test_cv=y.iloc[train_index],y.iloc[test_index]

In [None]:
# Run every model on the RepeatedKFold split and collect the metrics in df_Results.
# This is deliberately top-level code: wrapping it in a function named
# buildAndRunLogisticModels would replace the real model builder and make the
# calls below recurse into themselves.
repeated_kfold_runs = [
    ("Logistic Regression with L1 and L2 Regularisation", buildAndRunLogisticModels),
    ("KNN Model", buildAndRunKNNModels),
    ("Decision Tree Models with 'gini' & 'entropy' criteria", buildingAndRunTreeModels),
    ("Random Forest Model", buildingAndRunRandomForestModels),
    ("XGBoost Model", buildingAndRunXGBoostModels),
    ("SVM Model with Sigmoid Kernel", buildingAndRunSVMModels),
]

for run_title, build_and_run in repeated_kfold_runs:
  print(run_title)
  start_time = time.time()
  df_Results = build_and_run(df_Results,"RepeatedKFold Cross Validation", x_train_cv,y_train_cv, x_test_cv,y_test_cv)
  print("Time Taken by Model: ---%s seconds ---"% (time.time() - start_time))
  print('-'*60 )













In [None]:
# Display the results table accumulated so far.
df_Results

PERFORM CROSS VALIDATION WITH STRATIFIEDKFOLD

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=None, shuffle=True)

# Fill missing labels with the most frequent class. A mean would produce a
# value that is not a class at all, and StratifiedKFold needs discrete labels.
y_imputer = SimpleImputer(strategy="most_frequent")
y_imputed = y_imputer.fit_transform(y.values.reshape(-1, 1))
# Rebuilding the Series resets its index to 0..n-1 and keeps the labels integer.
y = pd.Series(y_imputed.flatten().astype(int))

skf = StratifiedKFold(n_splits=5, random_state=None)
# Iterate the stratified splitter (not the plain KFold above) so that every
# fold keeps the class ratio. Each iteration overwrites the *_SKF_cv
# variables, so only the final split is left for the cells below.
for train_index, test_index in skf.split(x,y):
  print("TRAIN:", train_index, "TEST:", test_index)
  x_train_SKF_cv, x_test_SKF_cv = x.iloc[train_index], x.iloc[test_index]
  y_train_SKF_cv, y_test_SKF_cv = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Run every model on the StratifiedKFold split and collect the metrics in df_Results.
# This is deliberately top-level code: wrapping it in a function named
# buildAndRunLogisticModels would replace the real model builder and make the
# calls below recurse into themselves.
stratified_kfold_runs = [
    ("Logistic Regression with L1 and L2 Regularisation", buildAndRunLogisticModels),
    ("KNN Model", buildAndRunKNNModels),
    ("Decision Tree Models with 'gini' & 'entropy' criteria", buildingAndRunTreeModels),
    ("Random Forest Model", buildingAndRunRandomForestModels),
    ("XGBoost Model", buildingAndRunXGBoostModels),
    ("SVM Model with Sigmoid Kernel", buildingAndRunSVMModels),
]

for run_title, build_and_run in stratified_kfold_runs:
  print(run_title)
  start_time = time.time()
  # Use the split produced by the StratifiedKFold cell, not the RepeatedKFold one.
  df_Results = build_and_run(df_Results,"StratifiedKFold Cross Validation", x_train_SKF_cv,y_train_SKF_cv, x_test_SKF_cv,y_test_SKF_cv)
  print("Time Taken by Model: ---%s seconds ---"% (time.time() - start_time))
  print('-'*60 )













In [None]:
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


import pandas as pd

# Load data from CSV file (replace the path with your actual file path)
data = pd.read_csv('/content/creditcard.csv')

# This model only looks at the two raw, un-anonymised columns.
lr_feature_names = ['Amount', 'Time']

label_encoder = LabelEncoder()
y_all_encoded = label_encoder.fit_transform(data['Class'].values)

# Hold out a stratified 20% so the model is scored on rows it never saw and
# on labels that belong to exactly those rows.
x_train_SKF_cv, x_test_raw, y_train_encoded, y_test_encoded = train_test_split(
    data[lr_feature_names].values, y_all_encoded,
    test_size=0.20, random_state=100, stratify=y_all_encoded)
y_train_SKF_cv = y_train_encoded


# Handling Missing Values with Imputation (statistics learned on the training part only)
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train_SKF_cv)


# Standard Scaling (fitted on the training part, then applied to the test part).
# Wrapping the arrays in DataFrames lets the fitted model remember the feature names.
scaler = StandardScaler()
x_train_scaled = pd.DataFrame(scaler.fit_transform(x_train_imputed), columns=lr_feature_names)
x_test_scaled = pd.DataFrame(scaler.transform(imputer.transform(x_test_raw)), columns=lr_feature_names)


# KFold for Cross-Validation
kf = KFold(n_splits=5, random_state=None, shuffle=True)
cv_num = KFold(n_splits=10, shuffle=True, random_state=42)

num_C = list(np.power(10.0, np.arange(-10, 11)))

# Logistic Regression with Cross-Validation
# (multi_class is omitted: it is deprecated in scikit-learn and has no effect
# on a binary target.)
clf = linear_model.LogisticRegressionCV(
    Cs=num_C,
    penalty='l2',
    scoring='roc_auc',
    cv=cv_num,
    random_state=42,
    max_iter=10000,
    fit_intercept=True,
    solver='newton-cg',
    tol=1e-10
)

# Train on the SAME representation (imputed + scaled) that is used for prediction.
clf.fit(x_train_scaled, y_train_encoded)


# Model Evaluation
y_pred_probs_l2 = clf.predict_proba(x_test_scaled)[:, 1]
y_pred_l2 = clf.predict(x_test_scaled)

# Accuracy
Accuracy_l2 = accuracy_score(y_pred=y_pred_l2, y_true=y_test_encoded)
print("Accuracy of Logistic model with l2 regularization: {0}".format(Accuracy_l2))

# ROC AUC
l2_roc_value = roc_auc_score(y_test_encoded, y_pred_probs_l2)
print("l2 roc_value: {0}".format(l2_roc_value))

# ROC Curve: pick the threshold that maximises TPR - FPR (Youden's J statistic)
fpr, tpr, thresholds = roc_curve(y_test_encoded, y_pred_probs_l2)
threshold = thresholds[np.argmax(tpr - fpr)]
print("l2 threshold: {0}".format(threshold))


In [None]:
# Coefficients learned by the fitted logistic regression `clf`.
clf.coef_

In [None]:
# Pair every coefficient of the binary model with the feature it belongs to.
coef_values = clf.coef_[0]

# Prefer the names the model recorded at fit time. Fall back to x's columns
# only when their count matches the coefficients; pairing a different number
# of names and values would silently misalign or NaN-pad the table.
coef_feature_names = getattr(clf, 'feature_names_in_', None)
if coef_feature_names is None:
  if len(x.columns) == len(coef_values):
    coef_feature_names = x.columns
  else:
    coef_feature_names = ['feature_{0}'.format(i) for i in range(len(coef_values))]

coefficients = pd.DataFrame({'Feature': list(coef_feature_names),
                             'Importance Coefficient': coef_values})

In [None]:
# Display the feature / coefficient table.
coefficients

In [None]:
# Bar chart of the logistic-regression coefficient of each feature.
plt.figure(figsize=(20,5))
# The y column name must match the DataFrame column exactly (capital 'C').
sns.barplot(x='Feature', y='Importance Coefficient', data=coefficients)
plt.title("Logistic Regression with l2 regularisation feature importance", fontsize=18)

plt.show()

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import RandomOverSampler

skf = StratifiedKFold(n_splits=5, random_state=None)

# Each iteration overwrites the variables below, so the train/test split and
# the oversampled training set of the final fold are what later cells use.
for fold, (train_index, test_index) in enumerate(skf.split(x,y),1):
  # split() yields positions, so select rows with iloc.
  x_train = x.iloc[train_index]
  y_train = y.iloc[train_index]
  x_test = x.iloc[test_index]
  # Test labels come from y at the TEST positions.
  y_test = y.iloc[test_index]
  # Oversample the training part only; sampling_strategy=0.5 grows the
  # minority class to half the size of the majority class.
  ROS=RandomOverSampler(sampling_strategy=0.5, random_state=42)
  x_over, y_over= ROS.fit_resample(x_train, y_train)

  x_over = pd.DataFrame(data=x_over, columns=cols)


In [None]:
# Train every model on the randomly oversampled training set and score it on
# the untouched test fold, calling each model's own builder.
Data_Imbalance_Handling = "RandomOversampling with StratifiedKFold CV"

random_oversampling_runs = [
    ("Logistic Regression with l1 and l2 regularisation", buildAndRunLogisticModels),
    ("KNN Model", buildAndRunKNNModels),
    ("Decision Tree Models with 'gini' and 'entropy' criteria", buildingAndRunTreeModels),
    ("Random Forest Model", buildingAndRunRandomForestModels),
    ("XGBoost Model", buildingAndRunXGBoostModels),
]

for run_title, build_and_run in random_oversampling_runs:
  print(run_title)
  start_time = time.time()
  df_Results = build_and_run(df_Results, Data_Imbalance_Handling, x_over, y_over, x_test, y_test)
  print("Time Taken by Model:---%s seconds---"%(time.time() - start_time))
  print('-'*60)




In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn import over_sampling

skf = StratifiedKFold(n_splits=5, random_state=None)

# Each iteration overwrites the variables below, so the train/test split and
# the SMOTE-resampled training set of the final fold are what later cells use.
for fold, (train_index, test_index) in enumerate(skf.split(x,y),1):
  # split() yields positions, so select rows with iloc.
  x_train = x.iloc[train_index]
  y_train = y.iloc[train_index]
  x_test = x.iloc[test_index]
  # Test labels come from y at the TEST positions.
  y_test = y.iloc[test_index]
  # Synthesise minority-class samples for the training part only.
  SMOTE=over_sampling.SMOTE(random_state=0)
  x_train_Smote, y_train_Smote= SMOTE.fit_resample(x_train, y_train)

  x_train_Smote = pd.DataFrame(data=x_train_Smote, columns=cols)


In [None]:
# Train every model on the SMOTE-resampled training set and score it on the
# untouched test fold, calling each model's own builder.
Data_Imbalance_Handling = "SMOTE Oversampling with StratifiedKFold CV"

smote_runs = [
    ("Logistic Regression with l1 and l2 regularisation", buildAndRunLogisticModels),
    ("KNN Model", buildAndRunKNNModels),
    ("Decision Tree Models with 'gini' and 'entropy' criteria", buildingAndRunTreeModels),
    ("Random Forest Model", buildingAndRunRandomForestModels),
    ("XGBoost Model", buildingAndRunXGBoostModels),
]

for run_title, build_and_run in smote_runs:
  print(run_title)
  start_time = time.time()
  df_Results = build_and_run(df_Results, Data_Imbalance_Handling, x_train_Smote, y_train_Smote, x_test, y_test)
  print("Time Taken by Model:---%s seconds---"%(time.time() - start_time))
  print('-'*80)




In [None]:
# Display the results table accumulated so far.
df_Results

In [None]:
from sklearn.model_selection import StratifiedKFold
from imblearn import over_sampling

skf = StratifiedKFold(n_splits=5, random_state=None)

# Each iteration overwrites the variables below, so the train/test split and
# the ADASYN-resampled training set of the final fold are what later cells use.
for fold, (train_index, test_index) in enumerate(skf.split(x,y),1):
  # split() yields positions, so select rows with iloc.
  x_train = x.iloc[train_index]
  y_train = y.iloc[train_index]
  x_test = x.iloc[test_index]
  # Test labels come from y at the TEST positions.
  y_test = y.iloc[test_index]
  # Use the ADASYN sampler itself; instantiating SMOTE here would just repeat
  # the previous experiment under a different label.
  ADASYN=over_sampling.ADASYN(random_state=0)
  x_train_ADASYN, y_train_ADASYN= ADASYN.fit_resample(x_train, y_train)

  x_train_ADASYN = pd.DataFrame(data=x_train_ADASYN, columns=cols)


In [None]:
# Train every model on the ADASYN-resampled training set and score it on the
# untouched test fold, calling each model's own builder.
Data_Imbalance_Handling = "ADASYN Oversampling with StratifiedKFold CV"

adasyn_runs = [
    ("Logistic Regression with l1 and l2 regularisation", buildAndRunLogisticModels),
    ("KNN Model", buildAndRunKNNModels),
    ("Decision Tree Models with 'gini' and 'entropy' criteria", buildingAndRunTreeModels),
    ("Random Forest Model", buildingAndRunRandomForestModels),
    ("XGBoost Model", buildingAndRunXGBoostModels),
]

for run_title, build_and_run in adasyn_runs:
  print(run_title)
  start_time = time.time()
  df_Results = build_and_run(df_Results, Data_Imbalance_Handling, x_train_ADASYN, y_train_ADASYN, x_test, y_test)
  print("Time Taken by Model:---%s seconds---"%(time.time() - start_time))
  print('-'*80)




In [None]:
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_test={
    'max_depth':list(range(3,10,2)),
    'min_child_weight':list(range(1,6,2)),
    # NOTE(review): range(60,130,150) contains only the value 60 — confirm
    # whether a wider grid was intended.
    'n_estimators':list(range(60,130,150)),
    # range() only accepts integers, so the learning rates are listed explicitly.
    'learning_rate':[0.05,0.1,0.125,0.15,0.2],
    'subsample':[i/10.0 for i in range(7,10)],
    'gamma':[i/10.0 for i in range(0,5)],
    'colsample_bytree':[i/10.0 for i in range(7,10)]

}

# The None-valued legacy arguments (missing, nthread, seed, silent) are left
# out; they only restated defaults and are not all accepted by current xgboost.
gsearch1 = RandomizedSearchCV(estimator = XGBClassifier(base_score=0.5,booster='gbtree',colsample_bylevel=1,
                                                        colsample_bynode=1,max_delta_step=0,
                                                        n_jobs=-1,
                                                        objective='binary:logistic',random_state=42,
                                                        reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                                        verbosity=1),
                              param_distributions=param_test, n_iter=5, scoring='roc_auc',
                              n_jobs=-1, cv=5,
                              random_state=42)  # fixed seed -> the same 5 candidates on every run

gsearch1.fit(x_over, y_over)
gsearch1.cv_results_, gsearch1.best_params_,gsearch1.best_score_




In [None]:
from xgboost import XGBClassifier

# Final XGBoost model with hand-picked hyper-parameters. Each keyword appears
# exactly once (a repeated keyword argument is a SyntaxError), and the
# None-valued legacy arguments (missing, nthread, seed, silent) are left out.
clf=XGBClassifier(base_score=0.5, booster='gbtree',colsample_bylevel=1,
                  colsample_bynode=1,colsample_bytree=0.7, gamma=0.2,
                  learning_rate=0.125, n_estimators=60,n_jobs=1,
                  min_child_weight=5,
                  objective='binary:logistic',random_state=42,
                  reg_alpha=0,reg_lambda=1,scale_pos_weight=1,
                  subsample=0.8,verbosity=1)

clf.fit(x_over,y_over)
XGB_test_score=clf.score(x_test,y_test)
print('Model Accuracy:{0}'.format(XGB_test_score))

# Probability of the positive class from THIS model, needed for the ROC AUC.
XGB_probs=clf.predict_proba(x_test)[:,1]
XGB_roc_value=roc_auc_score(y_test,XGB_probs)
print('XGBoost roc_value:{0}'.format(XGB_roc_value))



