# Credit Card Fraud Detection with Logistic Regression


In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the uploaded CSV file into a DataFrame.
# In the recorded run the frame has 284807 rows and 31 columns, and the
# 'Class' label column is read as quoted strings ("'0'" / "'1'"), which
# later cells compare against literally.
creditCard = pd.read_csv('creditcard_csv.csv')


In [None]:
# Show the (rows, columns) dimensions of the loaded table
print(creditCard.shape)

In [24]:
# Print each column's name, non-null count and dtype
creditCard.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

**Checking for missing data in each column**

In [25]:
# Count the missing entries in every column
creditCard.isna().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [26]:
# Tally how many rows carry each 'Class' label (legal vs. fraudulent)
class_counts = creditCard.loc[:, 'Class'].value_counts()
display(class_counts)

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
'0',284315
'1',492


In [27]:
# Split the table by label. 'Class' holds the quoted strings "'0'" (legal)
# and "'1'" (fraudulent), so the comparisons use those exact literals.
legal_transactions = creditCard.loc[creditCard['Class'].eq("'0'")]
fraudulent_transactions = creditCard.loc[creditCard['Class'].eq("'1'")]

# Sanity check: report the dimensions of each subset
print("Shape of legal transactions dataframe:", legal_transactions.shape)
print("Shape of fraudulent transactions dataframe:", fraudulent_transactions.shape)

Shape of legal transactions dataframe: (284315, 31)
Shape of fraudulent transactions dataframe: (492, 31)


In [28]:
# Summary statistics of the transaction amount for legal transactions
legal_transactions['Amount'].describe()

Unnamed: 0,Amount
count,284315.0
mean,88.291022
std,250.105092
min,0.0
25%,5.65
50%,22.0
75%,77.05
max,25691.16


In [29]:
# Summary statistics of the transaction amount for fraudulent transactions
fraudulent_transactions['Amount'].describe()

Unnamed: 0,Amount
count,492.0
mean,122.211321
std,256.683288
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


Compare the mean feature values for both transaction classes.


In [30]:
# Per-class mean of every remaining column in the full dataset
creditCard.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'0',94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
'1',80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [31]:
# Balance the classes: randomly draw (with a fixed seed) as many legal rows
# as there are fraudulent rows
legal_transactions_undersampled = legal_transactions.sample(
    n=fraudulent_transactions.shape[0],
    random_state=42,
)

# Stack the sampled legal rows on top of all fraudulent rows (row-wise concat)
undersampled_creditCard = pd.concat(
    [legal_transactions_undersampled, fraudulent_transactions]
)

# Show the label counts of the balanced frame
print("Class distribution after undersampling:")
display(undersampled_creditCard['Class'].value_counts())

Class distribution after undersampling:


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
'0',492
'1',492


In [32]:
# Per-class mean of every remaining column in the balanced (undersampled) dataset
undersampled_creditCard.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'0',95052.75813,0.153312,0.009649,-0.038029,-0.027323,0.061966,-0.053962,0.013795,0.014911,0.037348,...,0.01503,0.014059,-0.020781,0.013223,-0.007257,0.024646,-0.027696,0.01107,-0.002305,80.348354
'1',80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [33]:
# Features are every column except the label; the label column is the target
X = undersampled_creditCard.drop(columns='Class')
y = undersampled_creditCard.loc[:, 'Class']

# Hold out 20% of the rows for testing; stratifying on y keeps the label
# proportions the same in both parts, and the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the dimensions of each resulting piece
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (787, 30)
Shape of X_test: (197, 30)
Shape of y_train: (787,)
Shape of y_test: (197,)


In [35]:
# Instantiate a Logistic Regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [37]:
# Train the Logistic Regression model on the raw (unscaled) training features.
# In the recorded run this stopped at the iteration limit with a convergence
# warning; the same model is refit on scaled features in a later cell.
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the training split only
scaler = StandardScaler().fit(X_train)

# Apply that same fitted scaler to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Now that we have scaled the data, let's retrain the Logistic Regression model on the scaled training data.

In [40]:
# Retrain the same `model` object on the standardized training features;
# this replaces the parameters learned from the earlier unscaled fit
model.fit(X_train_scaled, y_train)

In [41]:
# Predict on the held-out test split. X_test_scaled was already produced by
# the fitted scaler in the scaling cell, so it is reused rather than recomputed,
# and accuracy_score is already imported in the notebook's first cell.
y_pred = model.predict(X_test_scaled)

# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy Score: {accuracy}")

Accuracy Score: 0.9644670050761421


In [42]:
# Predicted labels for the scaled training rows
y_train_pred = model.predict(X_train_scaled)

# Share of training rows whose predicted label matches the true label
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)

print(f"Accuracy Score on Training Data: {train_accuracy}")

Accuracy Score on Training Data: 0.9466327827191868
