Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
!pip install -U -q PyDrive

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
  
  
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# loading the dataset to a Pandas DataFrame
link =  'https://drive.google.com/file/d/1y2g5lIU9bLcDBoafJXD9ORKQoAm4hCqj/view?usp=sharing'

import pandas as pd

# to get the id part of the file
id = link.split("/")[-2]

downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('primary_account(2).csv')

credit_card_data = pd.read_csv('primary_account(2).csv', sep = ',')

In [4]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,id,amount,available_balance,primary_account_id,Class
0,15,133,1000.0,11,0
1,17,10,990.0,11,1
2,19,170,6545.0,11,0
3,21,10,6535.0,11,1
4,23,100,6435.0,11,0


In [5]:
credit_card_data.tail()

Unnamed: 0,id,amount,available_balance,primary_account_id,Class
14,48,5,20.0,1,1
15,49,100,1000.0,53,0
16,57,10,935.0,2,0
17,59,10,6537.0,11,0
18,61,10,10.0,1,0


In [6]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  19 non-null     int64  
 1   amount              19 non-null     int64  
 2   available_balance   19 non-null     float64
 3   primary_account_id  19 non-null     int64  
 4   Class               19 non-null     int64  
dtypes: float64(1), int64(4)
memory usage: 888.0 bytes


In [7]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

id                    0
amount                0
available_balance     0
primary_account_id    0
Class                 0
dtype: int64

In [8]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

0    15
1     4
Name: Class, dtype: int64

This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [10]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [11]:
print(legit.shape)
print(fraud.shape)

(15, 5)
(4, 5)


In [13]:
# statistical measures of the data
legit.amount.describe()

count     15.000000
mean      56.400000
std       53.908388
min        5.000000
25%       10.000000
50%       55.000000
75%      100.000000
max      170.000000
Name: amount, dtype: float64

In [14]:
fraud.amount.describe()

count     4.000000
mean     23.000000
std      29.427878
min       5.000000
25%       8.750000
50%      10.000000
75%      24.250000
max      67.000000
Name: amount, dtype: float64

In [15]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,id,amount,available_balance,primary_account_id
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,39.6,56.4,3782.533333,10.666667
1,30.75,23.0,3486.0,8.5


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [16]:
legit_sample = legit.sample(n=492)

ValueError: ignored

Concatenating two DataFrames

In [17]:
new_dataset = pd.concat([legit, fraud], axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,id,amount,available_balance,primary_account_id,Class
0,15,133,1000.0,11,0
2,19,170,6545.0,11,0
4,23,100,6435.0,11,0
5,28,55,6446.0,11,0
6,33,10,6456.0,11,0


In [19]:
new_dataset.tail()

Unnamed: 0,id,amount,available_balance,primary_account_id,Class
18,61,10,10.0,1,0
1,17,10,990.0,11,1
3,21,10,6535.0,11,1
8,37,67,6399.0,11,1
14,48,5,20.0,1,1


In [20]:
new_dataset['Class'].value_counts()

0    15
1     4
Name: Class, dtype: int64

In [21]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,id,amount,available_balance,primary_account_id
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,39.6,56.4,3782.533333,10.666667
1,30.75,23.0,3486.0,8.5


Splitting the data into Features & Targets

In [22]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [23]:
print(X)

    id  amount  available_balance  primary_account_id
0   15     133             1000.0                  11
2   19     170             6545.0                  11
4   23     100             6435.0                  11
5   28      55             6446.0                  11
6   33      10             6456.0                  11
7   35      10             6466.0                  11
9   39       8             6391.0                  11
10  41      66             6547.0                  11
11  43     104             1000.0                   2
12  45      55              945.0                   2
13  47       5               25.0                   1
15  49     100             1000.0                  53
16  57      10              935.0                   2
17  59      10             6537.0                  11
18  61      10               10.0                   1
1   17      10              990.0                  11
3   21      10             6535.0                  11
8   37      67             6

In [24]:
print(Y)

0     0
2     0
4     0
5     0
6     0
7     0
9     0
10    0
11    0
12    0
13    0
15    0
16    0
17    0
18    0
1     1
3     1
8     1
14    1
Name: Class, dtype: int64


Split the data into Training data & Testing Data

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [26]:
print(X.shape, X_train.shape, X_test.shape)

(19, 4) (15, 4) (4, 4)


Model Training

Logistic Regression

In [27]:
model = LogisticRegression()

In [28]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

Model Evaluation

Accuracy Score

In [29]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [30]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.8666666666666667


In [31]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [32]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.75



continuité video 3


In [33]:
y_pred = np.array(model.predict(X_test))
y = np.array(Y_test )

In [34]:
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix

In [35]:
print(confusion_matrix(Y_test,y_pred))

[[3 0]
 [1 0]]


In [36]:
    print("Accuracy Score :")
    print(accuracy_score(Y_test,y_pred))
    print("Classification Report :")
    print(classification_report(Y_test,y_pred))

Accuracy Score :
0.75
Classification Report :
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         3
           1       0.00      0.00      0.00         1

    accuracy                           0.75         4
   macro avg       0.38      0.50      0.43         4
weighted avg       0.56      0.75      0.64         4



  _warn_prf(average, modifier, msg_start, len(result))
