# m

TASK 2

CREDIT CARD FRAUD DETECTION

• Develop a machine learning model designed to detect fraudulent credit card
transactions.
• The process involves preprocessing and normalizing transaction data, addressing
class imbalance concerns, and partitioning the dataset into training and testing
subsets.
• Train a classification algorithm—like logistic regression or random forests—to
differentiate between fraudulent and legitimate transactions.
• Assess the model's efficacy using metrics such as precision, recall, and F1-score.
• Additionally, explore strategies like oversampling or undersampling to enhance
outcomes and refine the model's performance.
• DOWNLOAD THE DATASET HERE

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as snsa

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [3]:
df=pd.read_csv(r'C:\Users\Kishore\Downloads\archive (8)\creditcard.csv',encoding='ISO-8859-1')

In [4]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [6]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [7]:
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [8]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [10]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

#this data set is highly unbalanced
0=Normal transaction
1=Fraudlent Transaction


In [14]:
#seperating the data for analysis**bold**

In [20]:
legit = df[df['Class'] == 0]
fraud=df[df['Class']==1]

In [26]:
print(legit.shape)

(284315, 31)


In [28]:
print(fraud.shape)

(492, 31)


In [30]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [32]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [34]:
df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [36]:
leg=legit.sample(n=492)

In [46]:
# Concatenate the DataFrames
new_df = pd.concat([leg, fraud], axis=0, ignore_index=True)
print(new_df)

         Time        V1        V2        V3        V4        V5        V6  \
0     79422.0 -0.371846  1.050419  1.180052 -0.125688  0.264026 -0.490822   
1    154393.0 -1.153917  0.103912  1.531211 -0.405246  1.456286  0.610230   
2       714.0  1.174748  0.214066  0.349599  0.921342 -0.160510 -0.319185   
3    115033.0  1.882681  0.296086 -0.152866  4.050789 -0.032225  0.251325   
4     65461.0  1.205055  0.134758  0.513392  0.515790 -0.483457 -0.679856   
..        ...       ...       ...       ...       ...       ...       ...   
979  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
980  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
981  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
982  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
983  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

           V7        V8        V9  ...       V21       V22       V23  \
0  

In [50]:
new_df.shape

(984, 31)

In [56]:
new_df.sample(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
97,144335.0,-4.520823,4.583674,-2.341541,-0.820654,-0.317298,1.940869,-4.123484,-12.909527,0.679429,...,13.397103,-3.373476,1.741238,-0.808488,-0.161056,-0.039978,0.663529,0.30736,2.99,0
111,69581.0,-0.753779,1.212756,0.765052,-0.129905,0.04647,-0.061574,0.142353,0.695358,-0.709854,...,-0.167306,-0.586391,0.039243,-0.378552,-0.214498,0.108864,0.126268,0.017508,9.99,0
936,152098.0,-4.124316,3.748597,-7.926507,7.763242,-0.769375,-2.031171,-3.474549,0.107121,-1.551352,...,0.547097,0.687854,0.429939,-0.620601,-0.369688,0.367349,-2.664767,0.417101,1.0,1
571,28726.0,-29.876366,16.434525,-30.558697,6.505862,-21.665654,-4.940356,-20.081391,19.587773,-3.591491,...,1.812954,-2.232252,-1.412803,0.178731,2.156042,-0.209385,1.255649,0.36453,99.99,1
538,18675.0,-12.339603,4.488267,-16.587073,10.107274,-10.420199,0.13067,-15.600323,-1.157696,-5.304631,...,-2.08961,1.745315,1.376816,-0.554271,-1.610741,0.153725,1.212477,-1.86929,188.78,1


In [60]:
new_df['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [62]:
new_df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,95201.319106,0.071041,0.045941,-0.039311,-0.028586,-0.001112,-0.038244,-0.058521,-0.024139,-0.05239,...,0.001466,-0.026846,-0.031365,0.037388,0.051437,0.007052,0.031975,0.00021,0.021016,76.866179
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [68]:
x=new_df.drop(columns='Class',axis=1)
y=new_df['Class']

In [70]:
y

0      0
1      0
2      0
3      0
4      0
      ..
979    1
980    1
981    1
982    1
983    1
Name: Class, Length: 984, dtype: int64

In [72]:
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,79422.0,-0.371846,1.050419,1.180052,-0.125688,0.264026,-0.490822,0.573646,0.102433,-0.616212,...,0.130156,-0.234303,-0.645515,-0.068216,-0.050275,-0.150002,0.075141,0.240876,0.084847,0.89
1,154393.0,-1.153917,0.103912,1.531211,-0.405246,1.456286,0.610230,0.737783,0.163884,-0.214385,...,0.158984,0.057513,-0.015424,-0.326152,-0.182328,0.883563,-0.660990,-0.165105,-0.161703,70.00
2,714.0,1.174748,0.214066,0.349599,0.921342,-0.160510,-0.319185,0.012183,-0.019382,-0.187794,...,-0.062707,-0.030380,-0.137945,-0.082966,-0.057925,0.531052,-0.458175,0.017021,0.016661,24.95
3,115033.0,1.882681,0.296086,-0.152866,4.050789,-0.032225,0.251325,-0.213056,0.061197,-0.190995,...,-0.358383,0.084262,0.527348,0.092488,-0.016210,0.076581,0.139073,-0.001819,-0.048571,0.00
4,65461.0,1.205055,0.134758,0.513392,0.515790,-0.483457,-0.679856,-0.075024,-0.024697,-0.121116,...,-0.109356,-0.209238,-0.686433,0.125841,0.307409,0.164239,0.078658,-0.038113,0.008569,7.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
980,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
981,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
982,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


In [76]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42,stratify=y)

In [78]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
ss.fit(x_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

In [80]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [82]:
# Dictionary to hold different machine learning models with their corresponding classifier instances
models = {
    'K-nn': KNeighborsClassifier(),
    'SVM': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
    'ET': ExtraTreeClassifier(),
    'BC': BaggingClassifier(),
    'GBC': GradientBoostingClassifier(),
    'AAD': AdaBoostClassifier(),
    'LR': LogisticRegression(),
    'GS': GaussianNB(),
}
# Iterate over each model in the models dictionary
for name, md in models.items():
    md.fit(x_train, y_train)
    y_pred = md.predict(x_test)
    print(f"{name} with accuracy: {accuracy_score(y_test, y_pred)}")

K-nn with accuracy: 0.9290540540540541
SVM with accuracy: 0.9493243243243243
DT with accuracy: 0.8885135135135135
RF with accuracy: 0.956081081081081
ET with accuracy: 0.8986486486486487
BC with accuracy: 0.9222972972972973
GBC with accuracy: 0.9324324324324325




AAD with accuracy: 0.9324324324324325
LR with accuracy: 0.9425675675675675
GS with accuracy: 0.9155405405405406
