In [12]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


/kaggle/input/credit-card-fraud/README.md
/kaggle/input/credit-card-fraud/datapackage.json
/kaggle/input/credit-card-fraud/archive/creditcard.csv
/kaggle/input/credit-card-fraud/data/creditcard_csv.csv
/kaggle/input/credit-card-fraud/data/creditcard_json.json
/kaggle/input/credit-card-fraud/data/validation_report.json


<h1 style="font-family: 'Lucida Sans', 'Lucida Sans Regular', 'Lucida Grande', 'Lucida Sans Unicode', Geneva, Verdana, sans-serif; text-align: center; color: #3A405A;">📊 Data Analysis and Visualization</h1>

In [13]:
df = pd.read_csv('/kaggle/input/credit-card-fraud/data/creditcard_csv.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,'0'
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,'0'
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,'0'
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,'0'
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,'0'


In [14]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

# 0 for normal, 1 for fraud

In [15]:
df.Class.value_counts()

'0'    284315
'1'       492
Name: Class, dtype: int64

In [16]:
df.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class      object
dtype: object

In [17]:
df.Amount

0         149.62
1           2.69
2         378.66
3         123.50
4          69.99
           ...  
284802      0.77
284803     24.79
284804     67.88
284805     10.00
284806    217.00
Name: Amount, Length: 284807, dtype: float64

# Strip the ' ', and convert to int

In [18]:
df['Class'] = df['Class'].str.strip("'")
df['Class'] = df['Class'].astype(int)
df.Class

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

# The classes are unbalanced(good & fraud)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

X = df.drop('Class', axis=1)
y = df.Class
#Scaler for X values --> more velocity
standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-NN': KNeighborsClassifier(),
    'CatBoost': CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, verbose=0,loss_function='Logloss', random_seed=42),
    'SVM': SVC()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f'{name} Metrics:')
    print(f'Accuracy: {accuracy:.2f}')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-score: {f1:.2f}')
    print('-' * 40)
    

Logistic Regression Metrics:
Accuracy: 1.00
Precision: 0.85
Recall: 0.60
F1-score: 0.70
----------------------------------------
Decision Tree Metrics:
Accuracy: 1.00
Precision: 0.72
Recall: 0.78
F1-score: 0.75
----------------------------------------
Random Forest Metrics:
Accuracy: 1.00
Precision: 0.94
Recall: 0.80
F1-score: 0.86
----------------------------------------
K-NN Metrics:
Accuracy: 1.00
Precision: 0.85
Recall: 0.76
F1-score: 0.80
----------------------------------------
CatBoost Metrics:
Accuracy: 1.00
Precision: 0.96
Recall: 0.78
F1-score: 0.86
----------------------------------------
SVM Metrics:
Accuracy: 1.00
Precision: 0.91
Recall: 0.61
F1-score: 0.73
----------------------------------------


<div style="background-color: #E8EAF6; padding: 20px; border-radius: 10px; box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.1);">
    <h2 style="font-family: 'Verdana'; color: #3A405A;">🔍 Insights</h2>
    <p style="font-size: 18px; font-family: 'Verdana'; color: #3A405A; line-height: 1.5em;">The classes are very unbalanced, but balancing them I do not think is the best solution because in real life you do not know which are frauds and which are not. That's why we have to work with all the information. This analysis can be valuable for analyzing and tracking "suspicious" accounts.</p>
</div>