In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree


In [3]:
df = pd.read_csv('creditcard.csv')

In [5]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [21]:
df.shape

(283726, 30)

In [9]:
df.Class.value_counts(normalize=True)*100

Class
0    99.827251
1     0.172749
Name: proportion, dtype: float64

In [11]:
df.duplicated().sum()

np.int64(1081)

In [13]:
~df.duplicated().sum()

np.int64(-1082)

In [14]:
df.drop_duplicates(inplace=True)

In [16]:
df = df.drop('Time', axis=1)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283726 entries, 0 to 284806
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V1      283726 non-null  float64
 1   V2      283726 non-null  float64
 2   V3      283726 non-null  float64
 3   V4      283726 non-null  float64
 4   V5      283726 non-null  float64
 5   V6      283726 non-null  float64
 6   V7      283726 non-null  float64
 7   V8      283726 non-null  float64
 8   V9      283726 non-null  float64
 9   V10     283726 non-null  float64
 10  V11     283726 non-null  float64
 11  V12     283726 non-null  float64
 12  V13     283726 non-null  float64
 13  V14     283726 non-null  float64
 14  V15     283726 non-null  float64
 15  V16     283726 non-null  float64
 16  V17     283726 non-null  float64
 17  V18     283726 non-null  float64
 18  V19     283726 non-null  float64
 19  V20     283726 non-null  float64
 20  V21     283726 non-null  float64
 21  V22     283726 

In [18]:
df.isna().sum()

V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [19]:
df.groupby('Class').Amount.mean()

Class
0     88.413575
1    123.871860
Name: Amount, dtype: float64

In [20]:
#de eliminat outlinerele

In [22]:
X = df.drop(columns='Class', axis=1)
y = df['Class']

In [23]:
X.shape, y.shape

((283726, 29), (283726,))

In [31]:
from sklearn.model_selection import train_test_split
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val, test_size=0.25, random_state=42)

In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
lr = LogisticRegression(max_iter=1000)
kf = StratifiedKFold(n_splits=5,shuffle=False)

In [39]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.6


In [42]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from tqdm import tqdm

recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    lr.fit(X_fold_train, y_fold_train)

    y_pred = lr.predict(X_fold_val)
  
    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)


5it [00:24,  4.95s/it]


In [None]:
average_recall = np.mean(recall_score)
average_precision= np.mean(precision_score)
average_f1 = np.mean(f1_score)
average_accuracy = np.mean(accuracy_score)

TypeError: unsupported operand type(s) for /: 'function' and 'int'

In [None]:
scores_df = pd.DataFrame(data=[(average_recall,average_precision,average_f1,average_accuracy)], columns=['Recal','Precosopm','F1','Accuracy'])

In [None]:
lr.fit(X_train, y_train)

In [44]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

rus = RandomUnderSampler(random_state=42)

In [50]:
X_under, y_under = rus.fit_resample(X_train, y_train)
y_under.value_counts()

Class
0    287
1    287
Name: count, dtype: int64

In [None]:
random_under_pip = Pipeline(steps=[
    ('random_state', RandomUnderSampler(random_state=42)),
    ('lr', LogisticRegression(max_iter=1000, random_state = 13))
])

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score
from tqdm import tqdm

recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    random_under_pip.fit(X_fold_train, y_fold_train)

    y_pred = random_under_pip.predict(X_fold_val)
  
    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)
