In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import math as mt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from typing import List
from typing import Tuple
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids

In [2]:
df = pd.read_csv('Credit_Card.csv')
label_df = pd.read_csv('Credit_card_label.csv')
df.head()

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2


In [3]:
sorted_df = pd.merge(df, label_df, on='Ind_ID')

In [4]:
def clean_and_encode_data(sorted_df: pd.DataFrame,
                        numerical_cols: List[str] = ['Annual_income', 'Birthday_count'], 
                          categorical_cols: List[str] = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income', 
                                                         'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation'], 
                          occupation_col: str = 'Type_Occupation') -> pd.DataFrame:
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    
    sorted_df.dropna(subset=[occupation_col], inplace=True)
    
    sorted_df['GENDER'] = sorted_df['GENDER'].fillna(sorted_df['GENDER'].mode()[0])
    
    sorted_df = pd.get_dummies(sorted_df, columns=categorical_cols, drop_first=True)

    sorted_df = sorted_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    sorted_df = sorted_df.astype(int)
    
    return sorted_df
sorted_df = clean_and_encode_data(sorted_df)
sorted_df

Unnamed: 0,Ind_ID,CHILDREN,Annual_income,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Family_Members,...,Type_Occupation_Laborers,Type_Occupation_Low-skill Laborers,Type_Occupation_Managers,Type_Occupation_Medicine staff,Type_Occupation_Private service staff,Type_Occupation_Realty agents,Type_Occupation_Sales staff,Type_Occupation_Secretaries,Type_Occupation_Security staff,Type_Occupation_Waiters/barmen staff
8,5010864,1,450000,-18173,-678,1,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
9,5010868,1,450000,-18173,-678,1,0,1,1,3,...,0,0,0,0,0,0,0,0,0,0
10,5010869,1,450000,-18173,-678,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11,5018498,0,90000,-18950,-1002,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
12,5018501,0,0,-18950,-1002,1,1,1,0,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1542,5118268,1,360000,-11294,-3536,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,0
1543,5028645,0,0,-11957,-2182,1,0,0,0,2,...,0,0,1,0,0,0,0,0,0,0
1544,5023655,0,225000,-10229,-1209,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1545,5115992,2,180000,-13174,-2477,1,0,0,0,4,...,0,0,1,0,0,0,0,0,0,0


In [5]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids



In [6]:
def KNN_with_oversampling(ycol):

    X = sorted_df.drop(columns=[ycol])
    y = sorted_df[ycol]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    ros = RandomOverSampler(random_state=42)
    scaler = StandardScaler()
    X_train_standard = scaler.fit_transform(X_train)
    X_test_standard = scaler.transform(X_test)
    X_resampled, y_resampled = ros.fit_resample(X_train_standard, y_train)
    best_k = 0
    best_score = 0
    for k in range(1,21):
        knn = KNeighborsClassifier(n_neighbors=((2*k)+1))
        knn.fit(X_train, y_train)
        score = knn.score(X_test_standard, y_test)
        if score > best_score:
            best_k = k
            best_score = score

    knn = KNeighborsClassifier(n_neighbors=best_k)
    knn.fit(X_resampled, y_resampled)
    
    y_pred = knn.predict(X_test_standard)
    test_accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    return best_k, test_accuracy, conf_matrix, class_report


In [7]:
ycolumn = "label"
best_k, test_accuracy, conf_matrix, class_report = KNN_with_oversampling(ycolumn)

print(f"Optimal k: {best_k}")
print(f"Test Accuracy: {test_accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)



Optimal k: 1
Test Accuracy: 0.8710691823899371
Confusion Matrix:
[[259  16]
 [ 25  18]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.93       275
           1       0.53      0.42      0.47        43

    accuracy                           0.87       318
   macro avg       0.72      0.68      0.70       318
weighted avg       0.86      0.87      0.86       318





In [8]:
def Logistic_regression_overfit(ycol) -> float:
    X = sorted_df.drop(columns=[ycol])
    y = sorted_df[ycol]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    ros = RandomOverSampler(random_state=42)
    scaler = StandardScaler()
    X_train_standard = scaler.fit_transform(X_train)
    X_test_standard = scaler.transform(X_test)
    X_resampled, y_resampled = ros.fit_resample(X_train_standard, y_train)
    
    log_reg = LogisticRegression(max_iter=100)
    log_reg.fit(X_resampled, y_resampled)

    y_pred = log_reg.predict(X_test_standard)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [11]:
def train_logistic_regression_stan(ycol) -> float:
    X = sorted_df.drop(columns=[ycol])
    y = sorted_df[ycol]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    
    X_train_standard = scaler.fit_transform(X_train)
    X_test_standard = scaler.transform(X_test)

    log_reg = LogisticRegression(max_iter=100)
    log_reg.fit(X_train_standard, y_train)

    y_pred = log_reg.predict(X_test_standard)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [13]:
input = "label"
accuracy, report = Logistic_regression_overfit(input)
print(accuracy)
print(report)
accuracy, report = train_logistic_regression_stan(input)
print(accuracy)
print(report)

0.6415094339622641
              precision    recall  f1-score   support

           0       0.85      0.70      0.77       181
           1       0.14      0.29      0.19        31

    accuracy                           0.64       212
   macro avg       0.50      0.50      0.48       212
weighted avg       0.75      0.64      0.69       212

0.8537735849056604
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       181
           1       0.50      0.06      0.11        31

    accuracy                           0.85       212
   macro avg       0.68      0.53      0.52       212
weighted avg       0.81      0.85      0.80       212

