In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from typing import List
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# 1.
A neural network is a machine learning model that tries to mimic a brain by "thinking". It assigns weights to variables and makes changes to the weights by running the model multiple times and getting error feedback. A NN has 3 main parts: Input Layer, Output Layer, and hidden layers where the thinking occurs. Building a NN requires 2 main things, the function that changes the variables at each node to do the thinking (activation function) and the loss function that tells the machine by how much its prediction was off.

# 2.
A loss function is the simplest way to check the performance of a NN because it shows the prediction error. Another good way to check is to use k-fold cross validation, because it lets every part of the data be tested at least once and runs multiple tests instead of just one.

In [None]:
df = pd.read_csv('Credit_Card.csv')
label_df = pd.read_csv('Credit_card_label.csv')
sorted_df = pd.merge(df, label_df, on='Ind_ID')

In [None]:
def datacleaner(sorted_df: pd.DataFrame,
                        numerical_cols: List[str] = ['Annual_income', 'Birthday_count'], 
                          categorical_cols: List[str] = ['GENDER', 'Car_Owner', 'Propert_Owner', 'Type_Income', 
                                                         'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation'], 
                          occupation_col: str = 'Type_Occupation') -> pd.DataFrame:
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    
    sorted_df.dropna(subset=[occupation_col], inplace=True)
    
    sorted_df['GENDER'] = sorted_df['GENDER'].fillna(sorted_df['GENDER'].mode()[0])
    
    sorted_df = pd.get_dummies(sorted_df, columns=categorical_cols, drop_first=True)

    sorted_df = sorted_df.replace([np.inf, -np.inf], np.nan).fillna(0)

    sorted_df = sorted_df.astype(int)
    scaler = StandardScaler()
    sorted_df[numerical_cols] = scaler.fit_transform(sorted_df[numerical_cols])
    return sorted_df
sorted_df = datacleaner(sorted_df)
sorted_df

# 3.
I normalized the numerical columns and tweaked the filling null values in addition to the data cleaning that was already done.

In [None]:
class ANN_Model(nn.Module):
    def __init__(self,input_features=8,
                 hidden1=20,hidden2=20,
                 out_features=2):
        super().__init__() 
        """
        super is a computed indirect reference
        which means that it isolates changes and
        makes sure the children in the layers of
        multiple inheritance are calling 
        the right parents
        """
        self.layer_1_connection = nn.Linear(input_features, hidden1)
        self.layer_2_connection = nn.Linear(hidden1, hidden2)
        self.out = nn.Linear(hidden2, out_features)

    def forward(self, x):
        #apply activation function
        x = F.relu(self.layer_1_connection(x))
        x = F.relu(self.layer_2_connection(x))
        x = self.out(x)
        return x

In [None]:
torch.manual_seed(42)

# create an instance of the model
ann = ANN_Model()

In [None]:
X = sorted_df.drop(columns=['label']).values 
y = sorted_df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long) 
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)


input_features = X_train.shape[1]
model = ANN_Model(input_features=input_features, hidden1=20, hidden2=20, out_features=2)

criterion = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch) 
        loss = criterion(outputs, y_batch) 
        loss.backward() 
        optimizer.step() 

model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred_class = torch.argmax(y_pred, dim=1)
    accuracy = accuracy_score(y_test, y_pred_class.numpy())
    print(f"PyTorch ANN Test Accuracy: {accuracy}")

PyTorch ANN Test Accuracy: 0.8537735849056604


In [19]:
def Logistic_regression_overfit(ycol) -> float:
    X = sorted_df.drop(columns=[ycol])
    y = sorted_df[ycol]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    ros = RandomOverSampler(random_state=42)
    scaler = StandardScaler()
    X_train_standard = scaler.fit_transform(X_train)
    X_test_standard = scaler.transform(X_test)
    X_resampled, y_resampled = ros.fit_resample(X_train_standard, y_train)
    
    log_reg = LogisticRegression(max_iter=100)
    log_reg.fit(X_resampled, y_resampled)

    y_pred = log_reg.predict(X_test_standard)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [20]:
input = "label"
accuracy, report = Logistic_regression_overfit(input)
print(accuracy)
print(report)

0.6415094339622641
              precision    recall  f1-score   support

           0       0.85      0.70      0.77       181
           1       0.14      0.29      0.19        31

    accuracy                           0.64       212
   macro avg       0.50      0.50      0.48       212
weighted avg       0.75      0.64      0.69       212



# 4.
The neural networks performed better than the logisic regression even when it was oversampled. The reason is probably this dataset had a very difficult patterns system that a simple model like logistic regression wasn't able to work with properly, while due to the nature of a NN it can handle more complex tasks. Of course there is a small danger of overfitting so we probably shouldn't forget that either.