<a href="https://colab.research.google.com/github/haydenkirkeide/CAP-5771-Assignment-2-Files/blob/main/run3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

import warnings
warnings.filterwarnings('ignore')

# Load the credit data. The file has no header row, so pandas labels the
# columns 0..N-1; the last column is treated as the target.
df = pd.read_csv('German_Credit_Data.txt', header=None, sep=',')

# divide target/features
# .copy() gives X its own data, so the per-column assignments below modify X
# only and never act on (or warn about) a slice of df.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1].values

# encode feature labels
# Every non-numeric column is replaced by integer codes. Testing for
# "not numeric" instead of dtype == 'object' also covers columns that pandas
# infers as a dedicated string dtype, which would otherwise reach the scaler
# as raw strings.
# NOTE: `le` is refit for each column, so after the loop it only holds the
# mapping of the last encoded column.
le = LabelEncoder()
for col in X.columns:
    if not pd.api.types.is_numeric_dtype(X[col]):
        X[col] = le.fit_transform(X[col].astype(str))

# convert
X = X.values
# Shift the target down by one (presumably labels 1/2 -> 0/1; verify against
# the data file).
y = y - 1

print("F1 Method Results:")
print("--------------------------------------------------------")

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions of the full data set in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both partitions, so no test-set statistics enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_SCALED = scaler.transform(X_train)
X_test_SCALED = scaler.transform(X_test)

def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
    """Train one random forest and score it on the evaluation data.

    All three methods below share this estimator configuration; only the
    training data passed in differs.

    Parameters
    ----------
    X_fit, y_fit
        Training features and labels (possibly resampled).
    X_eval, y_eval
        Features and labels the fitted model is scored on.

    Returns
    -------
    tuple
        ``(model, predictions, macro_f1, weighted_f1)`` where ``model`` is the
        fitted ``RandomForestClassifier`` and ``predictions`` are its outputs
        for ``X_eval``.
    """
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        class_weight='balanced'
    )
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    macro_f1 = f1_score(y_eval, predictions, average='macro')
    weighted_f1 = f1_score(y_eval, predictions, average='weighted')
    return model, predictions, macro_f1, weighted_f1


# method 1: baseline w/ class weights
# Trained on the scaled training set as-is; imbalance is handled only through
# class_weight='balanced'.
method_1, pred_1, f1_macro1, f1_weighted1 = _fit_and_score(
    X_train_SCALED, y_train, X_test_SCALED, y_test
)

# print results
print(f"Method #1: Baseline | Macro: {f1_macro1:.4f}; Weighted: {f1_weighted1:.4f}")

# method 2: randomized undersampling
# NOTE(review): the name `random` would shadow the stdlib `random` module if
# that module is ever imported in this notebook; kept unchanged in case other
# cells reference it.
random = RandomUnderSampler(random_state=42)
X_train_random, y_train_random = random.fit_resample(X_train_SCALED, y_train)

# Only the training data is resampled; the test set is left untouched.
# After resampling the classes are presumably already balanced, so
# class_weight='balanced' is expected to have little effect here; it is kept so
# every method uses the same estimator settings.
method_2, pred_2, f1_macro2, f1_weighted2 = _fit_and_score(
    X_train_random, y_train_random, X_test_SCALED, y_test
)

# print results
print(f"Method #2: Under-Sampling | Macro: {f1_macro2:.4f}; Weighted: {f1_weighted2:.4f}")

# method 3: SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_SCALED, y_train)

# As above, only the training data is resampled.
method_3, pred_3, f1_macro3, f1_weighted3 = _fit_and_score(
    X_train_smote, y_train_smote, X_test_SCALED, y_test
)

# print results
print(f"Method #3: SMOTE | Macro: {f1_macro3:.4f}; Weighted: {f1_weighted3:.4f}")

F1 Method Results:
--------------------------------------------------------
Method #1: Baseline | Macro: 0.7196; Weighted: 0.7731
Method #2: Under-Sampling | Macro: 0.6806; Weighted: 0.7121
Method #3: SMOTE | Macro: 0.6933; Weighted: 0.7460
