In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer 


In [23]:
# Load the German credit risk data from the project's data directory.
df = pd.read_csv('../data/german_credit_risk.csv')

# Check the imbalance: relative frequency of each value in the 'Risk' column.
# The labels printed below treat 0 as "good" and 1 as "bad" credit.
risk_counts = df['Risk'].value_counts(normalize=True)
good_share = risk_counts[0]
bad_share = risk_counts[1]
print(f"Good credit (0): {good_share:.2%}")
print(f"Bad credit (1): {bad_share:.2%}")

Good credit (0): 70.00%
Bad credit (1): 30.00%


In [24]:
# Clean column names: lower-case every name, then swap spaces and slashes
# for underscores so all columns follow one naming style.
cleaned_columns = df.columns.str.lower()
for separator in (' ', '/'):
    cleaned_columns = cleaned_columns.str.replace(separator, '_')
df.columns = cleaned_columns

# Show the first three names as a quick sanity check.
print(f"Cleaned Column names: {df.columns.tolist()[:3]}")

Cleaned Column names: ['checking_status', 'duration', 'credit_history']


In [29]:
# Define the target variable and the feature matrix (everything except 'risk').
y = df['risk']
X = df.drop(columns='risk')

# Stratify on the target so both partitions keep the same class proportions;
# the fixed random_state makes the split repeatable. The split size is left
# at train_test_split's default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
)

print('Data Split')
print(f"Training set: {X_train.shape} (Rows,Cols)")
print(f"Test set: {X_test.shape} (Rows,Cols)")

# Persist only the training partition (features and target) without the index.
X_train.to_csv('../data/X_train.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)


Data Split
Training set: (750, 20) (Rows,Cols)
Test set: (250, 20) (Rows,Cols)


In [None]:
# Identify and separate numerical and categorical columns.
# 'number' matches every numeric dtype (int32, float32, nullable ints, ...),
# not only the 64-bit ones. ColumnTransformer drops any column that is not
# listed in a transformer (remainder='drop' by default), so a numeric column
# with a narrower dtype would otherwise vanish from the features silently.
num_features = X.select_dtypes(include='number').columns.tolist()
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical Features {len(num_features)}: {num_features}")
print(f"Categorial Features {len(cat_features)}: {cat_features}")

# Scaler: standardize numbers (mean=0, variance=1).
# OneHot: convert categories to binary columns.
# handle_unknown='ignore' encodes a category that was not present in the
# training split as all zeros instead of raising during transform(); without
# it, a rare category that lands only in the test split (or in later scoring
# data) makes preprocessor.transform fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
    ]
)

# Fit on the training partition only, then reuse the fitted statistics and
# category vocabulary on the test partition to avoid leaking test information.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# One-hot encoding increases the number of columns (one per category option).
print(f"\nOriginal Columns: {X_train.shape[1]}")
print(f"Processed Columns: {X_train_processed.shape[1]}")


Numerical Features 7: ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 'existing_credits', 'num_dependents']
Categorial Features 13: ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone', 'foreign_worker']

Original Columns: 20
Processed Columns: 61
