### Credit Risk

In [15]:
from sklearn.datasets import fetch_openml
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

def ensure_all_numeric(data):
    """
    Converts all columns to numeric, coercing invalid values to NaN.
    Fills missing values with 0 to prevent row dropping.
    """
    for column in data.columns:
        data[column] = pd.to_numeric(data[column], errors='coerce')
    data.fillna(0, inplace=True)  # Fill missing values with 0
    return data

# Step 1: Load dataset
dataset = fetch_openml(data_id=31, as_frame=True)
df = dataset.frame

# Step 2: Drop rows with missing values
df.dropna(inplace=True)

# Step 3: Identify numeric and nominal features
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
nominal_features = df.select_dtypes(include=['object']).columns

# Step 4: Preprocessing
# Scale numeric features
scaler = StandardScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Encode nominal features
if len(nominal_features) > 0:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_features = encoder.fit_transform(df[nominal_features])
    encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(nominal_features))
    df = pd.concat([df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
    df.drop(columns=nominal_features, inplace=True)

# Encode target column 
target_column = 'class'  # Replace with the actual target column name
if df[target_column].dtype == 'object':
    le = LabelEncoder()
    df[target_column] = le.fit_transform(df[target_column])

# Verify all data is numeric
print(df.info())

# Step 5: Splitting the dataset
X = df.drop(target_column, axis=1)
y = df[target_column]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Debug: Check dataset sizes
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

# Step 6: Ensure all data is numeric
X_train = ensure_all_numeric(X_train)
X_val = ensure_all_numeric(X_val)
X_test = ensure_all_numeric(X_test)

# Debug: Check dataset sizes after ensuring numeric
print(f"X_train shape after ensure_all_numeric: {X_train.shape}")
print(f"X_val shape after ensure_all_numeric: {X_val.shape}")
print(f"X_test shape after ensure_all_numeric: {X_test.shape}")

# Step 7: KNN Classifier
best_k = 1
best_accuracy = 0

for k in range(1, 21):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    val_accuracy = knn.score(X_val, y_val)
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_k = k

print(f"Best k: {best_k} with Validation Accuracy: {best_accuracy}")

# Train KNN with the best k
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Test accuracy
accuracy = knn.score(X_test, y_test)
print(f'Test Accuracy (KNN): {accuracy}')

# Confusion Matrix and Classification Report for KNN
y_pred_knn = knn.predict(X_test)
print("Confusion Matrix (KNN):")
print(confusion_matrix(y_test, y_pred_knn))
print("Classification Report (KNN):")
print(classification_report(y_test, y_pred_knn))

# Step 8: Decision Tree Classifier
tree = DecisionTreeClassifier(max_depth=5, random_state=42)
tree.fit(X_train, y_train)

# Test accuracy for Decision Tree
tree_accuracy = tree.score(X_test, y_test)
print(f'Test Accuracy (Decision Tree): {tree_accuracy}')

# Confusion Matrix and Classification Report for Decision Tree
y_pred_tree = tree.predict(X_test)
print("Confusion Matrix (Decision Tree):")
print(confusion_matrix(y_test, y_pred_tree))
print("Classification Report (Decision Tree):")
print(classification_report(y_test, y_pred_tree))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   checking_status         1000 non-null   category
 1   duration                1000 non-null   float64 
 2   credit_history          1000 non-null   category
 3   purpose                 1000 non-null   category
 4   credit_amount           1000 non-null   float64 
 5   savings_status          1000 non-null   category
 6   employment              1000 non-null   category
 7   installment_commitment  1000 non-null   float64 
 8   personal_status         1000 non-null   category
 9   other_parties           1000 non-null   category
 10  residence_since         1000 non-null   float64 
 11  property_magnitude      1000 non-null   category
 12  age                     1000 non-null   float64 
 13  other_payment_plans     1000 non-null   category
 14  housing                 1