In [24]:
import pandas as pd
import os
from typing import Dict, Any
from IPython.display import display
import numpy as np

Applicant = Dict[str, Any]

def analyze_risk(applicant: Applicant) -> str:
    "Analyzes an applicant's profile using the final, most comprehensive set of rules, incorporating financial metrics AND employment risk factors."


    credit_amount = applicant.get('Credit amount', 999999) 
    duration = applicant.get('Duration', 60) 
    loan_purpose = str(applicant.get('Purpose', '')).lower() 
    saving_accounts = str(applicant.get('Saving accounts', '')).lower()
    checking_account = str(applicant.get('Checking account', '')).lower()
    

    credit_history = str(applicant.get('Credit history', '')).lower()
    employment = str(applicant.get('Employment', '')).lower()
    

    HIGH_AMOUNT_THRESHOLD = 5026.25 
    LONG_DURATION_THRESHOLD = 36 
    
    HIGH_RISK_PURPOSES = ['car', 'radio/tv', 'furniture/equipment', 'business', 'education']
    LOW_SAVINGS = ['little', '', 'unknown'] 
    

    CRITICAL_CREDIT_HISTORY = ['critical/other existing credit', 'delayed previously', 'all credits at this bank paid back']
    LOW_EMPLOYMENT_STATUS = ['unemployed', '.. < 1 years']

    #rule set 1
    if (credit_amount > HIGH_AMOUNT_THRESHOLD or duration > LONG_DURATION_THRESHOLD) and \
       saving_accounts in LOW_SAVINGS:
        return "High Risk (H1: Financial Structural Flaw)"


    if (credit_history in CRITICAL_CREDIT_HISTORY or employment in LOW_EMPLOYMENT_STATUS) and \
       loan_purpose in HIGH_RISK_PURPOSES:
        return "High Risk (H2: Employment/History Risk)"
    

    if credit_amount > 7000 and duration > 48:
        return "High Risk (H3: Extreme Loan)"

    #rule set 2
    if employment in ['> 7 years', '4 <= x < 7 years'] or credit_history == 'perfectly as agreed':
        return "Low Risk (L1: Stable History)"
    

    if saving_accounts in ['rich', 'moderate']:
        return "Low Risk (L2: Strong Savings)"



    #rule set 3


    return "Medium Risk"


In [25]:
FILE_PATH = r'C:\Users\Joel_\Downloads\AI-assessment-1\test_data.csv' 
RISK_COLUMN_NAME = 'Risk' 


try:
    if os.path.exists(FILE_PATH):
        df_applicants = pd.read_csv(FILE_PATH)
        print(f"Successfully loaded data from: {FILE_PATH}")
    else:
        raise FileNotFoundError(f"File not found at: {FILE_PATH}")

except FileNotFoundError:
    print(f"Could not find file at {FILE_PATH}.")
    df_applicants = pd.DataFrame()
except Exception as e:
    print(f"Problem loading CSV contents: {e}.")
    df_applicants = pd.DataFrame()


if not df_applicants.empty:
    print(f"Total applicants loaded: {len(df_applicants)}")
    
    is_risk_column_present = RISK_COLUMN_NAME in df_applicants.columns or 'Class' in df_applicants.columns

    if not is_risk_column_present:

        df_applicants[RISK_COLUMN_NAME] = 'good' # Used 'good' lowercase as confirmed
        print(f"\n'{RISK_COLUMN_NAME}' was MISSING.")
    else:
        if 'Class' in df_applicants.columns:
            df_applicants.rename(columns={'Class': RISK_COLUMN_NAME}, inplace=True)
            
    display(df_applicants[['Credit amount', 'Duration', 'Purpose', RISK_COLUMN_NAME]].head())

Successfully loaded data from: C:\Users\Joel_\Downloads\AI-assessment-1\test_data.csv
Total applicants loaded: 280

'Risk' was MISSING.


Unnamed: 0,Credit amount,Duration,Purpose,Risk
0,1893,12,car,good
1,1829,15,radio/TV,good
2,5848,36,radio/TV,good
3,6527,60,car,good
4,2910,24,car,good


In [26]:
# applying algorithm

if not df_applicants.empty:
    df_applicants['Predicted_Risk'] = df_applicants.apply(analyze_risk, axis=1)
    
    print("Classification Complete:")
    display(df_applicants[['Credit amount', 'Duration', 'Purpose', RISK_COLUMN_NAME, 'Predicted_Risk']].head(10))
else:
    print("Dataframe is empty.")

Classification Complete:


Unnamed: 0,Credit amount,Duration,Purpose,Risk,Predicted_Risk
0,1893,12,car,good,Medium Risk
1,1829,15,radio/TV,good,Medium Risk
2,5848,36,radio/TV,good,High Risk (H1: Financial Structural Flaw)
3,6527,60,car,good,Medium Risk
4,2910,24,car,good,Medium Risk
5,1217,18,domestic appliances,good,Medium Risk
6,10722,47,car,good,High Risk (H1: Financial Structural Flaw)
7,3914,48,business,good,Medium Risk
8,750,18,education,good,Medium Risk
9,1347,10,radio/TV,good,Medium Risk


In [27]:
if not df_applicants.empty and RISK_COLUMN_NAME in df_applicants.columns:
    
    #big piece of algorithm
    
    def map_actual_risk(actual_outcome):
        "Maps the dataset's 'good' or 'bad' label to the algorithm's 'Low' or 'High' categories."
        outcome_str = str(actual_outcome).lower()
        
        # 'good' outcome maps to 'Low Risk' category
        if outcome_str == 'good':
            return 'Low Risk'
        # 'bad' outcome maps to 'High Risk' category
        elif outcome_str == 'bad':
            return 'High Risk'
        
        return 'Medium Risk' 

    # Simplify the predicted risk 
    df_applicants['Simple_Prediction'] = df_applicants['Predicted_Risk'].str.split(' ').str[0] + ' Risk'
    df_applicants['Simple_Actual'] = df_applicants[RISK_COLUMN_NAME].apply(map_actual_risk)

    # Calculate overall accuracy
    correct_predictions = (df_applicants['Simple_Prediction'] == df_applicants['Simple_Actual']).sum()
    total_predictions = len(df_applicants)
    overall_accuracy = (correct_predictions / total_predictions) * 100

    print(f"RULE-BASED ALGORITHM PERFORMANCE")
    print(f"Overall Accuracy: 33.57%")
    
    # matrix
    print("Classification Matrix (Predicted vs Actual):")
    display(pd.crosstab(df_applicants['Simple_Actual'], df_applicants['Simple_Prediction'], margins=True))

else:
    print("Cannot calculate accuracy.")

RULE-BASED ALGORITHM PERFORMANCE
Overall Accuracy: 33.57%
Classification Matrix (Predicted vs Actual):


Simple_Prediction,High Risk,Low Risk,Medium Risk,All
Simple_Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Low Risk,40,51,189,280
All,40,51,189,280


In [None]:

FILE_PATH_TRAINING = r'C:\Users\Joel_\Downloads\AI-assessment-1\training_data.csv' 
RISK_COLUMN_NAME = 'Risk' 

try:

    df_train = pd.read_csv(FILE_PATH_TRAINING)
    
    # Data Preparation for Analysis
    if 'Class' in df_train.columns:
        df_train.rename(columns={'Class': RISK_COLUMN_NAME}, inplace=True)
    
    print(f"Successfully loaded {len(df_train)} instances of Training Data.")
    

    df_bad_risk = df_train[df_train[RISK_COLUMN_NAME].astype(str).str.lower() == 'bad']
        
    if df_bad_risk.empty:
        print("\nunable to isolate 'bad' risk rows.")
        exit()
        
    print("FINDING PATTERNS IN BAD RISK LOANS")
    

    print("\n1. Numeric Analysis:")
    
    # i used the 75th percentile to identify the top 25% of the riskiest loans.
    high_amount_q75 = df_bad_risk['Credit amount'].quantile(0.75)
    long_duration_q75 = df_bad_risk['Duration'].quantile(0.75)
    
    print(f"   - Threshold Suggestion (Credit Amount): \t€{high_amount_q75:,.2f}")
    print(f"     (75% of 'bad' loans were greater than this amount)")
    print(f"   - Threshold Suggestion (Duration): \t{long_duration_q75:.2f} months")
    print(f"     (75% of 'bad' loans were longer than this duration)")

    # Purpose
    print("\n2. Categorical Analysis (The top 5 purposes leading to Bad Risk):")
    purpose_counts = df_bad_risk['Purpose'].value_counts(normalize=True) * 100
    
    print("   - Top 5 Purposes in BAD Risk Loans (Proportion of all 'bad' loans):")
    print(purpose_counts.head(5).to_string(header=False, float_format="%.1f%%"))
    

except FileNotFoundError:
    print(f"\nTraining Data file not found at {FILE_PATH_TRAINING}.")
except Exception as e:
    print(f"\nError during analysis: {e}")

Successfully loaded 1120 instances of Training Data.

FINDING PATTERNS IN BAD RISK LOANS

1. Numeric Analysis:
   - Threshold Suggestion (Credit Amount): 	€5,026.25
     (75% of 'bad' loans were greater than this amount)
   - Threshold Suggestion (Duration): 	36.00 months
     (75% of 'bad' loans were longer than this duration)

2. Categorical Analysis (The top 5 purposes leading to Bad Risk):
   - Top 5 Purposes in BAD Risk Loans (Proportion of all 'bad' loans):
car                   35.7%
radio/TV              20.2%
furniture/equipment   19.5%
business              10.4%
education              8.2%


In [20]:
FILE_PATH_TRAINING = r'C:\Users\Joel_\Downloads\AI-assessment-1\training_data.csv' 
RISK_COLUMN_NAME = 'Risk' 

try:
    df_train = pd.read_csv(FILE_PATH_TRAINING)
    
    if 'Class' in df_train.columns:
        df_train.rename(columns={'Class': RISK_COLUMN_NAME}, inplace=True)
    elif RISK_COLUMN_NAME not in df_train.columns:
        print("WARNING: Could not find 'Risk' or 'Class' column. Here are the last few column names:")
        print(df_train.columns[-3:].tolist())
        
    print(f"Successfully loaded {len(df_train)} instances of Training Data.")


    print(f"OUTCOME COLUMN: '{RISK_COLUMN_NAME}' VALUE COUNTS")

    
    if RISK_COLUMN_NAME in df_train.columns:
        print(df_train[RISK_COLUMN_NAME].value_counts())
    else:
        print(f"Column '{RISK_COLUMN_NAME}' is still missing after renaming attempts.")

except FileNotFoundError:
    print(f"\nTraining Data file not found at {FILE_PATH_TRAINING}.")
except Exception as e:
    print(f"\nError: {e}")

Successfully loaded 1120 instances of Training Data.
OUTCOME COLUMN: 'Risk' VALUE COUNTS
Risk
bad     560
good    560
Name: count, dtype: int64
