In [1]:
import pandas as pd           # Used for data handling
import os                     # Used for checking file existence
from typing import Dict, Any  # Used for function typing hints
from IPython.display import display # Used for displaying dataframes

# Define the data structure for a single applicant (used for typing hints).
# Any object exposing a dict-style ``.get(key)`` works here, which includes the
# row object that ``DataFrame.apply(..., axis=1)`` passes to the function.
Applicant = Dict[str, Any]


def _value_or_default(value: Any, default: Any) -> Any:
    """Return ``default`` when ``value`` is a missing marker (None/NaN/NA), else ``value``."""
    # Only scalars are tested: pd.isna on a list/array would return an array.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def analyze_risk(applicant: Applicant) -> str:
    """
    Analyzes an applicant's profile using a set of predefined, hard-coded rules
    to categorize their credit risk as 'Low Risk', 'Medium Risk', or 'High Risk'.

    Rules are evaluated in order and the first match wins:
    H1 -> H2 -> H3 -> L1 -> L2 -> Medium (catch-all).

    Args:
        applicant: Mapping-like row with the keys 'Credit amount', 'Duration'
            and 'Purpose'. A key that is absent, or present but holding
            None/NaN, falls back to a conservative default (999999, 60 and ''
            respectively), so incomplete numeric data is classified as High Risk.

    Returns:
        One of "High Risk (H1)", "High Risk (H2)", "High Risk (H3)",
        "Low Risk (L1)", "Low Risk (L2)" or "Medium Risk".

    *** YOUR TASK: TUNE THESE THRESHOLDS! ***
    These initial values are educated guesses. To maximize your project's accuracy,
    you must change the numbers and purpose lists based on your 'Training Data' analysis.
    """

    # --- Extract Key Metrics from the DataFrame Row ---
    # NOTE: These names match the column headers in your CSV file.
    # A plain .get(key, default) would only use the default when the key is
    # absent; an empty CSV cell arrives as NaN and every comparison with NaN is
    # False, which would silently skip all numeric rules. Normalise it first.
    credit_amount = _value_or_default(applicant.get('Credit amount'), 999999)
    duration = _value_or_default(applicant.get('Duration'), 60)
    # Trim and lower-case so ' Car ' and 'car' are treated as the same label.
    loan_purpose = str(_value_or_default(applicant.get('Purpose'), '')).strip().lower()

    # =======================================================
    # RULE SET 1: HIGH RISK (Designed to predict 'bad' outcomes)
    # =======================================================

    # Rule H1: High Credit Amount
    # TUNE THIS: If the requested credit is very high, the financial strain and risk are greater.
    HIGH_AMOUNT_THRESHOLD = 5000
    if credit_amount > HIGH_AMOUNT_THRESHOLD:
        return "High Risk (H1)"

    # Rule H2: Long Duration
    # TUNE THIS: Longer repayment times introduce more uncertainty and chance of life changes.
    LONG_DURATION_THRESHOLD = 48
    if duration > LONG_DURATION_THRESHOLD:
        return "High Risk (H2)"

    # Rule H3: High-Risk Purpose
    # Loans for speculative or non-essential purposes are often deemed high-risk.
    # NOTE(review): this is an exact match on the whole label, so a combined
    # label such as 'vacation/others' would NOT match 'vacation' or 'other' --
    # confirm against the distinct 'Purpose' values in your CSV.
    HIGH_RISK_PURPOSES = {'business', 'other', 'repairs', 'vacation'}
    if loan_purpose in HIGH_RISK_PURPOSES:
        return "High Risk (H3)"

    # =======================================================
    # RULE SET 2: LOW RISK (Designed to predict 'good' outcomes)
    # =======================================================

    # Rule L1: Low Credit Amount AND Short Duration
    # Small loans paid back quickly are generally the safest.
    LOW_AMOUNT_THRESHOLD = 1500
    SHORT_DURATION_THRESHOLD = 18
    if credit_amount < LOW_AMOUNT_THRESHOLD and duration <= SHORT_DURATION_THRESHOLD:
        return "Low Risk (L1)"

    # Rule L2: Safe Purpose
    # TUNE THIS: Some purposes (like 'radio/TV') might be very safe.
    # Entries must be lower-case because loan_purpose is lower-cased above.
    SAFE_PURPOSES = {'radio/tv', 'domestic appliances', 'education', 'car'}
    if loan_purpose in SAFE_PURPOSES:
        return "Low Risk (L2)"

    # =======================================================
    # RULE SET 3: MEDIUM RISK (Catch-all)
    # =======================================================

    # If the applicant didn't trigger any explicit High or Low risk rules, they fall here.
    return "Medium Risk"

In [None]:
# --- 2. Load the applicant CSV ---
# NOTE(review): FILE_PATH and RISK_COLUMN_NAME are not defined in this cell;
# presumably they come from an earlier configuration cell -- confirm it runs first.
# pd.read_csv raises FileNotFoundError for a missing path on its own, so no
# separate existence check (followed by a manual raise) is needed.
try:
    df_applicants = pd.read_csv(FILE_PATH)
    print(f"Successfully loaded data from: {FILE_PATH}")
except FileNotFoundError:
    print(f"FATAL ERROR: Could not find or load file at {FILE_PATH}. Check the PATH_ELEMENTS list.")
    df_applicants = pd.DataFrame()
except Exception as e:
    # Deliberate best-effort: any other load/parse problem is reported and the
    # notebook continues with an empty frame, which later cells check for.
    print(f"ERROR: Problem loading CSV contents: {e}.")
    df_applicants = pd.DataFrame()

# --- 3. Column Safety Override ---
if not df_applicants.empty:
    print(f"Total applicants loaded: {len(df_applicants)}")

    if RISK_COLUMN_NAME in df_applicants.columns:
        # The real outcome column is already there. Do NOT rename 'Class' in
        # this case: that would produce two columns with the same name.
        pass
    elif 'Class' in df_applicants.columns:
        # 'Class' is accepted as an alternative header for the outcome column.
        # Rebind instead of inplace=True so the step is a plain assignment.
        df_applicants = df_applicants.rename(columns={'Class': RISK_COLUMN_NAME})
    else:
        # **SAFETY OVERRIDE**: If the outcome column is missing, create a dummy column.
        # This allows Cell 4 to run, but the accuracy will be meaningless.
        df_applicants[RISK_COLUMN_NAME] = 'Good'
        print(f"\n! WARNING: Outcome column '{RISK_COLUMN_NAME}' was MISSING. Added placeholder column.")
        print("! ACCURACY MEASUREMENTS WILL NOT BE VALID until you use a file containing the true outcome.")

    # Only preview columns that actually exist, so a CSV with a missing or
    # misspelled feature header produces a warning instead of a KeyError.
    preview_columns = ['Credit amount', 'Duration', 'Purpose', RISK_COLUMN_NAME]
    missing_columns = [col for col in preview_columns if col not in df_applicants.columns]
    if missing_columns:
        print(f"! WARNING: Expected column(s) not found in the data: {missing_columns}")
    present_columns = [col for col in preview_columns if col in df_applicants.columns]

    print("Initial Applicant Data (Head, showing key features and the actual outcome):")
    # This will now display the 'Risk' column, whether it's real or the placeholder.
    display(df_applicants[present_columns].head())
    print("\n" + "="*50 + "\n")

Successfully loaded data from: C:\Users\Joel_\Downloads\AI-assessment-1\test_data.csv
Total applicants loaded: 280

FATAL ERROR: The required outcome column 'Risk' is missing from the data.
Please ensure your CSV file includes the 'Good/Bad' outcome column.


In [9]:
# --- CELL 3: APPLY THE RULE-BASED ALGORITHM ---

# Guard first: with no rows loaded there is nothing to classify.
if df_applicants.empty:
    print("Cannot apply algorithm. Dataframe is empty.")
else:
    # Run analyze_risk once per row (axis='columns' hands each row to the
    # function) and keep the returned label in the 'Predicted_Risk' column.
    df_applicants['Predicted_Risk'] = df_applicants.apply(analyze_risk, axis='columns')

    print("Classification Complete. Here is a sample of the results:")
    # Show the first ten rows with the actual outcome next to the prediction.
    display(
        df_applicants.loc[
            :, ['Credit amount', 'Duration', 'Purpose', RISK_COLUMN_NAME, 'Predicted_Risk']
        ].head(10)
    )
    print("\n" + "=" * 50 + "\n")

Cannot apply algorithm. Dataframe is empty.
