In [None]:
A score of $31.34\%$ for your highly optimized rules is a powerful scientific result.It confirms a pattern that is often seen in real-world rule-based systems: the data set is structured in a way that simple rules cannot capture.

In [51]:
FILE_PATH = r'C:\Users\Joel_\Downloads\AI-assessment-1\test_data.csv' # <-- CHANGE THIS PATH!
RISK_COLUMN_NAME = 'Risk' 
# Other imports like os, pd, display should be handled by Cell 1/previous setup

try:
    if os.path.exists(FILE_PATH):
        df_applicants = pd.read_csv(FILE_PATH)
        print(f"Successfully loaded data from: {FILE_PATH}")
    else:
        raise FileNotFoundError(f"File not found at: {FILE_PATH}")

except FileNotFoundError:
    print(f"FATAL ERROR: Could not find or load file at {FILE_PATH}. Check the path.")
    df_applicants = pd.DataFrame()
except Exception as e:
    print(f"ERROR: Problem loading CSV contents: {e}.")
    df_applicants = pd.DataFrame()

# --- 3. Column Safety Override ---
if not df_applicants.empty:
    print(f"Total applicants loaded: {len(df_applicants)}")
    
    # Check for the required 'Risk' column (or 'Class' as a common alternative)
    is_risk_column_present = RISK_COLUMN_NAME in df_applicants.columns or 'Class' in df_applicants.columns

    if not is_risk_column_present:
        # **SAFETY OVERRIDE**: If the outcome column is missing, create a dummy column.
        df_applicants[RISK_COLUMN_NAME] = 'good' # Use 'good' lowercase as confirmed
        print(f"\n! WARNING: Outcome column '{RISK_COLUMN_NAME}' was MISSING. Added placeholder column.")
        print("! ACCURACY MEASUREMENTS WILL NOT BE VALID until you use a file containing the true outcome.")
    else:
        # Rename 'Class' column to 'Risk' if it exists
        if 'Class' in df_applicants.columns:
            df_applicants.rename(columns={'Class': RISK_COLUMN_NAME}, inplace=True)
            
    print("Initial Applicant Data (Head, showing key features and the actual outcome):")
    display(df_applicants[['Credit amount', 'Duration', 'Purpose', RISK_COLUMN_NAME]].head())
    print("\n" + "="*50 + "\n")

Successfully loaded data from: C:\Users\Joel_\Downloads\AI-assessment-1\test_data.csv
Total applicants loaded: 280

! ACCURACY MEASUREMENTS WILL NOT BE VALID until you use a file containing the true outcome.
Initial Applicant Data (Head, showing key features and the actual outcome):


Unnamed: 0,Credit amount,Duration,Purpose,Risk
0,1893,12,car,good
1,1829,15,radio/TV,good
2,5848,36,radio/TV,good
3,6527,60,car,good
4,2910,24,car,good






In [52]:
# --- CELL 3: APPLY THE RULE-BASED ALGORITHM ---

if not df_applicants.empty:
    # Use the .apply() method to execute the analyze_risk function for every row.
    # The result is stored in a new column called 'Predicted_Risk'.
    df_applicants['Predicted_Risk'] = df_applicants.apply(analyze_risk, axis=1)
    
    print("Classification Complete. Here is a sample of the results:")
    # We display the predicted risk next to the actual risk (the 'Risk' column).
    display(df_applicants[['Credit amount', 'Duration', 'Purpose', RISK_COLUMN_NAME, 'Predicted_Risk']].head(10))
    print("\n" + "="*50 + "\n")
else:
    print("Cannot apply algorithm. Dataframe is empty.")

Classification Complete. Here is a sample of the results:


Unnamed: 0,Credit amount,Duration,Purpose,Risk,Predicted_Risk
0,1893,12,car,good,Low Risk (L1: Short/Small Loan)
1,1829,15,radio/TV,good,Medium Risk
2,5848,36,radio/TV,good,High Risk (H2: Amount & Savings Combo)
3,6527,60,car,good,High Risk (H1: Amount & Duration Combo)
4,2910,24,car,good,Medium Risk
5,1217,18,domestic appliances,good,Medium Risk
6,10722,47,car,good,High Risk (H1: Amount & Duration Combo)
7,3914,48,business,good,High Risk (H3: Duration & Purpose Combo)
8,750,18,education,good,Medium Risk
9,1347,10,radio/TV,good,Low Risk (L1: Short/Small Loan)






In [53]:
if not df_applicants.empty and RISK_COLUMN_NAME in df_applicants.columns:
    
    # --- CORRECTED MAPPING FUNCTION ---
    # We must map the actual outcomes ('good'/'bad') to the categories the algorithm produces (Low/High)
    
    def map_actual_risk(actual_outcome):
        """Maps the dataset's 'good' or 'bad' label to the algorithm's 'Low' or 'High' categories."""
        # Note: We ensure the actual outcome is treated as a string and made lowercase for safety.
        outcome_str = str(actual_outcome).lower()
        
        # Confirmed mapping: 'good' outcome maps to 'Low Risk' category
        if outcome_str == 'good':
            return 'Low Risk'
        # Confirmed mapping: 'bad' outcome maps to 'High Risk' category
        elif outcome_str == 'bad':
            return 'High Risk'
        
        return 'Medium Risk' 

    # Simplify the predicted risk for comparison (strips codes like '(H1)' from the prediction)
    df_applicants['Simple_Prediction'] = df_applicants['Predicted_Risk'].str.split(' ').str[0] + ' Risk'
    df_applicants['Simple_Actual'] = df_applicants[RISK_COLUMN_NAME].apply(map_actual_risk)

    # 1. Calculate overall accuracy
    correct_predictions = (df_applicants['Simple_Prediction'] == df_applicants['Simple_Actual']).sum()
    total_predictions = len(df_applicants)
    overall_accuracy = (correct_predictions / total_predictions) * 100

    print(f"--- RULE-BASED ALGORITHM PERFORMANCE (on Training Data) ---")
    print(f"Overall Accuracy: {overall_accuracy:.2f}%")
    print("-" * 40)
    
    # 2. Show a confusion matrix for detailed performance
    print("Classification Matrix (Predicted vs. Actual):")
    # This matrix shows how many correct and incorrect predictions were made for Low vs. High Risk
    display(pd.crosstab(df_applicants['Simple_Actual'], df_applicants['Simple_Prediction'], margins=True))

else:
    print("Cannot calculate accuracy. Data not loaded or the true outcome column is missing.")

--- RULE-BASED ALGORITHM PERFORMANCE (on Training Data) ---
Overall Accuracy: 33.57%
----------------------------------------
Classification Matrix (Predicted vs. Actual):


Simple_Prediction,High Risk,Low Risk,Medium Risk,All
Simple_Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Low Risk,50,94,136,280
All,50,94,136,280


In [31]:
# --- NEW CELL: CORRECTED TRAINING DATA ANALYSIS ---

# **CRITICAL**: REPLACE THIS PLACEHOLDER PATH with the absolute path to your training_data.csv file.
FILE_PATH_TRAINING = r'C:\Users\Joel_\Downloads\AI-assessment-1\training_data.csv' 
RISK_COLUMN_NAME = 'Risk' 

try:
    # Use the robust absolute path provided by the user (assuming they replace the placeholder)
    df_train = pd.read_csv(FILE_PATH_TRAINING)
    
    # --- Data Preparation for Analysis ---
    if 'Class' in df_train.columns:
        df_train.rename(columns={'Class': RISK_COLUMN_NAME}, inplace=True)
    
    print(f"Successfully loaded {len(df_train)} instances of Training Data.")
    
    # 1. IDENTIFY THE BAD RISK CASES (Now correctly looking for the 'bad' label)
    # The 'bad' label is used based on your previous output.
    df_bad_risk = df_train[df_train[RISK_COLUMN_NAME].astype(str).str.lower() == 'bad']
        
    if df_bad_risk.empty:
        print("\nFATAL ERROR: Still unable to isolate 'bad' risk rows. Check your data labels.")
        exit()
        
    print("\n" + "="*70)
    print("CORRECTED ANALYSIS: FINDING PATTERNS IN BAD RISK LOANS (for Rule Tuning in Cell 1)")
    print("="*70)
    
    # 2. Numeric Analysis (Credit Amount, Duration)
    print("\n1. Numeric Analysis (to tune HIGH_AMOUNT_THRESHOLD & LONG_DURATION_THRESHOLD):")
    
    # We use the 75th percentile to identify the top 25% of the riskiest loans.
    high_amount_q75 = df_bad_risk['Credit amount'].quantile(0.75)
    long_duration_q75 = df_bad_risk['Duration'].quantile(0.75)
    
    print(f"   - **Threshold Suggestion (Credit Amount):** \t€{high_amount_q75:,.2f}")
    print(f"     (75% of 'bad' loans were greater than this amount)")
    print(f"   - **Threshold Suggestion (Duration):** \t{long_duration_q75:.2f} months")
    print(f"     (75% of 'bad' loans were longer than this duration)")

    # 3. Categorical Analysis (Purpose)
    print("\n2. Categorical Analysis (The top 5 purposes leading to Bad Risk):")
    purpose_counts = df_bad_risk['Purpose'].value_counts(normalize=True) * 100
    
    print("   - Top 5 Purposes in BAD Risk Loans (Proportion of all 'bad' loans):")
    print(purpose_counts.head(5).to_string(header=False, float_format="%.1f%%"))
    
    print("\n--- Use these numbers to refine your rules in Cell 1! ---")

except FileNotFoundError:
    print(f"\nFATAL ERROR: Training Data file not found at {FILE_PATH_TRAINING}. Please verify the path.")
except Exception as e:
    print(f"\nERROR during analysis: {e}")

Successfully loaded 1120 instances of Training Data.

CORRECTED ANALYSIS: FINDING PATTERNS IN BAD RISK LOANS (for Rule Tuning in Cell 1)

1. Numeric Analysis (to tune HIGH_AMOUNT_THRESHOLD & LONG_DURATION_THRESHOLD):
   - **Threshold Suggestion (Credit Amount):** 	€5,026.25
     (75% of 'bad' loans were greater than this amount)
   - **Threshold Suggestion (Duration):** 	36.00 months
     (75% of 'bad' loans were longer than this duration)

2. Categorical Analysis (The top 5 purposes leading to Bad Risk):
   - Top 5 Purposes in BAD Risk Loans (Proportion of all 'bad' loans):
car                   35.7%
radio/TV              20.2%
furniture/equipment   19.5%
business              10.4%
education              8.2%

--- Use these numbers to refine your rules in Cell 1! ---


In [32]:
# --- NEW CELL: TARGET COLUMN INSPECTION ---

# **CRITICAL**: Update this path to the ABSOLUTE PATH of your TRAINING DATA file.
FILE_PATH_TRAINING = r'C:\Users\Joel_\Downloads\AI-assessment-1\training_data.csv' 
RISK_COLUMN_NAME = 'Risk' 

try:
    df_train = pd.read_csv(FILE_PATH_TRAINING)
    
    # --- Attempt to find the outcome column ---
    if 'Class' in df_train.columns:
        df_train.rename(columns={'Class': RISK_COLUMN_NAME}, inplace=True)
    elif RISK_COLUMN_NAME not in df_train.columns:
        # Fallback: Check all columns for any binary/categorical data types
        print("WARNING: Could not find 'Risk' or 'Class' column. Here are the last few column names:")
        print(df_train.columns[-3:].tolist())
        # The user must manually identify the outcome column if this happens
        
    print(f"Successfully loaded {len(df_train)} instances of Training Data.")

    print("\n" + "="*50)
    print(f"OUTCOME COLUMN: '{RISK_COLUMN_NAME}' VALUE COUNTS")
    print("="*50)
    
    # This will show the actual labels and how many times they appear
    if RISK_COLUMN_NAME in df_train.columns:
        print(df_train[RISK_COLUMN_NAME].value_counts())
    else:
        print(f"ERROR: Column '{RISK_COLUMN_NAME}' is still missing after renaming attempts.")

except FileNotFoundError:
    print(f"\nFATAL ERROR: Training Data file not found at {FILE_PATH_TRAINING}. Please verify the path.")
except Exception as e:
    print(f"\nERROR during inspection: {e}")

Successfully loaded 1120 instances of Training Data.

OUTCOME COLUMN: 'Risk' VALUE COUNTS
Risk
bad     560
good    560
Name: count, dtype: int64
