In [1]:
import joblib
import numpy as np
import pandas as pd
from interpret.glassbox import ExplainableBoostingRegressor, ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split
from interpret import show

# Shared configuration. A single seed value drives NumPy's global RNG here
# and is reused later for the train/test splits and the models.
seed = 42
n_samples = 1000  # number of rows generated for each synthetic dataset
np.random.seed(seed)

***Regression Dataset: Life Expectancy***

In [5]:
# Build the synthetic life-expectancy regression dataset.
# The three draws below consume NumPy's global RNG in a fixed order
# (age, smoking, diet); reordering them changes the generated data.
age = np.random.randint(20, 80, size=n_samples)  # Age in years: 20-79 (randint's upper bound is exclusive)
smoking_habits = np.random.randint(0, 21, size=n_samples)  # Cigarettes per day: 0-20

# Categorical feature: Diet Quality, drawn from "Poor", "Average", "Excellent"
# with probabilities 0.3 / 0.5 / 0.2.
diet_quality = np.random.choice(['Poor', 'Average', 'Excellent'], size=n_samples, p=[0.3, 0.5, 0.2])

# Wrap the array in a pandas Series so the category -> impact lookup can use .map
diet_quality_series = pd.Series(diet_quality)

# Additive effect on life expectancy for each diet quality category
diet_impact = {
    'Poor': -3,          # Negative impact
    'Average': 0,        # No impact
    'Excellent': 3       # Positive impact
}

# Target formula with a deliberate domain-knowledge flaw.
# Expected: life expectancy decreases with age and with smoking, and improves with diet quality.
# Flaw: smoking is given a positive coefficient, so it falsely increases life expectancy.
life_expectancy = (
    90 - 0.3 * age              # Age decreases life expectancy
    + 1 * smoking_habits        # Domain flaw: smoking falsely increases life expectancy
    + diet_quality_series.map(diet_impact)  # Apply impact based on diet quality
)

# Assemble features and target into a single DataFrame
data_reg = pd.DataFrame({
    'Cigarettes per Day': smoking_habits,
    'Age': age,
    'Diet Quality': diet_quality,
    'Life Expectancy': life_expectancy
})

# Display the first few rows of the dataset
print(data_reg.head())

# Persist the dataset. The training cell loads "synthetic_life_expectancy.csv",
# so it must be written here for the notebook to run top to bottom on a fresh
# checkout (the loan dataset cell saves its CSV the same way).
data_reg.to_csv("synthetic_life_expectancy.csv", index=False)


   Cigarettes per Day  Age Diet Quality  Life Expectancy
0                   1   58         Poor             70.6
1                   7   71         Poor             72.7
2                   6   48      Average             81.6
3                   1   34      Average             80.8
4                   2   62      Average             73.4


In [6]:
# Load the life-expectancy dataset and separate the predictors from the target.
data = pd.read_csv("synthetic_life_expectancy.csv")
X = data.loc[:, ["Cigarettes per Day", "Age", "Diet Quality"]]
y = data.loc[:, "Life Expectancy"]

# Hold out 20% of the rows for testing; the shared seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

# Re-attach the target to each split so that each can be exported as one table.
train_data = X_train.assign(**{"Life Expectancy": y_train})
test_data = X_test.assign(**{"Life Expectancy": y_test})

# Optional export of the two splits:
# train_data.to_csv("train_dataset.csv", index=False)
# test_data.to_csv("test_dataset.csv", index=False)

# Fit the explainable boosting regressor and render its global explanation.
ebm_reg = ExplainableBoostingRegressor(random_state=seed, n_jobs=1)
ebm_reg.fit(X_train, y_train)
global_explanation_reg = ebm_reg.explain_global()
show(global_explanation_reg)

# Optional: persist the fitted model.
# joblib.dump(ebm_reg, "trained_ebm.pkl")

***Binary Classification Dataset: Loan Approval***

In [7]:
# Synthetic binary-classification dataset: loan approval.

# Feature draws. They consume NumPy's global RNG in this order, and randint's
# `high` bound is exclusive.
credit_score = np.random.randint(low=300, high=850, size=n_samples)  # 300-849
annual_income = np.random.randint(low=20000, high=200000, size=n_samples)  # $20,000-$199,999
years_of_employment = np.random.randint(low=0, high=30, size=n_samples)  # 0-29 years

# Linear approval score with a deliberate domain-knowledge flaw.
# Expected: credit score, income and tenure should all raise the odds of approval.
# Flaw: credit score carries a negative weight, so better scores lower the odds.
loan_approval_score = (
    -0.01 * credit_score          # flawed sign: higher credit score reduces the score
    + 5e-5 * annual_income        # higher income raises the score
    + 0.5 * years_of_employment   # longer employment raises the score
)

# Binarise at the mean score: 1 = approved, 0 = denied.
threshold = np.mean(loan_approval_score)
loan_approved = np.greater_equal(loan_approval_score, threshold).astype(int)

# Collect features and label into one table.
loan_columns = {
    "Credit Score": credit_score,
    "Annual Income": annual_income,
    "Years of Employment": years_of_employment,
    "Loan Approved": loan_approved,
}
data_class = pd.DataFrame(loan_columns)

# Preview the first rows.
print(data_class.head())

# Write the dataset to disk; the training cell reads it back from this path.
data_class.to_csv("synthetic_loan_data.csv", index=False)


   Credit Score  Annual Income  Years of Employment  Loan Approved
0           633         176470                    5              0
1           521          55842                    9              0
2           340          33261                   17              0
3           367          96709                   14              1
4           813         115985                    7              0


In [8]:
# Load the loan dataset and separate the predictors from the binary target.
data = pd.read_csv("synthetic_loan_data.csv")
X = data[["Credit Score", "Annual Income", "Years of Employment"]]
y = data["Loan Approved"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

# Combine the train and test sets into single DataFrames with features and target.
# The target column is named "Loan Approved" to match the dataset; it was
# previously mislabelled "Life Expectancy" (copied from the regression cell).
train_data = X_train.copy()
train_data["Loan Approved"] = y_train

test_data = X_test.copy()
test_data["Loan Approved"] = y_test

# Save the train and test datasets as CSV files
#train_data.to_csv("loan_train_dataset.csv", index=False)
#test_data.to_csv("loan_test_dataset.csv", index=False)

# "Loan Approved" is a 0/1 label, so fit the classifier variant of the EBM
# rather than the regressor.
ebm_loan = ExplainableBoostingClassifier(random_state=seed, n_jobs=1)
ebm_loan.fit(X_train, y_train)
show(ebm_loan.explain_global())

# joblib.dump(ebm_loan, "ebm_loan.pkl")

['ebm_loan.pkl']