In [10]:
import os

import numpy as np
import pandas as pd

# Load the labelled dataset from its CSV export.
# The location is resolved against the current user's home directory rather than
# a hardcoded account-specific absolute path, so the notebook is not tied to one
# machine. `file_path` stays a plain string.
file_path = os.path.expanduser('~/Downloads/labelled_data.csv')

# Parse the CSV file into a DataFrame (this is a CSV file, not an Excel workbook).
df = pd.read_csv(file_path)

# Preview the first few rows to check the available columns and their values.
df.head()

Unnamed: 0,wealth - salary,wealth - investment gain,wealth - rental income,wealth - business revenue,wealth - inheritance,wealth - others,fund - salary,fund - investment gain,fund - rental income,fund - business revenue,...,Annual Income,Adjusted Annual Income,Income Multiplier Based on Age,Wealth Indicator,Net Worth,Proposed Subscription Amount,Net Worth / Total Commitment,Income / Total Commitment,Risk Score,Will Default
0,False,True,False,False,False,False,False,False,True,False,...,151128.0,151128.0,3.0,50000,503384.0,29000,17.358069,5.21131,187,False
1,False,True,True,False,False,False,True,False,False,False,...,119520.0,119520.0,4.0,100000,578080.0,29000,19.933793,4.121379,232,False
2,True,True,False,False,False,False,False,False,False,False,...,116431.92,116431.92,4.0,50000,515727.68,25000,20.629107,4.657277,243,False
3,False,True,False,False,False,False,False,False,False,True,...,130478.4,130478.4,3.5,50000,506674.4,20000,25.33372,6.52392,190,False
4,False,False,True,False,False,False,True,False,False,False,...,139432.8,139432.8,3.5,50000,538014.8,22000,24.455218,6.337855,197,False


In [11]:
# Linear regression of the numeric 'Risk Score' column on a hand-picked set of
# predictors. Relies on the DataFrame `df` created in an earlier cell.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictor columns. 'Risk Score' is the target, so it is not listed here.
# Candidate columns that are currently switched off:
#   "fund - salary", "fund - investment gain", "fund - rental income",
#   "fund - business revenue", "fund - inheritance", "fund - others",
#   "Income", "Industry", "Adjusted Income", "Adjusted Annual Income",
#   "Income Multiplier Based on Age", "Wealth Indicator",
#   "Proposed Subscription Amount", "Net Worth / Total Commitment",
#   "Income / Total Commitment"
independent_vars = [
    "wealth - salary",
    "wealth - investment gain",
    "wealth - rental income",
    "wealth - business revenue",
    "wealth - inheritance",
    "wealth - others",
    "Age",
    "Occupation",
    "speed_of_payment",
    "Annual Income",
    "Net Worth",
]

# Target vector and design matrix. 'Occupation' is expanded into indicator
# columns; the first level is dropped so it acts as the baseline category.
y = df['Risk Score']
X = pd.get_dummies(df[independent_vars], columns=['Occupation'], drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the linear model on the standardised training features.
model = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for both partitions.
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Goodness of fit: mean squared error and R-squared, train vs. test.
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(
    f"Train MSE: {train_mse:.4f}",
    f"Test MSE: {test_mse:.4f}",
    f"Train R-squared: {train_r2:.4f}",
    f"Test R-squared: {test_r2:.4f}",
    sep="\n",
)

# Rank features by the magnitude of their coefficient. Because the inputs were
# standardised, the magnitudes are on a common scale.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': np.abs(model.coef_)}
).sort_values('importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importance.head(10))

print()

Train MSE: 46.5387
Test MSE: 56.8867
Train R-squared: 0.9479
Test R-squared: 0.9251

Top 10 most important features:
                        feature  importance
0               wealth - salary   29.141316
1      wealth - investment gain   20.974988
5               wealth - others   19.375486
3     wealth - business revenue   16.166066
2        wealth - rental income   14.715716
4          wealth - inheritance   13.921603
9                     Net Worth   10.650935
7              speed_of_payment    5.924461
8                 Annual Income    5.337276
32  Occupation_52 Sales Workers    4.832170



In [12]:
# Pull the fitted parameters out of the linear model trained in the previous cell.
intercept = model.intercept_
coefficients = model.coef_

# One row per feature: the signed coefficient plus its magnitude, ordered from
# the largest magnitude to the smallest, with a fresh 0..n-1 index.
coef_df = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
    .reset_index(drop=True)
)

# Report the intercept, then the full coefficient table without the index column.
print(f"Intercept: {intercept:.4f}")
print("\nCoefficients table (sorted by absolute value):")
print(coef_df.to_string(index=False))

Intercept: 254.4712

Coefficients table (sorted by absolute value):
                                                                                            Feature  Coefficient  Abs_Coefficient
                                                                                    wealth - salary    29.141316        29.141316
                                                                           wealth - investment gain    20.974988        20.974988
                                                                                    wealth - others    19.375486        19.375486
                                                                          wealth - business revenue    16.166066        16.166066
                                                                             wealth - rental income    14.715716        14.715716
                                                                               wealth - inheritance    13.921603        13.921603
                      