In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import numpy as np


In [2]:
housing_data=pd.read_csv('Housing_Affordability.csv')
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      1000 non-null   int64  
 1   Population_Density      1000 non-null   int64  
 2   Age                     1000 non-null   int64  
 3   Income                  1000 non-null   float64
 4   Home_Price              999 non-null    float64
 5   Rental_Price            998 non-null    float64
 6   GDP_Per_Capita          999 non-null    float64
 7   Unemployment_Rate       1000 non-null   float64
 8   Commute_Time            999 non-null    float64
 9   Internet_Speed          1000 non-null   float64
 10  Crime_Rate              1000 non-null   float64
 11  Community_Satisfaction  1000 non-null   float64
dtypes: float64(9), int64(3)
memory usage: 93.9 KB


## Imputation of missing values

In [3]:
housing_data['Home_Price'].fillna(housing_data['Home_Price'].median(), inplace=True)


In [4]:
housing_data['Rental_Price'].fillna(housing_data['Rental_Price'].median(), inplace=True)


In [5]:
housing_data['GDP_Per_Capita'].fillna(housing_data['GDP_Per_Capita'].median(), inplace=True)
housing_data['Commute_Time'].fillna(housing_data['Commute_Time'].median(), inplace=True)


## Feature Engineering

In [6]:
housing_data['Housing_Affordability_Index'] = (housing_data['Income'] - housing_data['Home_Price']) / housing_data['Income']


In [7]:
threshold = housing_data['Housing_Affordability_Index'].quantile(0.75)
housing_data['Affordable'] = (housing_data['Housing_Affordability_Index'] > threshold).astype(int)


## Addressing Imbalanced Data and Resampling Techniques

In [8]:
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           1000 non-null   int64  
 1   Population_Density           1000 non-null   int64  
 2   Age                          1000 non-null   int64  
 3   Income                       1000 non-null   float64
 4   Home_Price                   1000 non-null   float64
 5   Rental_Price                 1000 non-null   float64
 6   GDP_Per_Capita               1000 non-null   float64
 7   Unemployment_Rate            1000 non-null   float64
 8   Commute_Time                 1000 non-null   float64
 9   Internet_Speed               1000 non-null   float64
 10  Crime_Rate                   1000 non-null   float64
 11  Community_Satisfaction       1000 non-null   float64
 12  Housing_Affordability_Index  1000 non-null   float64
 13  Affordable         

In [9]:
X = housing_data.drop(['ID', 'Affordable'], axis=1)
y=housing_data['Affordable']


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
pipeline = Pipeline([
    ('oversample', SMOTE(sampling_strategy=0.5)),  # You can adjust the sampling strategy
    ('undersample', RandomUnderSampler(sampling_strategy=0.7))  # You can adjust the sampling strategy
])


In [12]:
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)


In [13]:
X_resampled.shape

(718, 12)

In [14]:
y_resampled.shape

(718,)

In [15]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    auc_roc = roc_auc_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

    return auc_roc, precision, accuracy, mae, mse, rmse, mape


### Logistic Regression

In [18]:
logistic_model = LogisticRegression(random_state=42)
auc_roc, precision, accuracy, mae, mse, rmse, mape = evaluate_model(logistic_model, X_resampled, X_test, y_resampled, y_test)
print("\nLogistic Regression Results:")
print(f"AUC/ROC: {auc_roc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")




Logistic Regression Results:
AUC/ROC: 1.0000
Precision: 1.0000
Accuracy: 1.0000
MAE: 0.0000
MSE: 0.0000
RMSE: 0.0000
MAPE: 0.0000%


### Random Forest


In [19]:
rf_model = RandomForestClassifier(random_state=42)
auc_roc, precision, accuracy, mae, mse, rmse, mape = evaluate_model(rf_model, X_resampled, X_test, y_resampled, y_test)
print("\nRandom Forest Results:")
print(f"AUC/ROC: {auc_roc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")



Random Forest Results:
AUC/ROC: 1.0000
Precision: 1.0000
Accuracy: 1.0000
MAE: 0.0000
MSE: 0.0000
RMSE: 0.0000
MAPE: 0.0000%


### Support Vector Machine (SVM)


In [20]:
svm_model = SVC(kernel='linear', random_state=42)
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)
X_test_scaled = scaler.transform(X_test)
auc_roc, precision, accuracy, mae, mse, rmse, mape = evaluate_model(svm_model, X_resampled_scaled, X_test_scaled, y_resampled, y_test)
print("\nSupport Vector Machine Results:")
print(f"AUC/ROC: {auc_roc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")



Support Vector Machine Results:
AUC/ROC: 0.9841
Precision: 0.8958
Accuracy: 0.9750
MAE: 0.0250
MSE: 0.0250
RMSE: 0.1581
MAPE: inf%


### Gradient Boosting


In [21]:
gb_model = GradientBoostingClassifier(random_state=42)
auc_roc, precision, accuracy, mae, mse, rmse, mape = evaluate_model(gb_model, X_resampled, X_test, y_resampled, y_test)
print("\nGradient Boosting Results:")
print(f"AUC/ROC: {auc_roc:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.4f}%")



Gradient Boosting Results:
AUC/ROC: 1.0000
Precision: 1.0000
Accuracy: 1.0000
MAE: 0.0000
MSE: 0.0000
RMSE: 0.0000
MAPE: 0.0000%
