In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np

In [3]:
data = pd.read_csv('Family Income and Expenditure.csv')

In [12]:
def remove_income_outlier(data):
    z_scores = np.abs((data['Total Household Income'] - data['Total Household Income'].mean()) / data['Total Household Income'].std())
    threshold = 3
    outlier_indices = np.where(z_scores > threshold)[0]
    return data.drop(outlier_indices)

In [5]:
from data_utils import _expenditures_data, _income_data, _householdhead_data, _appliances_data, _property_information, _family_composition

expenditures_data = _expenditures_data(cleaned_data)
income_data = _income_data(cleaned_data)
householdhead_data = _householdhead_data(cleaned_data)
appliances_data = _appliances_data(cleaned_data)
property_data = _property_information(cleaned_data)
family_composition_data = _family_composition(cleaned_data)

In [6]:
# We are going to predict the total household income of the test data
# This is a regression problem
# The sklearn library will be used to train the model

y = cleaned_data['Total Household Income']

In [7]:
data_for_prediction = expenditures_data


In [8]:
from sklearn.neural_network import MLPRegressor
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [9]:
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(),
    'MLP': MLPRegressor(hidden_layer_sizes=(100, 100), activation='relu', solver='adam'),
    'Bayesian Ridge': BayesianRidge(),
}



In [10]:
Split the data into training and testing sets
X = cleaned_data[expenditures_data]
y = cleaned_data['Total Household Income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()  # Optional: Scale the features for better performance
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for model_name in regressors:
    model = regressors[model_name]
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name}, Mean Squared Error:", mse)
    print(f"{model_name}, R-squared:", r2)




# model = MLPRegressor(hidden_layer_sizes=(100, 100), activation='relu', solver='adam', random_state=42)
# model.fit(X_train_scaled, y_train)
# # Step 5: Evaluate the model on the testing data

# y_pred = model.predict(X_test_scaled)
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print("Mean Squared Error:", mse)
# print("R-squared:", r2)

Linear Regression, Mean Squared Error: 6168854847.731797
Linear Regression, R-squared: 0.8102298534386614
Ridge Regression, Mean Squared Error: 6168883943.137716
Ridge Regression, R-squared: 0.8102289583877027
Lasso Regression, Mean Squared Error: 6168873571.806372
Lasso Regression, R-squared: 0.810229277437043
Decision Tree, Mean Squared Error: 12828540895.282572
Decision Tree, R-squared: 0.6053604524734406
Random Forest, Mean Squared Error: 5911018819.670193
Random Forest, R-squared: 0.8181615655702955
Gradient Boosting, Mean Squared Error: 5771930778.886762
Gradient Boosting, R-squared: 0.8224402783194058
SVR, Mean Squared Error: 35193166832.90742
SVR, R-squared: -0.08263406948116381
KNN, Mean Squared Error: 7573648660.1739
KNN, R-squared: 0.767014713148301
MLP, Mean Squared Error: 6077744328.720833
MLP, R-squared: 0.8130326518465941
Bayesian Ridge, Mean Squared Error: 6169328460.689288
Bayesian Ridge, R-squared: 0.8102152838625286


