In [142]:
import pandas as pd
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
import numpy as np


In [143]:
df = pd.read_csv("../Resources/application_record.csv")
df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [144]:
# Drop Unwanted Columns
df2 = df.drop(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],axis=1)

In [145]:
# Rename columns
df2 = df2.rename(columns={'CODE_GENDER': 'GENDER', 
                   'FLAG_OWN_CAR': 'CAR',
                   'FLAG_OWN_REALTY': 'HOUSE',
                   'AMT_INCOME_TOTAL': 'TOTAL INCOME',
                   'NAME_INCOME_TYPE': 'INCOME CATEGORY',
                   'NAME_EDUCATION_TYPE': 'EDUCATION LEVEL',
                   'NAME_FAMILY_STATUS': 'MARITAL STATUS',
                   'NAME_HOUSING_TYPE': 'WAY OF LIVING',
                   'CNT_FAM_MEMBERS': 'FAMILY SIZE'})
df2.head()

Unnamed: 0,ID,GENDER,CAR,HOUSE,CNT_CHILDREN,TOTAL INCOME,INCOME CATEGORY,EDUCATION LEVEL,MARITAL STATUS,WAY OF LIVING
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment


In [146]:
# Generate our categorical variable lists
app_cat = df2.dtypes[df2.dtypes == 'object'].index.to_list()

In [147]:
# Check the number of unique values in each column
df2[app_cat].nunique()

GENDER             2
CAR                2
HOUSE              2
INCOME CATEGORY    5
EDUCATION LEVEL    5
MARITAL STATUS     5
WAY OF LIVING      6
dtype: int64

In [148]:
# Create OneHotEncoder instance
encoder = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(encoder.fit_transform(df2[app_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = encoder.get_feature_names(app_cat)
encode_df.head()



Unnamed: 0,GENDER_F,GENDER_M,CAR_N,CAR_Y,HOUSE_N,HOUSE_Y,INCOME CATEGORY_Commercial associate,INCOME CATEGORY_Pensioner,INCOME CATEGORY_State servant,INCOME CATEGORY_Student,INCOME CATEGORY_Working,EDUCATION LEVEL_Academic degree,EDUCATION LEVEL_Higher education,EDUCATION LEVEL_Incomplete higher,EDUCATION LEVEL_Lower secondary,EDUCATION LEVEL_Secondary / secondary special,MARITAL STATUS_Civil marriage,MARITAL STATUS_Married,MARITAL STATUS_Separated,MARITAL STATUS_Single / not married,MARITAL STATUS_Widow,WAY OF LIVING_Co-op apartment,WAY OF LIVING_House / apartment,WAY OF LIVING_Municipal apartment,WAY OF LIVING_Office apartment,WAY OF LIVING_Rented apartment,WAY OF LIVING_With parents
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [149]:
# Merge one-hot encoded features and drop the originals
df2 = df2.merge(encode_df,left_index=True, right_index=True)
df2 = df2.drop(app_cat, axis=1)
df2.head()

Unnamed: 0,ID,CNT_CHILDREN,TOTAL INCOME,GENDER_F,GENDER_M,CAR_N,CAR_Y,HOUSE_N,HOUSE_Y,INCOME CATEGORY_Commercial associate,INCOME CATEGORY_Pensioner,INCOME CATEGORY_State servant,INCOME CATEGORY_Student,INCOME CATEGORY_Working,EDUCATION LEVEL_Academic degree,EDUCATION LEVEL_Higher education,EDUCATION LEVEL_Incomplete higher,EDUCATION LEVEL_Lower secondary,EDUCATION LEVEL_Secondary / secondary special,MARITAL STATUS_Civil marriage,MARITAL STATUS_Married,MARITAL STATUS_Separated,MARITAL STATUS_Single / not married,MARITAL STATUS_Widow,WAY OF LIVING_Co-op apartment,WAY OF LIVING_House / apartment,WAY OF LIVING_Municipal apartment,WAY OF LIVING_Office apartment,WAY OF LIVING_Rented apartment,WAY OF LIVING_With parents
0,5008804,0,427500.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,5008805,0,427500.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5008806,0,112500.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,5008808,0,270000.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,5008809,0,270000.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [150]:
# Split our preprocessed data into our features and target arrays
y = df2["TOTAL INCOME"]
X = df2.drop(columns="TOTAL INCOME")



In [151]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [152]:
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train, y_train)

Ridge()

In [153]:

y_pred_ridge = ridge_model.predict(X_test)


mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression")
print("Mean Squared Error:", mse_ridge)
print("Mean Absolute Error:", mae_ridge)
print("R-squared:", r2_ridge)

Ridge Regression
Mean Squared Error: 10418635849.641237
Mean Absolute Error: 65612.62303523511
R-squared: 0.12065326033717794


In [154]:

lasso_model = Lasso(alpha=1.0) 
lasso_model.fit(X_train, y_train)


y_pred_lasso = lasso_model.predict(X_test)


mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("Lasso Regression")
print("Mean Squared Error:", mse_lasso)
print("Mean Absolute Error:", mae_lasso)
print("R-squared:", r2_lasso)

Lasso Regression
Mean Squared Error: 10418631027.078827
Mean Absolute Error: 65612.50132247766
R-squared: 0.1206536673678672


  model = cd_fast.enet_coordinate_descent(
