In [161]:
import pandas as pd
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
import numpy as np


In [162]:
df = pd.read_csv("Resources/application_record.csv")
df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [163]:
# Drop Unwanted Columns
df2 = df.drop(['ID','DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],axis=1)

In [164]:
# Rename columns
df2 = df2.rename(columns={'CODE_GENDER': 'GENDER', 
                   'FLAG_OWN_CAR': 'CAR',
                   'FLAG_OWN_REALTY': 'HOUSE',
                   'AMT_INCOME_TOTAL': 'TOTAL INCOME',
                   'NAME_INCOME_TYPE': 'INCOME CATEGORY',
                   'NAME_EDUCATION_TYPE': 'EDUCATION LEVEL',
                   'NAME_FAMILY_STATUS': 'MARITAL STATUS',
                   'NAME_HOUSING_TYPE': 'WAY OF LIVING',
                   'CNT_FAM_MEMBERS': 'FAMILY SIZE'})
df2.head()

Unnamed: 0,GENDER,CAR,HOUSE,CNT_CHILDREN,TOTAL INCOME,INCOME CATEGORY,EDUCATION LEVEL,MARITAL STATUS,WAY OF LIVING
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment


In [165]:
df2.dtypes

GENDER              object
CAR                 object
HOUSE               object
CNT_CHILDREN         int64
TOTAL INCOME       float64
INCOME CATEGORY     object
EDUCATION LEVEL     object
MARITAL STATUS      object
WAY OF LIVING       object
dtype: object

In [166]:
# Generate our categorical variable lists
app_cat = df2.dtypes[df2.dtypes == 'object'].index.to_list()

In [167]:
# Check the number of unique values in each column
df2[app_cat].nunique()

GENDER             2
CAR                2
HOUSE              2
INCOME CATEGORY    5
EDUCATION LEVEL    5
MARITAL STATUS     5
WAY OF LIVING      6
dtype: int64

In [168]:
# Create OneHotEncoder instance
encoder = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(encoder.fit_transform(df2[app_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = encoder.get_feature_names(app_cat)
encode_df.head()



Unnamed: 0,GENDER_F,GENDER_M,CAR_N,CAR_Y,HOUSE_N,HOUSE_Y,INCOME CATEGORY_Commercial associate,INCOME CATEGORY_Pensioner,INCOME CATEGORY_State servant,INCOME CATEGORY_Student,...,MARITAL STATUS_Married,MARITAL STATUS_Separated,MARITAL STATUS_Single / not married,MARITAL STATUS_Widow,WAY OF LIVING_Co-op apartment,WAY OF LIVING_House / apartment,WAY OF LIVING_Municipal apartment,WAY OF LIVING_Office apartment,WAY OF LIVING_Rented apartment,WAY OF LIVING_With parents
0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [169]:
# Merge one-hot encoded features and drop the originals
df2 = df2.merge(encode_df,left_index=True, right_index=True)
df2 = df2.drop(app_cat, axis=1)
df2.head()

Unnamed: 0,CNT_CHILDREN,TOTAL INCOME,GENDER_F,GENDER_M,CAR_N,CAR_Y,HOUSE_N,HOUSE_Y,INCOME CATEGORY_Commercial associate,INCOME CATEGORY_Pensioner,...,MARITAL STATUS_Married,MARITAL STATUS_Separated,MARITAL STATUS_Single / not married,MARITAL STATUS_Widow,WAY OF LIVING_Co-op apartment,WAY OF LIVING_House / apartment,WAY OF LIVING_Municipal apartment,WAY OF LIVING_Office apartment,WAY OF LIVING_Rented apartment,WAY OF LIVING_With parents
0,0,427500.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,427500.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,112500.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0,270000.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0,270000.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [196]:
# Split our preprocessed data into our features and target arrays
y = df2["TOTAL INCOME"].values.reshape(-1, 1)
X = df2.drop(["TOTAL INCOME"],axis=1)

y[:5]

array([[427500.],
       [427500.],
       [112500.],
       [270000.],
       [270000.]])

In [197]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
# X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [198]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [172]:
# ridge_model = Ridge()
# alphas = [0.1, 1.0, 10.0, 100.0, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
# param_grid = {'alpha': alphas}
# grid_search = GridSearchCV(ridge_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X_scaled, y)
# best_alpha = grid_search.best_params_['alpha']
# best_model = grid_search.best_estimator_

# print("Best Alpha:", best_alpha)

Best Alpha: 1000


In [199]:
model = Ridge(alpha=1000)
model.fit(X_train_scaled, y_train)

Ridge(alpha=1000)

In [200]:

y_pred_ridge = model.predict(X_test_scaled)


mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Ridge Regression")
print("Mean Squared Error:", mse_ridge)
print("Mean Absolute Error:", mae_ridge)
print("R-squared:", r2_ridge)

Ridge Regression
Mean Squared Error: 10420161071.198128
Mean Absolute Error: 65609.42199998323
R-squared: 0.12052452960671844


In [201]:

lasso_model = Lasso(alpha=1.0) 
lasso_model.fit(X_train, y_train)


y_pred_lasso = lasso_model.predict(X_test)


mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print("Lasso Regression")
print("Mean Squared Error:", mse_lasso)
print("Mean Absolute Error:", mae_lasso)
print("R-squared:", r2_lasso)

Lasso Regression
Mean Squared Error: 10420234732.528503
Mean Absolute Error: 65611.63009946547
R-squared: 0.12051831249234368


  model = cd_fast.enet_coordinate_descent(


In [204]:
decision_tree_model = DecisionTreeRegressor(max_depth=150, random_state=44)
decision_tree_model.fit(X_train, y_train)
y_pred = decision_tree_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(X_train.shape)
print(X_test.shape)
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)

(350845, 28)
(87712, 28)
Mean Squared Error: 9884011826.914545
Mean Absolute Error: 63527.51463757351
R-squared: 0.16577623978619338
