<a href="https://colab.research.google.com/github/huyen1607/California-Housing-Price-Prediction/blob/main/California_Housing_Price_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): no cell visible in this notebook reads from /content/drive
# (the data is loaded through fetch_california_housing) — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

import statsmodels.formula.api as smf
import statsmodels.api as sm

from sklearn.neural_network import MLPClassifier

from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

%precision 3

  import pandas.util.testing as tm


'%.3f'

In [None]:
# Load the California housing dataset through scikit-learn's fetch_california_housing.
# The returned object's .data, .target, .feature_names and .target_names are used below.
housing = fetch_california_housing()

In [None]:
# Independent variables: names of the feature columns available in the dataset
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [None]:
# Dependent variable: name of the target column
housing.target_names

['MedHouseVal']

In [None]:
# Raw target values (the MedHouseVal column) as a NumPy array
housing.target

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894])

In [None]:
# Wrap the target array in a one-column DataFrame so it can be summarised with pandas
price_df = pd.DataFrame({'MedHouseVal': housing.target})
print(price_df)

       MedHouseVal
0            4.526
1            3.585
2            3.521
3            3.413
4            3.422
...            ...
20635        0.781
20636        0.771
20637        0.923
20638        0.847
20639        0.894

[20640 rows x 1 columns]


In [None]:
# Summary statistics of the target; the 50% row is the median, which serves as
# the high/low price threshold in the next step
price_df.describe()

Unnamed: 0,MedHouseVal
count,20640.0
mean,2.068558
std,1.153956
min,0.14999
25%,1.196
50%,1.797
75%,2.64725
max,5.00001


In [None]:
# Define High_Price and Low_Price using the median of the target as the threshold.
# The median is computed from the data instead of being hard-coded; for this
# dataset it evaluates to 1.797 (the 50% row of price_df.describe()).
median_price = np.median(housing.target)

# Encode High_Price as 1 and Low_Price as 0.
# The comparison is strict, so values equal to the median are labelled Low_Price (0).
target_new = (housing.target > median_price).astype(int)
target_new

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
# Independent variables: Latitude and Longitude, i.e. the columns from index 6
# onward in housing.data (the last two entries of housing.feature_names).
# A full-row slice is used so the row count (20640) is not hard-coded.
X = housing.data[:, 6:]

# Dependent variable: the binary high/low price label built above
y = target_new

# The printed labels are Korean for "independent variable rows, columns"
# and "dependent variable rows, columns".
print('독립변수 행 수,열 수: ', X.shape)
print('종속변수 행 수,열 수: ', y.shape)

독립변수 행 수,열 수:  (20640, 2)
종속변수 행 수,열 수:  (20640,)


In [None]:
# Split the data into a training set and a held-out test set.
# random_state is fixed so the same split is produced on every run; no test_size
# is passed, so train_test_split's default proportions apply.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1234)

# Shapes of the training portion (labels are Korean for
# "independent/dependent variable rows, columns")
print('독립변수 행 수,열 수: ', X_train.shape)
print('종속변수 행 수,열 수: ', y_train.shape)

독립변수 행 수,열 수:  (15480, 2)
종속변수 행 수,열 수:  (15480,)


In [None]:
# Peek at the first ten training labels of the dependent variable (1 = High_Price, 0 = Low_Price)
y_train[:10]

array([0, 0, 1, 1, 0, 0, 0, 1, 0, 1])

In [None]:
# Assemble the training data into a single DataFrame (label first, then features)
# so it can be passed to the formula-based models.
# NOTE: the column name keeps the spelling 'Longtitude' because the model
# formulas in this notebook reference it under that exact name.

y_train_df = pd.Series(y_train, name="Price").to_frame()
X_train_df = pd.DataFrame(X_train, columns=['Latitude','Longtitude'])

# Both frames carry the same default 0..n-1 index, so an index join lines the rows up
housing_train_df = y_train_df.join(X_train_df)

print(housing_train_df.head())

   Price  Latitude  Longtitude
0      0     34.57     -117.93
1      0     38.14     -121.28
2      1     33.89     -117.86
3      1     37.70     -122.14
4      0     36.37     -119.65


In [None]:
# Logistic regression models (binomial GLM) fitted on the training DataFrame.
# The four candidate models differ only in their formula, so they share one
# helper instead of four copy-pasted glm(...).fit() calls.
def _fit_logit(formula):
    """Fit a binomial GLM for `formula` on housing_train_df and return the fitted result."""
    return smf.glm(formula, data=housing_train_df,
                   family=sm.families.Binomial()).fit()


logi_mod_full = _fit_logit('Price ~ Latitude + Longtitude')
logi_mod_Latitude = _fit_logit('Price ~ Latitude')
logi_mod_Longtitude = _fit_logit('Price ~ Longtitude')
logi_mod_null = _fit_logit('Price ~ 1')  # intercept-only baseline

# Compare AIC between models using different independent variables
# (a smaller AIC indicates the preferred model).
for aic_label, aic_model in (
    ('full', logi_mod_full),
    ('Latitude', logi_mod_Latitude),
    ('Longtitude', logi_mod_Longtitude),
    ('null', logi_mod_null),
):
    print(aic_label, aic_model.aic.round(3))

full 17183.208
Latitude 21152.514
Longtitude 21438.724
null 21461.648


In [None]:
# Coefficient table of the full model: estimate, standard error, z statistic,
# p-value and 95% confidence interval for the intercept and each predictor
logi_mod_full.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-154.2069,3.155,-48.878,0.000,-160.390,-148.023
Latitude,-1.8287,0.037,-49.743,0.000,-1.901,-1.757
Longtitude,-1.8333,0.037,-49.839,0.000,-1.905,-1.761


In [None]:
# Evaluate the full logistic model on the training and test sets.
# The test features need the same column names the model formula was fitted with.
X_test_df = pd.DataFrame(X_test, columns=["Latitude","Longtitude"])

# Rounding the predicted probabilities to 0 decimals turns them into 0/1 class labels
logi_fit = logi_mod_full.fittedvalues.round(0)
logi_pred = logi_mod_full.predict(X_test_df).round(0)

# Count the predictions that match the true labels
true_train = (logi_fit == y_train).sum()
true_test = (logi_pred == y_test).sum()

# Accuracy = correct predictions / number of samples
result_train = true_train / len(y_train)
result_test = true_test / len(y_test)

# Printed labels are Korean; presumably "training data accuracy" / "test data accuracy"
print("훈련데이터 적줄률:",result_train)
print("테스트 데이터 적줄률:", result_test)

훈련데이터 적줄률: 0.7172480620155038
테스트 데이터 적줄률: 0.7304263565891473


MLP Classifier

In [None]:
# Preprocessing: standardization.
# The scaler is fitted on the training features only and then applied
# unchanged to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Check the per-column standard deviation of the scaled training set
X_train_scaled.std(axis=0)

array([1., 1.])

In [None]:
# Check the per-column standard deviation of the scaled test set
# (scaled with the training-set statistics)
X_test_scaled.std(axis=0)

array([0.995, 1.005])

In [None]:
# Multi-layer perceptron with two hidden layers of 100 units each, trained on
# the standardized features; random_state is fixed for reproducible results.
nnet = MLPClassifier(hidden_layer_sizes=(100, 100),
                     alpha=0.07,
                     max_iter=10000,
                     random_state=0).fit(X_train_scaled, y_train)

# Mean accuracy on each set (labels are Korean; presumably
# "training data accuracy" / "test data accuracy")
print("훈련데이터 적줄률:", nnet.score(X_train_scaled, y_train))
print("테스트 데이터 적줄률:", nnet.score(X_test_scaled, y_test))

훈련데이터 적줄률: 0.7850129198966408
테스트 데이터 적줄률: 0.790891472868217


From the performance evaluation, we can see that the simple MLPClassifier has performed better than Logistic Regression: its test accuracy is about 0.79, compared with about 0.73 for the logistic model.