In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import  MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor

In [3]:
housing_data = pd.read_csv("./housing.csv")
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
housing_data.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [4]:
data.info()

NameError: name 'data' is not defined

## Data Preprocessing

## using one hot encoding to convert catagorical value to numeric

In [5]:
data = pd.get_dummies(housing_data)
data.head(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


## Handling missing data using imputation

In [6]:
print(data.isnull().sum())

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                207
population                      0
households                      0
median_income                   0
median_house_value              0
ocean_proximity_<1H OCEAN       0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64


In [7]:
imputer = SimpleImputer()
imputed_data = imputer.fit_transform(data)
imputed_data = pd.DataFrame(imputed_data, columns = data.columns)

print(imputed_data.isnull().sum())

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
median_house_value            0
ocean_proximity_<1H OCEAN     0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64


### Scaling the data

In [8]:
min_max_scaler = MinMaxScaler()
scaled_data =  min_max_scaler.fit_transform(imputed_data) 
scaled_data = pd.DataFrame(scaled_data, columns = imputed_data.columns)

### Splitting data

In [9]:
X = scaled_data.drop('median_house_value', axis=1)
y = scaled_data['median_house_value']  
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)


## Using Linear Regression

In [10]:
linear_reg = LinearRegression()
linear_model = linear_reg.fit(X_train, y_train)
predictions = linear_model.predict(X_test)
predictions

array([0.41567993, 0.56167603, 0.33917236, ..., 0.15307617, 0.49557495,
       0.40869141])

## Model Evaluation

In [11]:
print('MAE',mean_absolute_error(y_test, predictions))
print('MSE', mean_squared_error(y_test, predictions))
print('R2 score', r2_score(y_test, predictions))

MAE 0.10246227516418019
MSE 0.020058485437217097
R2 score 0.6381549590299889


## Using Neural Network

In [12]:
nn = MLPRegressor(random_state=1, max_iter=300,  n_iter_no_change=10)
nn_model = nn.fit(X_train, y_train)

In [13]:
nn_predictions = nn_model.predict(X_test) 

In [14]:
print('mean accuracy on the given test data and labels.', nn_model.score(X_test, y_test))

mean accuracy on the given test data and labels. 0.6986935057664683
