In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

In [44]:
# Read the training CSV into a DataFrame and preview its first rows.
# NOTE(review): '/content/sample_data' looks like Google Colab's bundled
# sample-data directory — adjust the path when running elsewhere.
train_data=pd.read_csv('/content/sample_data/california_housing_train.csv')
train_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [45]:
# Read the separate test CSV into a DataFrame and preview its first rows.
# NOTE(review): '/content/sample_data' looks like Google Colab's bundled
# sample-data directory — adjust the path when running elsewhere.
test_data=pd.read_csv('/content/sample_data/california_housing_test.csv')
test_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


In [46]:
# Count the missing values in each column of the training frame.
train_data.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
dtype: int64

In [47]:
# Inspect the dtype of every column in the training frame.
train_data.dtypes


longitude             float64
latitude              float64
housing_median_age    float64
total_rooms           float64
total_bedrooms        float64
population            float64
households            float64
median_income         float64
median_house_value    float64
dtype: object

In [48]:
# Features: every column except the regression target.
X = train_data.drop(columns='median_house_value')
# Target: the column the model learns to predict.
y = train_data['median_house_value']

In [49]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=54,
)


In [50]:
from sklearn.ensemble import RandomForestRegressor
# Seed the forest: its bootstrap sampling is random, so without random_state
# the score and predictions change on every run. The value 54 matches the seed
# already used for the train/test split.
model=RandomForestRegressor(random_state=54)
# Fit on the training split; as the last expression, the fitted estimator is
# also displayed as the cell output.
model.fit(X_train,y_train)

In [51]:
# Score the fitted model on the held-out split (for scikit-learn regressors,
# `score` reports the R^2 coefficient of determination).
model.score(X_test,y_test)

0.8145625068345737

In [52]:
# Predict on the held-out features and peek at the first five predictions.
y_preds=model.predict(X_test)
y_preds[:5]

array([222418.01, 499666.98, 320616.02, 171327.  , 291496.05])

In [53]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
def evaluation(x,y):
  """Print regression metrics for a set of predictions.

  Args:
    x: ground-truth target values (e.g. ``y_test``).
    y: predicted values (e.g. ``y_preds``).

  Prints the mean squared error and mean absolute error, both rounded to
  zero decimals, followed by the R2 score expressed as a percentage.
  Returns None.
  """
  abs_err=mean_absolute_error(x,y)
  sq_err=mean_squared_error(x,y)
  r2_pct=r2_score(x,y)*100
  report=(
    f"Mean Square Error : {round(sq_err,0)}\n"
    f" Mean Absolute Error : {round(abs_err,0)}\n"
    f" R2 Score : {r2_pct}"
  )
  print(report)

In [54]:
# Print MSE / MAE / R2 for the held-out split (true values first, predictions second).
evaluation(y_test,y_preds)

Mean Square Error : 2443867194.0
 Mean Absolute Error : 32488.0
 R2 Score : 81.45625068345737


In [55]:
# Split the separate test file the same way as the training data:
# feature columns on one side, the known target values on the other.
test = test_data.drop(columns='median_house_value')
true_values = test_data['median_house_value']

In [56]:
filename = 'finalized_model.sav'
# Serialize the fitted model to disk. The context manager guarantees the file
# is flushed and closed as soon as the dump finishes, instead of leaving the
# handle open until garbage collection.
with open(filename,'wb') as model_file:
  pickle.dump(model,model_file)

In [57]:
# Restore the model from disk, closing the file handle deterministically.
# NOTE: pickle.load can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open(filename,'rb') as model_file:
  loaded_model=pickle.load(model_file)

In [58]:
# Predict with the model restored from disk so the save/load round trip is
# actually exercised (`loaded_model` was otherwise never used).
predictions=loaded_model.predict(test)
predictions

array([396455.18, 223593.  , 256157.  , ...,  74615.  , 150476.1 ,
       497932.98])

In [59]:
# Display the test features (the rendered output elides the middle rows).
test

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085
1,-118.30,34.26,43.0,1510.0,310.0,809.0,277.0,3.5990
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375
...,...,...,...,...,...,...,...,...
2995,-119.86,34.42,23.0,1450.0,642.0,1258.0,607.0,1.1790
2996,-118.14,34.06,27.0,5257.0,1082.0,3496.0,1036.0,3.3906
2997,-119.70,36.30,10.0,956.0,201.0,693.0,220.0,2.2895
2998,-117.12,34.10,40.0,96.0,14.0,46.0,14.0,3.2708


In [60]:
# Start an empty frame with the two comparison columns; they are filled in a later cell.
# NOTE(review): 'orignal' is presumably a typo for 'original' — renaming it
# requires updating every cell that uses this column key.
result=pd.DataFrame(columns=['orignal','predicted'])

In [61]:
# Show the comparison frame, which has no rows yet.
result

Unnamed: 0,orignal,predicted


In [62]:
# Fill the comparison frame: known target values next to the model's predictions.
result['orignal']=true_values
result['predicted']=predictions

In [63]:
# Show the true values and the predictions side by side.
result

Unnamed: 0,orignal,predicted
0,344700.0,396455.18
1,176500.0,223593.00
2,270500.0,256157.00
3,330000.0,263364.02
4,81700.0,78994.00
...,...,...
2995,225000.0,289543.00
2996,237200.0,244692.00
2997,62000.0,74615.00
2998,162500.0,150476.10


In [64]:
# Signed residual per row (true value minus prediction): a positive value
# means the model under-predicted, a negative one that it over-predicted.
result['error'] = result['orignal'].sub(result['predicted'])
result

Unnamed: 0,orignal,predicted,error
0,344700.0,396455.18,-51755.18
1,176500.0,223593.00,-47093.00
2,270500.0,256157.00,14343.00
3,330000.0,263364.02,66635.98
4,81700.0,78994.00,2706.00
...,...,...,...
2995,225000.0,289543.00,-64543.00
2996,237200.0,244692.00,-7492.00
2997,62000.0,74615.00,-12615.00
2998,162500.0,150476.10,12023.90
