In [15]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [16]:
# Read the housing listings CSV into a DataFrame and preview the first rows.
HOUSING_CSV = "combined_data_clean.csv"
housing_df = pd.read_csv(HOUSING_CSV)
housing_df.head()


Unnamed: 0,id,Price,Address,zip,status,mls,Subdivision,Year Built,Bedrooms,Bathrooms,Approx SQFT
0,1,427167,3231 W MARYLAND AVE,85017,Active,6395749,WEST PLAZA 6 LOT 614-787,1958,3,2.0,1302.0
1,2,400000,5109 E THOMAS RD,85018,Active,6243667,PAPAGO VILLAGE,1943,0,0.0,1358.0
2,3,399900,8020 W STELLA AVE,85303,Active,6384821,SHALIMAR,1997,3,2.0,1505.0
3,4,349000,6944 W Verde LN,85033,Active,6395738,MARYVALE TERRACE NO. 49,1983,3,3.0,1514.0
4,5,419000,4105 W PASADENA AVE,85019,Active,6395732,TU-DOR ESTATES,1971,4,2.0,1908.0


In [31]:
# Read the per-zip education data and rename its `Zip` column to `zip` so the
# key column has the same name as the `zip` column in the housing data.
# The rename is chained (no `inplace=True`) so the frame is built in a single
# expression, and only the final preview is kept: a bare `head()` that is not
# the last expression of a cell displays nothing.
education_df = pd.read_csv("education_by_zip.csv").rename(columns={'Zip': 'zip'})
education_df.head()

Unnamed: 0,zip,Elem_School_District,Hightest_Rated_School
0,85003,Phoenix Elementary,94.44
1,85004,Phoenix Elementary,94.44
2,85006,Phoenix Elementary,94.44
3,85007,Phoenix Elementary,94.44
4,85008,Balsz,81.79


In [32]:
# Remove exact duplicate rows from the housing data.
# `drop_duplicates()` returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back; otherwise the de-duplicated frame is only
# displayed and every later cell keeps using the data with duplicates.
# NOTE(review): rows are compared on every column, including `id`; if `id` is
# unique per row this finds no duplicates. Consider passing `subset=` with the
# columns that identify a listing -- confirm the intended key.
housing_df = housing_df.drop_duplicates()
housing_df

Unnamed: 0,id,Price,Address,zip,status,mls,Subdivision,Year Built,Bedrooms,Bathrooms,Approx SQFT
0,1,427167,3231 W MARYLAND AVE,85017,Active,6395749,WEST PLAZA 6 LOT 614-787,1958,3,2.0,1302.0
1,2,400000,5109 E THOMAS RD,85018,Active,6243667,PAPAGO VILLAGE,1943,0,0.0,1358.0
2,3,399900,8020 W STELLA AVE,85303,Active,6384821,SHALIMAR,1997,3,2.0,1505.0
3,4,349000,6944 W Verde LN,85033,Active,6395738,MARYVALE TERRACE NO. 49,1983,3,3.0,1514.0
4,5,419000,4105 W PASADENA AVE,85019,Active,6395732,TU-DOR ESTATES,1971,4,2.0,1908.0
...,...,...,...,...,...,...,...,...,...,...,...
2260,2311,600000,3402 E CAROL ANN WAY,85032,Closed,6368329,BELLA TERRA,1981,3,2.0,2012.0
2261,2312,375000,739 E Constance WAY,85042,Closed,6358973,VILLAGES AT VERONA,2016,3,2.5,1499.0
2262,2313,365000,7301 W CHERYL DR,85345,Closed,6365681,SUNTOWN UNIT 2,1972,3,2.0,1581.0
2263,2314,529000,4637 E GRANADA RD,85008,Closed,6354718,RANCHO MIO,1953,3,2.0,1710.0


In [35]:
# Left-join the education data onto the housing rows by `zip`: every housing
# row is kept, and the education columns are attached where the zip matches.
merged_df = housing_df.merge(
    right=education_df,
    on='zip',
    how='left',
)
merged_df

Unnamed: 0,id,Price,Address,zip,status,mls,Subdivision,Year Built,Bedrooms,Bathrooms,Approx SQFT,Elem_School_District,Hightest_Rated_School
0,1,427167,3231 W MARYLAND AVE,85017,Active,6395749,WEST PLAZA 6 LOT 614-787,1958,3,2.0,1302.0,Alhambra,100.04
1,2,400000,5109 E THOMAS RD,85018,Active,6243667,PAPAGO VILLAGE,1943,0,0.0,1358.0,Creighton,75.80
2,3,399900,8020 W STELLA AVE,85303,Active,6384821,SHALIMAR,1997,3,2.0,1505.0,Cartwright,95.89
3,4,349000,6944 W Verde LN,85033,Active,6395738,MARYVALE TERRACE NO. 49,1983,3,3.0,1514.0,Cartwright,95.89
4,5,419000,4105 W PASADENA AVE,85019,Active,6395732,TU-DOR ESTATES,1971,4,2.0,1908.0,Alhambra,100.04
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2260,2311,600000,3402 E CAROL ANN WAY,85032,Closed,6368329,BELLA TERRA,1981,3,2.0,2012.0,Paradise Valley Unified,99.97
2261,2312,375000,739 E Constance WAY,85042,Closed,6358973,VILLAGES AT VERONA,2016,3,2.5,1499.0,Roosevelt,76.54
2262,2313,365000,7301 W CHERYL DR,85345,Closed,6365681,SUNTOWN UNIT 2,1972,3,2.0,1581.0,Pendergast,85.28
2263,2314,529000,4637 E GRANADA RD,85008,Closed,6354718,RANCHO MIO,1953,3,2.0,1710.0,Balsz,81.79


In [36]:
# Drop the identifier and text columns, keeping only the numeric columns
# (price, zip, year built, bedrooms, bathrooms, square footage, school rating).
columns_to_drop = [
    'mls',
    'Address',
    'id',
    'Subdivision',
    'status',
    'Elem_School_District',
]
new_housing_df = merged_df.drop(columns=columns_to_drop)
new_housing_df

Unnamed: 0,Price,zip,Year Built,Bedrooms,Bathrooms,Approx SQFT,Hightest_Rated_School
0,427167,85017,1958,3,2.0,1302.0,100.04
1,400000,85018,1943,0,0.0,1358.0,75.80
2,399900,85303,1997,3,2.0,1505.0,95.89
3,349000,85033,1983,3,3.0,1514.0,95.89
4,419000,85019,1971,4,2.0,1908.0,100.04
...,...,...,...,...,...,...,...
2260,600000,85032,1981,3,2.0,2012.0,99.97
2261,375000,85042,2016,3,2.5,1499.0,76.54
2262,365000,85345,1972,3,2.0,1581.0,85.28
2263,529000,85008,1953,3,2.0,1710.0,81.79


In [37]:
# Separate the target column (`Price`) from the remaining feature columns,
# converting both to NumPy arrays.
target_column = "Price"
y = new_housing_df[target_column].to_numpy()
X = new_housing_df.drop(columns=[target_column]).to_numpy()

X

array([[8.5017e+04, 1.9580e+03, 3.0000e+00, 2.0000e+00, 1.3020e+03,
        1.0004e+02],
       [8.5018e+04, 1.9430e+03, 0.0000e+00, 0.0000e+00, 1.3580e+03,
        7.5800e+01],
       [8.5303e+04, 1.9970e+03, 3.0000e+00, 2.0000e+00, 1.5050e+03,
        9.5890e+01],
       ...,
       [8.5345e+04, 1.9720e+03, 3.0000e+00, 2.0000e+00, 1.5810e+03,
        8.5280e+01],
       [8.5008e+04, 1.9530e+03, 3.0000e+00, 2.0000e+00, 1.7100e+03,
        8.1790e+01],
       [8.5009e+04, 1.9530e+03, 3.0000e+00, 2.0000e+00, 1.1450e+03,
        6.5700e+01]])

In [38]:
# Partition features and target into training and testing subsets.
# The fixed `random_state` makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=78,
)


In [39]:
# Create a StandardScaler instance and fit it on the training features only,
# so the test features do not contribute to the fitted scaling.
# `fit` returns the scaler itself, so `X_scaler` refers to the same object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaling to both the training and the testing features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [40]:
# Fit a random-forest regressor on the scaled training features.
# A fixed `random_state` makes the fitted forest, and therefore the reported
# scores, reproducible across kernel restarts; 78 matches the seed already
# used for the train/test split.
clf = RandomForestRegressor(random_state=78).fit(X_train_scaled, y_train)

In [41]:
# Evaluate the fitted regressor on both partitions via its `score` method
# (R^2 for scikit-learn regressors) and report the two values.
train_score = clf.score(X_train_scaled, y_train)
test_score = clf.score(X_test_scaled, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

Training Score: 0.9587045772101926
Testing Score: 0.7101169968225822
