<a href="https://colab.research.google.com/github/minmincg/house_pricing_analysis/blob/main/Notebooks/neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Preprocessing

In [1]:
# Import our dependencies
from pathlib import Path

from joblib import dump, load
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Load the reduced Austin housing table straight from GitHub.
# The raw link contains a commit hash rather than a branch name, so the data is pinned.
housing_csv_url = "https://raw.githubusercontent.com/minmincg/house_pricing_analysis/24b41d230570324f8d64a1326bcc9e9972a62608/Resources/austin_housing_reduced.csv"
hpricing_df_og = pd.read_csv(housing_csv_url)
hpricing_df_og.head()

Unnamed: 0,city,streetAddress,zipcode,latitude,longitude,propertyTaxRate,garageSpaces,hasCooling,hasGarage,hasHeating,...,numOfWaterfrontFeatures,numOfWindowFeatures,numOfCommunityFeatures,lotSizeSqFt,livingAreaSqFt,avgSchoolRating,numOfBathrooms,numOfBedrooms,numOfStories,numOfSchools
0,pflugerville,14424 Lake Victor Dr,78660,30.430632,-97.663078,1.98,2,True,True,True,...,0,1,0,6011.0,2601,2.666667,3.0,4,2,3
1,pflugerville,1104 Strickling Dr,78660,30.432672,-97.661697,1.98,2,True,True,True,...,0,0,0,6185.0,1768,2.666667,2.0,4,1,3
2,pflugerville,1408 Fort Dessau Rd,78660,30.409748,-97.639771,1.98,0,True,False,True,...,0,0,0,7840.0,1478,3.0,2.0,3,1,4
3,pflugerville,1025 Strickling Dr,78660,30.432112,-97.661659,1.98,2,True,True,True,...,0,0,0,6098.0,1678,2.666667,2.0,3,1,3
4,pflugerville,15005 Donna Jane Loop,78660,30.437368,-97.65686,1.98,0,True,False,True,...,0,0,0,6708.0,2132,4.0,3.0,3,2,3


In [2]:
# Count the distinct values in every column of the raw table
# (helps spot ID-like columns and categorical columns that may need binning).
hpricing_df_og.nunique()

KeyboardInterrupt: 

In [None]:
# Remove the columns that are not used as model inputs: 'city', 'streetAddress' and 'homeType'.
# The original frame is left untouched; the result is bound to a new name.
# NOTE(review): the 'Unnamed: 0' column shown by head() above is NOT dropped here and therefore
# ends up as a model feature - it looks like a saved row index; confirm whether that is intended.
columns_to_drop = ['city', 'streetAddress', 'homeType']
hpricing_df = hpricing_df_og.drop(columns=columns_to_drop)
hpricing_df.head()

In [None]:
# Sanity check: list the remaining columns to confirm the dropped ones are gone
hpricing_df.columns

In [None]:
# Count how many listings fall in each zip code; these counts are used for binning
zipcode_column = hpricing_df["zipcode"]
zip_counts = zipcode_column.value_counts()
zip_counts

In [None]:
# Bin rare zip codes: every zip code with fewer than 200 listings is grouped into "Other".
zip_min_count = 200
zipcodes_to_replace = list(zip_counts[zip_counts < zip_min_count].index)

# Replace in the dataframe with one vectorised pass (no per-zip loop, and no loop
# variable shadowing the builtin `zip`).
# The kept zip codes are cast to str so the column is always treated as categorical by
# pd.get_dummies, even when no zip code falls below the cutoff. The resulting dummy
# column names ("zipcode_78660", "zipcode_Other", ...) are the same as before.
is_rare_zip = hpricing_df['zipcode'].isin(zipcodes_to_replace)
hpricing_df['zipcode'] = hpricing_df['zipcode'].astype(str).where(~is_rare_zip, "Other")

# Making sure the binning was successful
hpricing_df['zipcode'].value_counts()

In [None]:
#price_counts = hpricing_df.latestPrice.value_counts()
#price_counts

In [None]:
# Inspect the dtype of every column to see which ones are still non-numeric
hpricing_df.dtypes

In [None]:
# One-hot encode the remaining categorical (object) columns - i.e. the binned 'zipcode'.
# The has* flag columns are booleans, which pd.get_dummies leaves unchanged;
# dtype=float only sets the dtype of the newly created dummy columns.
hpricing_df = pd.get_dummies(data=hpricing_df, dtype=float)
hpricing_df

In [None]:
# List the columns of the frame that feeds the feature/target split below
hpricing_df.columns

In [None]:
# Split our preprocessed data into our features and target arrays.
# Target: the sale price column.
y = hpricing_df['latestPrice'].values

# Features: every other column, cast to float explicitly. The frame mixes boolean flag
# columns with integer/float columns, and calling .values on such a frame yields an
# object-dtype array; a plain float matrix is what the scaler and the network need.
X = hpricing_df.drop(columns='latestPrice').astype(float).values

# Split the preprocessed data into a training and testing dataset
# (fixed random_state so the split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Standardise the features. The scaler statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the train and test matrices
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Define the model - a deep neural net that regresses the sale price.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the feature matrix
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 80
hidden_nodes_layer2 = 30

nn = tf.keras.models.Sequential()

# Explicit input specification (replaces the deprecated `input_dim` layer argument)
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer: a single linear unit, the usual choice for regression.
# A ReLU output has zero gradient whenever its pre-activation is negative,
# which can leave the network stuck predicting 0.
nn.add(tf.keras.layers.Dense(units=1, activation="linear"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimiser minimising mean squared error (mse),
# while also tracking mean absolute error (mae) and mse as metrics.
nn.compile(
    optimizer="adam",
    loss="mse",
    metrics=["mae", "mse"],
)

In [None]:
# Fit the network on the scaled training split for 500 epochs;
# the returned History object is kept in fit_model.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=500,
)

In [None]:
# Score the trained network on the held-out test split.
# evaluate() returns the loss followed by the compiled metrics, so index 0 is the
# MSE loss and index 1 is the MAE metric.
errors = nn.evaluate(X_test_scaled, y_test, verbose=2)
test_mse, test_mae = errors[0], errors[1]
print(f"MSE: {test_mse}, MAE: {test_mae}")

In [None]:
# Spot-check: predict the price for one training row (row index 1).
# Slicing with 1:2 keeps the input a 2-D NumPy array of shape (1, n_features); a nested
# Python list can be misread by Keras as a list of separate model inputs.
nn.predict(X_train_scaled[1:2])


In [None]:
# Persist the trained network as an HDF5 file
model_path = "../Resources/nn/neural_network.h5"
nn.save(model_path)

In [None]:
# Export the fitted StandardScaler to a compressed joblib file so the same scaling can be
# re-applied wherever the saved model is used.
# (`dump` and `Path` are imported in the imports cell at the top of the notebook.)
scaler_path = '../Resources/nn/std_scaler.bin'

# joblib.dump does not create missing directories, so make sure the target folder exists
Path(scaler_path).parent.mkdir(parents=True, exist_ok=True)
dump(scaler, scaler_path, compress=True)

## Predict prices for original table

In [None]:
# Scale the full feature matrix with the already-fitted scaler and predict a price for every row
X_scaled = scaler.transform(X)
raw_predictions = nn.predict(X_scaled)

# Collapse the model output to a flat 1-D array of predicted prices
predict_price_nn = raw_predictions.flatten()
predict_price_nn

In [None]:
# Build a comparison table: predicted price, actual price, signed error (actual - predicted)
# and the signed error as a percentage of the actual price, rounded to 2 decimals.
hpricing_extra_nn = (
    pd.DataFrame({"predict_price_nn": predict_price_nn})
    .assign(latestPrice=hpricing_df_og["latestPrice"])
    .assign(difference_nn=lambda table: table["latestPrice"] - table["predict_price_nn"])
    .assign(
        percent_change_nn=lambda table: (
            table["difference_nn"] * 100 / table["latestPrice"]
        ).round(2)
    )
)
hpricing_extra_nn.head(10)

In [None]:
# Remove the actual price so only the prediction-derived columns remain.
# Re-binding the name (instead of inplace=True) with errors="ignore" keeps this cell
# safe to re-run: a second run is a no-op rather than a KeyError.
hpricing_extra_nn = hpricing_extra_nn.drop(columns=["latestPrice"], errors="ignore")
hpricing_extra_nn.head(5)

In [None]:
# Write the prediction table to CSV without the row index
predictions_csv_path = "../Resources/austin_extra_nn.csv"
hpricing_extra_nn.to_csv(predictions_csv_path, index=False)

In [None]:
# Column-wise mean of the prediction table.
# NOTE(review): difference_nn and percent_change_nn are signed, so positive and negative
# errors cancel in these means - they indicate bias, not typical error size.
hpricing_extra_nn.mean()