In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Load the raw listings table; the path is relative to the working directory.
dataset = pd.read_csv("realtor-data.zip.csv")

In [None]:
# Preview the first rows of the raw table.
dataset.head()

In [None]:
# Number of missing values per column, largest first.
dataset.isna().sum().sort_values(ascending=False)

In [None]:
# Restrict the table to the five modelling columns and relabel them in one pass.
# The dict order fixes the resulting column order.
column_labels = {
    "bed": "Bedrooms",
    "bath": "Bathrooms",
    "house_size": "Square Feet",
    "acre_lot": "Acres",
    "price": "Housing Price",
}
dataset = dataset[list(column_labels)].rename(columns=column_labels)
dataset.head()

In [None]:
dataset = dataset.drop(dataset[dataset["Housing Price"].isnull()].index)

In [None]:
dataset = dataset[~(dataset.isna().sum(axis=1) >= 2)]

In [None]:
((dataset.isna().sum() / len(dataset)) * 100).sort_values(ascending=False)

In [None]:
dataset = dataset.drop(dataset[dataset['Bedrooms'].isnull()].index, axis=0)
dataset = dataset.drop(dataset[dataset['Bathrooms'].isnull()].index, axis=0)

In [None]:
# Horizontal box plots of every column on one figure, to eyeball outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=dataset, orient="h", ax=ax);

In [None]:
# Upper fence for the price column: third quartile plus 1.5 interquartile ranges.
Q1, Q3 = np.percentile(dataset["Housing Price"], [25, 75])
IQR = Q3 - Q1

upper_outlier_threshold = Q3 + 1.5 * IQR
upper_outlier_threshold

In [None]:
len(dataset[dataset["Housing Price"]>upper_outlier_threshold]) / len(dataset["Housing Price"].index) * 100

In [None]:
dataset = dataset.drop(dataset[dataset["Housing Price"]>upper_outlier_threshold].index)

In [None]:
# Price distribution after the rows above the upper fence were removed.
sns.boxplot(data=dataset, x="Housing Price")

In [None]:
# Lot-size distribution, before any filtering on this column.
sns.boxplot(data=dataset, x="Acres")

In [None]:
len(dataset[dataset["Acres"] > 2.12]) / len(dataset["Acres"].index) * 100

In [None]:
dataset = dataset.drop(dataset[dataset["Acres"] > 2.12].index, axis=0)

In [None]:
sns.boxplot(data=dataset, x="Acres")

In [None]:
sns.boxplot(data=dataset, x="Square Feet")

In [None]:
len(dataset[dataset["Square Feet"] >= 4098.5]) / len(dataset["Square Feet"].index) * 100

In [None]:
dataset = dataset.drop(dataset[dataset["Square Feet"] >= 4098.5].index, axis=0)

In [None]:
sns.boxplot(data=dataset, x="Square Feet")

In [None]:
# Column dtypes and non-null counts after the outlier filtering.
dataset.info()

In [None]:
# Remaining missing values per column.
dataset.isnull().sum()

In [None]:
# The same, expressed as a percentage of the remaining rows, largest first.
((dataset.isna().sum() / len(dataset)) * 100).sort_values(ascending=False)

In [None]:
# NOTE(review): these two statements repeat an earlier cell verbatim, and the
# cells in between only remove rows, so this looks like a no-op — confirm and
# consider deleting.
dataset = dataset.drop(dataset[dataset['Bedrooms'].isnull()].index, axis=0)
dataset = dataset.drop(dataset[dataset['Bathrooms'].isnull()].index, axis=0)

In [None]:
# Missing values per column, largest first.
dataset.isna().sum().sort_values(ascending=False)

In [None]:
new_dataset = dataset[dataset["Square Feet"].notna()]

In [None]:
filled_dataset = new_dataset.dropna(subset=["Acres"])
missing_dataset = dataset[dataset["Acres"].isna()]
predict_dataset = missing_dataset.drop("Acres", axis=1)

In [None]:
scaler = StandardScaler()
scaler.fit(filled_dataset.drop("Acres", axis=1))
predict_dataset = scaler.transform(predict_dataset)

In [None]:
X = scaler.transform(filled_dataset.drop("Acres", axis=1))
y = filled_dataset["Acres"]

In [None]:
KNN = KNeighborsRegressor()

n_list = list(range(1,10))
param_grid_knn = {"n_neighbors": n_list}

In [None]:
grid_knn = GridSearchCV(KNN, param_grid_knn)

In [None]:
grid_knn.fit(X, y)

In [None]:
grid_knn.best_estimator_.get_params()

In [None]:
preds = grid_knn.predict(predict_dataset)

In [None]:
dataset.loc[dataset["Acres"].isna(), "Acres"] = preds

In [None]:
((dataset.isna().sum() / len(dataset)) * 100).sort_values(ascending=False)

In [None]:
filled_dataset = dataset.dropna(subset=["Square Feet"])
missing_dataset = dataset[dataset["Square Feet"].isna()]
predict_dataset = missing_dataset.drop("Square Feet", axis=1)

In [None]:
scaler = StandardScaler()
scaler.fit(filled_dataset.drop("Square Feet", axis=1))

predict_dataset = scaler.transform(predict_dataset)

In [None]:
X = scaler.transform(filled_dataset.drop("Square Feet", axis=1))
y = filled_dataset["Square Feet"]

In [None]:
KNN = KNeighborsRegressor()

n_list = list(range(1,10))
param_grid_knn = {"n_neighbors": n_list}

grid_knn = GridSearchCV(KNN, param_grid_knn)

In [None]:
grid_knn.fit(X, y)

In [None]:
grid_knn.best_estimator_.get_params()

In [None]:
preds = grid_knn.predict(predict_dataset)

In [None]:
dataset.loc[dataset["Square Feet"].isna(), "Square Feet"] = preds

In [None]:
# Column dtypes and non-null counts after both imputation passes.
dataset.info()

In [None]:
# Pairwise scatter plots of all columns.
sns.pairplot(dataset)

In [None]:
# Correlation matrix of all columns, annotated with the coefficients.
# The title is set first; the heatmap then draws onto the current axes.
plt.title("Correlation Heatmap")
sns.heatmap(dataset.corr(), annot=True, linewidths=0.5)

In [None]:
# Number of listings per bathroom count; tick labels rotated for readability.
sns.countplot(data=dataset, x="Bathrooms")
plt.xticks(rotation=90);

In [None]:
# Number of listings per bedroom count.
sns.countplot(data=dataset, x="Bedrooms")
plt.xticks(rotation=90);

In [None]:
# Living-area histogram, 50 bins.
sns.histplot(data=dataset, x="Square Feet", bins=50)

In [None]:
# Price histogram, 50 bins.
sns.histplot(data=dataset, x="Housing Price", bins=50)

In [None]:
le_bathrooms = LabelEncoder()
combined_data = pd.concat([dataset['Bathrooms']], axis=0)
le_bathrooms.fit(combined_data.astype(str))
dataset["Bathrooms"].unique()

In [None]:
le_bedrooms = LabelEncoder()
combined_data = pd.concat([dataset['Bedrooms']], axis=0)
le_bedrooms.fit(combined_data.astype(str))
dataset["Bedrooms"].unique()

In [None]:
X = dataset.drop("Housing Price", axis=1)
y = dataset["Housing Price"]

In [None]:
linear_reg = LinearRegression()
linear_reg.fit(X, y.values)

In [None]:
y_pred = linear_reg.predict(X)

In [None]:
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y.values)

In [None]:
y_pred = dec_tree_reg.predict(X)

In [None]:
error = (mean_absolute_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X, y.values)

In [None]:
y_pred = random_forest_reg.predict(X)

In [None]:
error = (mean_absolute_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(X, y.values)

In [None]:
regressor = gs.best_estimator_

regressor.fit(X, y.values)
y_pred = regressor.predict(X)
error = (mean_absolute_error(y, y_pred))
print("${:,.02f}".format(error))

In [None]:
X

In [None]:
X = np.array([["3", "3", "2000", "0.2"]])
X

In [None]:
X[:, 0] = le_bathrooms.fit_transform(X[:,0])
X[:, 1] = le_bedrooms.fit_transform(X[:,1])
X = X.astype(float)
X
print(X.shape) 

In [None]:
# Predict the price of the sample row held in `X`. The full `dataset` frame must
# not be passed here: it still contains the "Housing Price" target column, so it
# has one more column than the model was fitted on. Input validation is left on
# so a shape or dtype mismatch fails loudly.
y_pred = regressor.predict(X)
y_pred

In [None]:
import pickle

In [None]:
data = {"model": regressor, "le_bathrooms": le_bathrooms, "le_bedrooms": le_bedrooms}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [None]:
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
le_bathrooms = data["le_bathrooms"]
le_bedrooms = data["le_bedrooms"]

In [None]:
y_pred = regressor_loaded.predict(X)
y_pred