In [61]:
# Import dependencies
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
import requests
from prophet import Prophet

In [62]:
# Load the three housing CSVs from the local Resources folder.
# Files are read in the order listed: Ames, Boston, NYC.
ames_df, boston_df, nyc_df = map(
    pd.read_csv,
    (
        'Resources/ames_housing.csv',
        'Resources/boston_housing.csv',
        'Resources/nyc_housing.csv',
    ),
)

In [63]:
# Tag every row with the dataset it came from, so the source is still
# identifiable after the frames are stacked together.
for _frame, _label in ((ames_df, 'Ames'), (boston_df, 'Boston'), (nyc_df, 'NYC')):
    _frame['Location'] = _label


In [64]:
# Stack the three frames row-wise (Ames, then NYC, then Boston) and
# renumber the index from 0; columns missing from a frame become NaN.
combined_df = pd.concat(
    (ames_df, nyc_df, boston_df),
    axis=0,
    ignore_index=True,
)

In [65]:
# Preview the first five rows of the stacked frame
combined_df.head(n=5)

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,1.0,526301100.0,20.0,RL,141.0,31770.0,Pave,,IR1,Lvl,...,,,,,,,,,,
1,2.0,526350040.0,20.0,RH,80.0,11622.0,Pave,,Reg,Lvl,...,,,,,,,,,,
2,3.0,526351010.0,20.0,RL,81.0,14267.0,Pave,,IR1,Lvl,...,,,,,,,,,,
3,4.0,526353030.0,20.0,RL,93.0,11160.0,Pave,,Reg,Lvl,...,,,,,,,,,,
4,5.0,527105010.0,60.0,RL,74.0,13830.0,Pave,,IR1,Lvl,...,,,,,,,,,,


In [66]:
# Count the NaN entries in each column of the combined frame
missing_values = combined_df.isna().sum(axis=0)

missing_values

Order           5307
PID             5307
MS SubClass     5307
MS Zoning       5307
Lot Frontage    5797
                ... 
tax             7731
ptratio         7731
b               7731
lstat           7731
medv            7731
Length: 114, dtype: int64

In [67]:
# Express each column's missing count as a percentage of the total row count
missing_percentage = missing_values.div(len(combined_df)).mul(100)

missing_percentage

Order           64.428797
PID             64.428797
MS SubClass     64.428797
MS Zoning       64.428797
Lot Frontage    70.377565
                  ...    
tax             93.856987
ptratio         93.856987
b               93.856987
lstat           93.856987
medv            93.856987
Length: 114, dtype: float64

In [68]:
# Split the column labels by dtype: int64/float64 columns are treated as
# numerical, object-dtype columns as categorical.
numerical_columns = combined_df.select_dtypes(include=('int64', 'float64')).columns
categorical_columns = combined_df.select_dtypes(include='object').columns

In [69]:
# Impute missing values in numerical columns with the column mean.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on `combined_df[column]`: that chained in-place call
# operates on a column selection, raises a FutureWarning in pandas 2.x, and
# does not update `combined_df` at all once Copy-on-Write is enabled.
for column in numerical_columns:
    combined_df[column] = combined_df[column].fillna(combined_df[column].mean())

In [70]:
# Impute missing values in categorical columns with the most frequent value.
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on `combined_df[column]`: that chained in-place call
# operates on a column selection, raises a FutureWarning in pandas 2.x, and
# does not update `combined_df` at all once Copy-on-Write is enabled.
for column in categorical_columns:
    column_modes = combined_df[column].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # there is nothing to impute with, so leave such a column untouched.
    if column_modes.empty:
        continue
    combined_df[column] = combined_df[column].fillna(column_modes.iloc[0])