<a href="https://colab.research.google.com/github/leyli16/HousingPricePrediction/blob/main/final_project_merged.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Import Packages

In [133]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import os

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

# USA Real Estate Data Wrangling, Cleaning, and EDA

## Loading in USA Real Estate Dataset


In [134]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ahmedshahriarsakib/usa-real-estate-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/usa-real-estate-dataset


In [135]:
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['realtor-data.zip.csv']


In [136]:
file_path = os.path.join(path, 'realtor-data.zip.csv')
df_raw = pd.read_csv(file_path)

In [137]:
df_raw.head(10)

Unnamed: 0,brokered_by,status,price,bed,bath,acre_lot,street,city,state,zip_code,house_size,prev_sold_date
0,103378.0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,601.0,920.0,
1,52707.0,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,601.0,1527.0,
2,103379.0,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,795.0,748.0,
3,31239.0,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,731.0,1800.0,
4,34632.0,for_sale,65000.0,6.0,2.0,0.05,331151.0,Mayaguez,Puerto Rico,680.0,,
5,103378.0,for_sale,179000.0,4.0,3.0,0.46,1850806.0,San Sebastian,Puerto Rico,612.0,2520.0,
6,1205.0,for_sale,50000.0,3.0,1.0,0.2,1298094.0,Ciales,Puerto Rico,639.0,2040.0,
7,50739.0,for_sale,71600.0,3.0,2.0,0.08,1048466.0,Ponce,Puerto Rico,731.0,1050.0,
8,81909.0,for_sale,100000.0,2.0,1.0,0.09,734904.0,Ponce,Puerto Rico,730.0,1092.0,
9,65672.0,for_sale,300000.0,5.0,3.0,7.46,1946226.0,Las Marias,Puerto Rico,670.0,5403.0,


In [138]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   brokered_by     float64
 1   status          object 
 2   price           float64
 3   bed             float64
 4   bath            float64
 5   acre_lot        float64
 6   street          float64
 7   city            object 
 8   state           object 
 9   zip_code        float64
 10  house_size      float64
 11  prev_sold_date  object 
dtypes: float64(8), object(4)
memory usage: 203.8+ MB


In [139]:
df_raw.describe()

Unnamed: 0,brokered_by,price,bed,bath,acre_lot,street,zip_code,house_size
count,2221849.0,2224841.0,1745065.0,1714611.0,1900793.0,2215516.0,2226083.0,1657898.0
mean,52939.89,524195.5,3.275841,2.49644,15.22303,1012325.0,52186.68,2714.471
std,30642.75,2138893.0,1.567274,1.652573,762.8238,583763.5,28954.08,808163.5
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0
25%,23861.0,165000.0,3.0,2.0,0.15,506312.8,29617.0,1300.0
50%,52884.0,325000.0,3.0,2.0,0.26,1012766.0,48382.0,1760.0
75%,79183.0,550000.0,4.0,3.0,0.98,1521173.0,78070.0,2413.0
max,110142.0,2147484000.0,473.0,830.0,100000.0,2001357.0,99999.0,1040400000.0


In [140]:
df_raw.describe(include = 'object')

Unnamed: 0,status,city,state,prev_sold_date
count,2226382,2224975,2226374,1492085
unique,3,20098,55,14954
top,for_sale,Houston,Florida,2022-03-31
freq,1389306,23862,249432,17171


## Cleaning up USA Real Estate data

### Duplicates checking

In [141]:
df_raw.duplicated().sum()

np.int64(0)

### Column Filtering

In [142]:
# Keep only the necessary columns
df_req_cols = df_raw [['price', 'bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code', 'house_size', 'status']]

### Drop nulls

In [143]:
total_missing = df_req_cols.isna().sum()*100/len(df_req_cols)
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
price,0.069215
bed,21.618797
bath,22.986666
acre_lot,14.62413
city,0.063197
state,0.000359
zip_code,0.01343
house_size,25.533983
status,0.0


In [144]:
# Drop nulls, reset and drop the index
df_nadropped = df_req_cols.dropna().reset_index(drop=True)
df_nadropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360347 entries, 0 to 1360346
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   price       1360347 non-null  float64
 1   bed         1360347 non-null  float64
 2   bath        1360347 non-null  float64
 3   acre_lot    1360347 non-null  float64
 4   city        1360347 non-null  object 
 5   state       1360347 non-null  object 
 6   zip_code    1360347 non-null  float64
 7   house_size  1360347 non-null  float64
 8   status      1360347 non-null  object 
dtypes: float64(6), object(3)
memory usage: 93.4+ MB


In [145]:
total_missing = df_nadropped.isna().sum()*100/len(df_nadropped)
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
price,0.0
bed,0.0
bath,0.0
acre_lot,0.0
city,0.0
state,0.0
zip_code,0.0
house_size,0.0
status,0.0


### Data type casting

In [146]:
# cast date types
df_typed = df_nadropped.astype({'price': 'float', 'bed': 'int', 'bath': 'int', 'acre_lot': 'float', 'city':'str', 'state':'str', 'zip_code':'int', 'house_size': 'float'})

In [147]:
df_typed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1360347 entries, 0 to 1360346
Data columns (total 9 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   price       1360347 non-null  float64
 1   bed         1360347 non-null  int64  
 2   bath        1360347 non-null  int64  
 3   acre_lot    1360347 non-null  float64
 4   city        1360347 non-null  object 
 5   state       1360347 non-null  object 
 6   zip_code    1360347 non-null  int64  
 7   house_size  1360347 non-null  float64
 8   status      1360347 non-null  object 
dtypes: float64(3), int64(3), object(3)
memory usage: 93.4+ MB


### Filter listings with sold status only and add price_per_sqft column

In [148]:
# Filter rows with status = sold
df_sold = df_typed[df_typed['status'] == 'sold'].drop(columns=['status'])

In [149]:
df_sold.head()

Unnamed: 0,price,bed,bath,acre_lot,city,state,zip_code,house_size
750602,524900.0,3,2,0.09,Aguada,Puerto Rico,602,2200.0
750603,90000.0,3,2,0.08,Aguadilla,Puerto Rico,603,1421.0
750604,22500.0,2,1,0.32,Anasco,Puerto Rico,610,850.0
750605,168000.0,6,4,0.05,Anasco,Puerto Rico,610,3422.0
750606,200000.0,3,1,4.98,Arecibo,Puerto Rico,612,580.0


In [150]:
df_sold.info()

<class 'pandas.core.frame.DataFrame'>
Index: 609745 entries, 750602 to 1360346
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   price       609745 non-null  float64
 1   bed         609745 non-null  int64  
 2   bath        609745 non-null  int64  
 3   acre_lot    609745 non-null  float64
 4   city        609745 non-null  object 
 5   state       609745 non-null  object 
 6   zip_code    609745 non-null  int64  
 7   house_size  609745 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 41.9+ MB


### Remove outliers for USA Real Estate data

In [288]:
# IQR Method to remove outliers
# Define function for IQR filtering
def remove_outliers_iqr(df, cols):
    for col in cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]
    return df
columns = ['price', 'bed', 'bath', 'acre_lot', 'house_size']
# Apply to the columns
df_sold_outlier_removed = remove_outliers_iqr(df_sold, columns)

In [None]:
# Effect of removing outliers
def boxplot_compare(df_original, df_filtered, cols):
    fig, axes = plt.subplots(len(cols), 2, figsize=(12, 4 * len(cols)))
    for i, col in enumerate(cols):
        sns.boxplot(x=df_original[col], ax=axes[i][0])
        axes[i][0].set_title(f'Original: {col}')
        sns.boxplot(x=df_filtered[col], ax=axes[i][1])
        axes[i][1].set_title(f'Filtered: {col}')
    plt.tight_layout()
    plt.show()

boxplot_compare(df_sold, df_sold_outlier_removed, columns)

In [153]:
df_sold_outlier_removed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 449027 entries, 750602 to 1360345
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   price       449027 non-null  float64
 1   bed         449027 non-null  int64  
 2   bath        449027 non-null  int64  
 3   acre_lot    449027 non-null  float64
 4   city        449027 non-null  object 
 5   state       449027 non-null  object 
 6   zip_code    449027 non-null  int64  
 7   house_size  449027 non-null  float64
dtypes: float64(3), int64(3), object(2)
memory usage: 30.8+ MB


### USA Real Estate data cleaned

In [154]:
# save the cleaned df to df_real_estate
df_real_estate = df_sold_outlier_removed

In [291]:
df_real_estate.describe()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size
count,449027.0,449027.0,449027.0,449027.0,449027.0,449027.0
mean,382151.8,3.173598,2.247671,0.200624,57593.322001,1731.338002
std,218244.8,0.751535,0.772791,0.12621,30128.125107,585.856441
min,1.0,2.0,1.0,0.0,602.0,100.0
25%,224900.0,3.0,2.0,0.12,30339.0,1286.0
50%,339000.0,3.0,2.0,0.17,60631.0,1638.0
75%,498543.5,4.0,3.0,0.25,85388.0,2094.0
max,1117000.0,5.0,4.0,0.67,99403.0,3418.0


## Distribution of house price by number of beds

In [None]:
sns.boxplot(x='bed', y='price', data=df_real_estate)
plt.title('Price by Number of Bedrooms')
plt.show()

In [None]:
# Distributation of house price by number of beds
fig = px.histogram(df_real_estate, x="price", color="bed", nbins=20)
fig.update_layout(title="Distribution of House Price by Bed", xaxis_title="Price", yaxis_title="Count")
fig.show()

## Distribution of house price by number of bathrooms

In [None]:
sns.boxplot(x='bath', y='price', data=df_real_estate)
plt.title('Price by Number of Bathrooms')
plt.show()

In [None]:
# Distribution of house price by number of baths
fig = px.histogram(df_real_estate, x="price", color="bath", nbins=20)
fig.update_layout(title="Distribution of House Price by Bath", xaxis_title="Price", yaxis_title="Count")
fig.show()

## Distribution of house price by state

In [None]:
fig = px.bar(df_real_estate.groupby(["state"])['price'].mean().reset_index(),
             x='state', y='price',width=700)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'},
                 title="Distribution of house price by state.")
fig.update_xaxes(type="category",tickangle=-70)
fig.show()

In [None]:
# Sort states by median price
state_order = df_real_estate.groupby('state')['price'].median().sort_values(ascending=False).index.tolist()

# Plot with custom sort order
fig = px.box(df_real_estate, x='state', y='price', category_orders={'state': state_order},
             points=False, title='House Price Distribution by State (Sorted)',
             width=600, height=400)
fig.update_layout(xaxis_title='State', yaxis_title='Price', xaxis_tickangle=-45)
fig.show()

## Loading in Zip code to County Dataset

In [161]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("danofer/zipcodes-county-fips-crosswalk")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/zipcodes-county-fips-crosswalk


In [162]:
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['ZIP-COUNTY-FIPS_2017-06.csv']


In [163]:
file_path = os.path.join(path, 'ZIP-COUNTY-FIPS_2017-06.csv')
zip_county_df_raw = pd.read_csv(file_path)

In [164]:
zip_county_df_raw.head(10)

Unnamed: 0,ZIP,COUNTYNAME,STATE,STCOUNTYFP,CLASSFP
0,36003,Autauga County,AL,1001,H1
1,36006,Autauga County,AL,1001,H1
2,36067,Autauga County,AL,1001,H1
3,36066,Autauga County,AL,1001,H1
4,36703,Autauga County,AL,1001,H1
5,36701,Autauga County,AL,1001,H1
6,36091,Autauga County,AL,1001,H1
7,36051,Autauga County,AL,1001,H1
8,36068,Autauga County,AL,1001,H1
9,36008,Autauga County,AL,1001,H1


In [165]:
zip_county_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52889 entries, 0 to 52888
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ZIP         52889 non-null  int64 
 1   COUNTYNAME  52889 non-null  object
 2   STATE       52889 non-null  object
 3   STCOUNTYFP  52889 non-null  int64 
 4   CLASSFP     52889 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.0+ MB


## Cleaning up Zip Code to County Dataset

### Duplicates and null checking

In [166]:
zip_county_df_raw.duplicated().sum()

np.int64(0)

In [167]:
total_missing = zip_county_df_raw.isna().sum() * 100 / len(zip_county_df_raw)
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
ZIP,0.0
COUNTYNAME,0.0
STATE,0.0
STCOUNTYFP,0.0
CLASSFP,0.0


### Data type casting

In [168]:
# cast date types
zip_county_df = zip_county_df_raw.astype({'ZIP': 'int', 'COUNTYNAME':'str', 'STATE':'str'})

In [169]:
zip_county_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52889 entries, 0 to 52888
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ZIP         52889 non-null  int64 
 1   COUNTYNAME  52889 non-null  object
 2   STATE       52889 non-null  object
 3   STCOUNTYFP  52889 non-null  int64 
 4   CLASSFP     52889 non-null  object
dtypes: int64(2), object(3)
memory usage: 2.0+ MB


### Column renaming

In [170]:
zip_county_df.rename(columns={'ZIP': 'zip_code', 'COUNTYNAME':'county', 'STATE':'state'}, inplace=True)

### Column Filtering

In [171]:
zip_county_df.drop(columns=['STCOUNTYFP', 'CLASSFP'], inplace=True)

### Row Filtering

In [172]:
# count unique counties per ZIP code
zip_counts = zip_county_df['zip_code'].value_counts()

# Step 1: Find ZIP codes with only one county
zips_one_county = zip_counts[zip_counts == 1].index
# Step 2: Filter out rows with those ZIP codes
zip_county_df = zip_county_df[zip_county_df['zip_code'].isin(zips_one_county)]

In [173]:
zip_county_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29102 entries, 0 to 52888
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   zip_code  29102 non-null  int64 
 1   county    29102 non-null  object
 2   state     29102 non-null  object
dtypes: int64(1), object(2)
memory usage: 909.4+ KB


## Create new column "county" in USA Real Estate Dataset

In [174]:
# Merge df with zip_county_df to get county info
real_estate_df = pd.merge(df_real_estate, zip_county_df, on='zip_code', how='inner')
real_estate_df.drop(columns=['state_x'], inplace=True)
real_estate_df.rename(columns={'state_y':'state'},inplace=True)
real_estate_df.head(10)

Unnamed: 0,price,bed,bath,acre_lot,city,zip_code,house_size,county,state
0,22500.0,2,1,0.32,Anasco,610,850.0,Anasco Municipio,PR
1,120000.0,4,2,0.18,Sabana Grande,637,1188.0,Sabana Grande Municipio,PR
2,133000.0,3,1,0.6,Hatillo,659,1049.0,Hatillo Municipio,PR
3,220000.0,5,3,0.11,Isabela,662,2992.0,Isabela Municipio,PR
4,175000.0,3,3,0.45,Las Marias,670,3090.0,Las Marias Municipio,PR
5,149000.0,3,2,0.28,Mayaguez,682,2000.0,Mayaguez Municipio,PR
6,130000.0,3,2,0.07,Ponce,716,1024.0,Ponce Municipio,PR
7,140000.0,4,3,0.1,Naguabo,718,2222.0,Naguabo Municipio,PR
8,68900.0,3,1,0.09,Naguabo,718,900.0,Naguabo Municipio,PR
9,57900.0,3,1,0.24,Ponce,731,845.0,Ponce Municipio,PR


In [175]:
real_estate_df.shape

(338278, 9)

In [176]:
real_estate_df.describe()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size
count,338278.0,338278.0,338278.0,338278.0,338278.0,338278.0
mean,398517.9,3.162272,2.236291,0.192204,59252.218084,1715.579503
std,222808.8,0.754238,0.768575,0.12118,30860.386727,580.194862
min,1.0,2.0,1.0,0.0,610.0,100.0
25%,235000.0,3.0,2.0,0.12,32505.0,1275.0
50%,350000.0,3.0,2.0,0.17,63301.0,1622.0
75%,520000.0,4.0,3.0,0.24,89439.0,2071.0
max,1117000.0,5.0,4.0,0.67,99402.0,3418.0


In [177]:
real_estate_df.describe(include = 'object')

Unnamed: 0,city,county,state
count,338278,338278,338278
unique,6438,1147,50
top,Houston,Maricopa County,CA
freq,8452,13488,59085


## Housing features correlation heatmap

In [None]:
# Correlation heatmap
plt.figure(figsize=(12,8))
sns.heatmap(real_estate_df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

## Loading in Zip Code Demographics

In [179]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("erdi28/zip-codes-demographics")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/zip-codes-demographics


In [180]:
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['zip_code_demographics.csv']


In [181]:
file_path = os.path.join(path, 'zip_code_demographics.csv')
zip_code_demographics_df_raw = pd.read_csv(file_path)

In [182]:
zip_code_demographics_df_raw.head(10)

Unnamed: 0,zip,lat,lng,city,state_id,state_name,population,density,county_name,po_box,dist_highway,dist2_large_airport,dist2_medium_airport,dist_to_shore,number_of_business,adjusted_gross_income,total_income_amount,number_of_returns
0,1001,42.06262,-72.62521,Agawam,MA,Massachusetts,16088,550.1,Hampden,0,1.387035,106.145765,12.946212,93.049251,438.0,598807,604769.0,9320
1,1002,42.37633,-72.46462,Amherst,MA,Massachusetts,27323,198.1,Hampshire,0,14.438177,112.264368,21.080079,133.370144,571.0,989558,1005796.0,9880
2,1005,42.42117,-72.10655,Barre,MA,Massachusetts,4947,44.2,Worcester,0,16.788339,90.664964,25.547718,97.639881,97.0,164207,166054.0,2490
3,1007,42.28163,-72.40009,Belchertown,MA,Massachusetts,15304,107.7,Hampshire,0,13.663839,101.552921,14.762395,114.406034,217.0,647074,654739.0,7970
4,1008,42.18234,-72.95819,Blandford,MA,Massachusetts,1171,7.4,Hampden,0,2.593655,136.548797,20.17795,107.466779,18.0,47826,48241.0,660
5,1010,42.12904,-72.20597,Brimfield,MA,Massachusetts,3703,40.6,Hampden,0,4.737271,78.6455,28.049262,94.928024,73.0,155666,157677.0,1980
6,1011,42.30233,-72.96448,Chester,MA,Massachusetts,1332,15.7,Hampden,0,8.058693,142.414627,26.041908,128.56317,13.0,38223,38553.0,630
7,1012,42.38495,-72.84675,Chesterfield,MA,Massachusetts,503,16.1,Hampshire,0,18.331096,138.381684,27.466664,137.693285,10.0,24826,25102.0,410
8,1013,42.16059,-72.60788,Chicopee,MA,Massachusetts,22709,1549.1,Hampden,0,0.062286,109.114246,7.075893,104.576258,317.0,516431,520174.0,11270
9,1020,42.17618,-72.56538,Chicopee,MA,Massachusetts,30704,951.4,Hampden,0,0.92514,106.769095,3.205533,105.789421,550.0,828125,834614.0,15760


In [183]:
zip_code_demographics_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33704 entries, 0 to 33703
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   zip                    33704 non-null  int64  
 1   lat                    33704 non-null  float64
 2   lng                    33704 non-null  float64
 3   city                   33704 non-null  object 
 4   state_id               33704 non-null  object 
 5   state_name             33704 non-null  object 
 6   population             33704 non-null  int64  
 7   density                33704 non-null  float64
 8   county_name            33704 non-null  object 
 9   po_box                 33704 non-null  int64  
 10  dist_highway           33704 non-null  float64
 11  dist2_large_airport    33704 non-null  float64
 12  dist2_medium_airport   33704 non-null  float64
 13  dist_to_shore          33704 non-null  float64
 14  number_of_business     33624 non-null  float64
 15  ad

In [184]:
zip_code_demographics_df_raw.describe()

Unnamed: 0,zip,lat,lng,population,density,po_box,dist_highway,dist2_large_airport,dist2_medium_airport,dist_to_shore,number_of_business,adjusted_gross_income,total_income_amount,number_of_returns
count,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33704.0,33624.0,33704.0,33704.0,33704.0
mean,49842.607821,38.888195,-91.051366,11287.959738,528.704068,0.179949,29.291294,158.648048,40.854624,304.798736,260.69492,425134.1,429894.3,5341.873665
std,27451.111442,5.228075,15.460095,15200.216997,2219.277025,0.384151,174.104869,166.207042,80.224956,298.948686,432.112814,756151.9,764441.6,7273.61366
min,1001.0,-14.21984,-176.62962,0.0,0.0,0.0,0.000194,0.112768,0.134153,0.011449,1.0,577.0,577.0,90.0
25%,27052.75,35.427243,-97.255342,1253.75,8.2,0.0,3.05042,59.434074,19.513217,61.286177,19.0,32475.0,32879.25,580.0
50%,49780.5,39.50297,-88.22482,4099.0,29.0,0.0,11.541912,125.957938,35.131017,207.328858,74.0,112866.5,114036.5,1880.0
75%,72210.25,42.11,-80.289333,16113.25,238.825,0.0,30.046597,209.476362,55.231087,471.857289,334.0,505229.0,509227.0,7560.0
max,99929.0,71.27434,144.87637,130352.0,68424.5,1.0,10767.713382,6127.454722,5101.697518,1336.551268,7263.0,15224670.0,15277980.0,61920.0


In [185]:
zip_code_demographics_df_raw.describe(include = 'object')

Unnamed: 0,city,state_id,state_name,county_name
count,33704,33704,33704,33704
unique,17551,51,51,1800
top,Houston,TX,Texas,Washington
freq,106,1990,1990,401


## Cleaning Zip Code Demographics Data

In [186]:
zip_code_demographics_df_raw.duplicated().sum()

np.int64(0)

In [187]:
# Keep only the necessary columns
zip_code_demographics_df = zip_code_demographics_df_raw [['zip', 'population', 'density', 'dist_highway', 'dist2_large_airport', 'dist_to_shore', 'number_of_business', 'adjusted_gross_income']]

In [188]:
total_missing = zip_code_demographics_df.isna().sum() * 100 / len(zip_code_demographics_df)
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
zip,0.0
population,0.0
density,0.0
dist_highway,0.0
dist2_large_airport,0.0
dist_to_shore,0.0
number_of_business,0.237361
adjusted_gross_income,0.0


In [189]:
# Drop nulls
zip_code_demographics_df = zip_code_demographics_df.dropna()
# Reset and drop the index
zip_code_demographics_df = zip_code_demographics_df.reset_index(drop=True)

In [190]:
total_missing = zip_code_demographics_df.isna().sum() * 100 / len(zip_code_demographics_df)
print('Percentage Missing Value %')
total_missing

Percentage Missing Value %


Unnamed: 0,0
zip,0.0
population,0.0
density,0.0
dist_highway,0.0
dist2_large_airport,0.0
dist_to_shore,0.0
number_of_business,0.0
adjusted_gross_income,0.0


In [191]:
# cast date types
zip_code_demographics_df = zip_code_demographics_df.astype({'zip': 'int', 'population': 'float', 'density': 'float', 'dist_highway': 'float', 'dist2_large_airport': 'float', 'dist_to_shore': 'float', 'number_of_business': 'float', 'adjusted_gross_income': 'float'})

In [192]:
zip_code_demographics_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33624 entries, 0 to 33623
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   zip                    33624 non-null  int64  
 1   population             33624 non-null  float64
 2   density                33624 non-null  float64
 3   dist_highway           33624 non-null  float64
 4   dist2_large_airport    33624 non-null  float64
 5   dist_to_shore          33624 non-null  float64
 6   number_of_business     33624 non-null  float64
 7   adjusted_gross_income  33624 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 2.1 MB


In [193]:
zip_code_demographics_df.rename(columns={'zip': 'zip_code',
                                         'density':'population_density',
                                         'dist_highway':'dist_to_highway',
                                         'dist2_large_airport':'dist_to_airport'}, inplace=True)

## Merge real_estate_df with zip_code_demographics_df to add the zip code related features

In [194]:
real_estate_features_df = pd.merge(real_estate_df, zip_code_demographics_df, on='zip_code', how='inner')
real_estate_features_df.head(10)

Unnamed: 0,price,bed,bath,acre_lot,city,zip_code,house_size,county,state,population,population_density,dist_to_highway,dist_to_airport,dist_to_shore,number_of_business,adjusted_gross_income
0,215000.0,3,2,0.19,Chicopee,1020,1828.0,Hampden County,MA,30704.0,951.4,0.92514,106.769095,105.789421,550.0,828125.0
1,269900.0,2,1,0.46,South Hadley,1075,1312.0,Hampshire County,MA,18051.0,393.3,2.910859,112.181953,115.181126,297.0,697896.0
2,314900.0,5,2,0.28,Chicopee,1013,2219.0,Hampden County,MA,22709.0,1549.1,0.062286,109.114246,104.576258,317.0,516431.0
3,244999.0,4,1,0.23,Springfield,1104,1285.0,Hampden County,MA,23226.0,1714.0,0.645911,104.785728,101.063511,528.0,453300.0
4,399900.0,3,3,0.35,Easthampton,1027,1380.0,Hampshire County,MA,17708.0,168.5,9.873599,125.949999,124.367513,372.0,671249.0
5,198000.0,2,1,0.14,Holyoke,1040,768.0,Hampden County,MA,38480.0,702.0,0.598474,114.170265,111.173366,907.0,832429.0
6,299900.0,3,1,0.4,Easthampton,1027,1424.0,Hampshire County,MA,17708.0,168.5,9.873599,125.949999,124.367513,372.0,671249.0
7,279000.0,3,1,0.23,Ludlow,1056,1164.0,Hampden County,MA,21050.0,298.9,2.608102,99.921771,105.057502,412.0,747067.0
8,319900.0,3,1,0.41,Northampton,1062,864.0,Hampshire County,MA,9835.0,217.6,3.630387,122.235452,124.761768,191.0,639133.0
9,264900.0,3,2,0.4,Holyoke,1040,1776.0,Hampden County,MA,38480.0,702.0,0.598474,114.170265,111.173366,907.0,832429.0


In [195]:
real_estate_features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338148 entries, 0 to 338147
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   price                  338148 non-null  float64
 1   bed                    338148 non-null  int64  
 2   bath                   338148 non-null  int64  
 3   acre_lot               338148 non-null  float64
 4   city                   338148 non-null  object 
 5   zip_code               338148 non-null  int64  
 6   house_size             338148 non-null  float64
 7   county                 338148 non-null  object 
 8   state                  338148 non-null  object 
 9   population             338148 non-null  float64
 10  population_density     338148 non-null  float64
 11  dist_to_highway        338148 non-null  float64
 12  dist_to_airport        338148 non-null  float64
 13  dist_to_shore          338148 non-null  float64
 14  number_of_business     338148 non-nu

In [196]:
real_estate_features_df.describe()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,population,population_density,dist_to_highway,dist_to_airport,dist_to_shore,number_of_business,adjusted_gross_income
count,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0,338148.0
mean,398516.0,3.162272,2.236263,0.192209,59261.692052,1715.619681,35183.865905,1205.825783,6.331339,78.152613,187.680797,678.352136,1261296.0
std,222789.2,0.754211,0.768534,0.121162,30852.169233,580.190767,19180.403151,1350.849182,12.922188,103.741284,239.755605,468.52814,909238.0
min,1.0,2.0,1.0,0.0,1001.0,100.0,0.0,0.0,0.000283,0.42442,0.011449,1.0,2925.0
25%,235000.0,3.0,2.0,0.12,32506.0,1275.0,21670.0,293.5,1.328307,19.921847,20.759234,335.0,643827.0
50%,350000.0,3.0,2.0,0.17,63301.0,1622.0,33096.0,867.8,2.829334,36.951682,73.724405,586.0,1074226.0
75%,520000.0,4.0,3.0,0.24,89439.0,2071.0,45792.0,1695.2,5.967586,92.247183,314.774612,916.0,1632569.0
max,1117000.0,5.0,4.0,0.67,99402.0,3418.0,130352.0,49565.3,299.940968,994.951376,1330.442647,7263.0,12075230.0


In [197]:
real_estate_features_df.describe(include = 'object')

Unnamed: 0,city,county,state
count,338148,338148,338148
unique,6399,1126,48
top,Houston,Maricopa County,CA
freq,8452,13488,59048


## Influence of regional features on house price



*   The price ranges from $0 to $1.117M since we removed the outliers of the USA real estate dataframe.
*   There are vertical lines observed in the graph since houses from the same zip_code have the same regional features; we merged regional features dataframe with the main dataframe on zip_code.



In [None]:
import plotly.express as px

features = ['population', 'population_density', 'dist_to_highway', 'dist_to_airport',
            'dist_to_shore', 'number_of_business', 'adjusted_gross_income']

for feature in features:
    fig = px.scatter(real_estate_features_df, x=feature, y='price', trendline='ols',
                     title=f'House Price vs {feature.replace("_", " ").title()}',
                     width=600, height=400)
    fig.show()

*   Features that have positive correlation with price:
  *   Population (more populated the area, higher the price)
  *   Population Density (more densly populated the area, higher the price)
  *   Number of bussinesses (more bussiness in the area, higher the price)
  *   Adjusted gross income (wealthier the region, higher the price)

*   Features that have negative correlation with price:
  *   Distance to highway (more accessible to highway, higher the price)
  *   Distance to airport (more accessible to airport, higher the price)
  *   Distance to shore (closer to the shore, higher the house price)







Import Library


In [199]:
import pandas as pd
import numpy as np
import seaborn as sns
from string import ascii_letters
import matplotlib.pyplot as plt
import datetime as dt
import requests
from lxml import html
import math
import json
import re
import os

# 1. County Demographics Data Cleaning and Wrangling


## 1.1 Loading County Demogrphics Dataset

In [244]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("glozab/county-level-us-demographic-data-1990-2020")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/county-level-us-demographic-data-1990-2020


In [245]:
files = os.listdir(path)
print("Files in dataset:", files)

Files in dataset: ['county_demographics.csv']


In [246]:
file_path = os.path.join(path, 'county_demographics.csv')
county_demographics_df_raw = pd.read_csv(file_path)

print(county_demographics_df_raw.head())

   year  fips  population  w_population  b_population  o_population  \
0  1990  1025       27289         15579         11643            35   
1  1990  1031       40293         32869          6950           160   
2  1990  1041       13598         10068          3516            11   
3  1990  1053       35526         24377         10050          1045   
4  1990  1101      209537        119702         87856           415   

   nh_population  hi_population  na_population  male_population  ...  \
0          27196             93              0            13052  ...   
1          39831            462              0            19673  ...   
2          13576             22              0             6421  ...   
3          35378            148              0            17454  ...   
4         207933           1604              0            98854  ...   

   age9_population_ratio  age10_population_ratio  age11_population_ratio  \
0                0.06263                 0.05552                

## 1.2 County Demographics Dataset Cleaning

In [247]:
county_demographics_df_raw.dtypes

Unnamed: 0,0
year,int64
fips,int64
population,int64
w_population,int64
b_population,int64
o_population,int64
nh_population,int64
hi_population,int64
na_population,int64
male_population,int64


In [248]:
county_demographics_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 57 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     97287 non-null  int64  
 1   fips                     97287 non-null  int64  
 2   population               97287 non-null  int64  
 3   w_population             97287 non-null  int64  
 4   b_population             97287 non-null  int64  
 5   o_population             97287 non-null  int64  
 6   nh_population            97287 non-null  int64  
 7   hi_population            97287 non-null  int64  
 8   na_population            97287 non-null  int64  
 9   male_population          97287 non-null  int64  
 10  female_population        97287 non-null  int64  
 11  age0_population          97287 non-null  int64  
 12  age1_population          97287 non-null  int64  
 13  age2_population          97287 non-null  int64  
 14  age3_population       

In [249]:
#check how many missing values are in each column
county_demographics_df_raw.isna().sum().sort_values(ascending=False)

Unnamed: 0,0
year,0
fips,0
population,0
w_population,0
b_population,0
o_population,0
nh_population,0
hi_population,0
na_population,0
male_population,0


In [250]:
# keep the necessary columns
county_demographics_df_raw = county_demographics_df_raw.drop(columns = ['population', 'female_population', 'male_population', 'w_population', 'b_population', 'o_population', 'nh_population', 'hi_population', 'na_population',
                                           'age0_population', 'age1_population', 'age2_population', 'age3_population', 'age4_population', 'age5_population',
                                           'age6_population', 'age7_population', 'age8_population', 'age9_population', 'age10_population', 'age11_population',
                                           'age12_population', 'age13_population', 'age14_population', 'age15_population', 'age16_population', 'age17_population',
                                           'age18_population'])

In [251]:
county_demographics_df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     97287 non-null  int64  
 1   fips                     97287 non-null  int64  
 2   w_population_ratio       97287 non-null  float64
 3   b_population_ratio       97287 non-null  float64
 4   o_population_ratio       97287 non-null  float64
 5   nh_population_ratio      97287 non-null  float64
 6   hi_population_ratio      97287 non-null  float64
 7   na_population_ratio      97287 non-null  float64
 8   male_population_ratio    97287 non-null  float64
 9   female_population_ratio  97287 non-null  float64
 10  age0_population_ratio    97287 non-null  float64
 11  age1_population_ratio    97287 non-null  float64
 12  age2_population_ratio    97287 non-null  float64
 13  age3_population_ratio    97287 non-null  float64
 14  age4_population_ratio 

### Rename Columns for Consistency

In [252]:
county_demographics_df_raw.columns = county_demographics_df_raw.columns.str.strip().str.lower().str.replace(' ', '_')

In [253]:
# Rename age groups
county_demographics_df_renamed = county_demographics_df_raw
county_demographics_df_renamed.rename(
    columns=
     {
    'w_population_ratio': 'caucasion_population_ratio',
    'b_population_ratio': 'African-American_population_ratio',
    'o_population_ratio': 'other_race_population_ratio',
    'nh_population_ratio':'non-Hispanic_population_ratio',
    'hi_population_ratio':'hispanic_population_ratio',
    'na_population_ratio':'other_origin_population_ratio',
    'age0_population_ratio':'age_less_1_population_ratio',
    'age1_population_ratio':'age_1_to_4_population_ratio',
    'age2_population_ratio':'age_5_to_9_population_ratio',
    'age3_population_ratio':'age_10_to_14_population_ratio',
    'age4_population_ratio':'age_15_to_19_population_ratio',
    'age5_population_ratio':'age_20_to_24_population_ratio',
    'age6_population_ratio':'age_25_to_29_population_ratio',
    'age7_population_ratio':'age_30_to_34_population_ratio',
    'age8_population_ratio':'age_35_to_39_population_ratio',
    'age9_population_ratio':'age_40_to_44_population_ratio',
    'age10_population_ratio':'age_45_to_49_population_ratio',
    'age11_population_ratio':'age_50_to_54_population_ratio',
    'age12_population_ratio':'age_55_to_59_population_ratio',
    'age13_population_ratio':'age_60_to_64_population_ratio',
    'age14_population_ratio':'age_65_to_69_population_ratio',
    'age15_population_ratio':'age_70_to_74_population_ratio',
    'age16_population_ratio':'age_75_to_79_population_ratio',
    'age17_population_ratio':'age_80_to_84_population_ratio',
    'age18_population_ratio':'age_85_and_over_population_ratio'
    }, inplace=True)

In [254]:
county_demographics_df_renamed.reset_index(drop=True, inplace=True)

In [255]:
county_demographics_df_renamed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               97287 non-null  int64  
 1   fips                               97287 non-null  int64  
 2   caucasion_population_ratio         97287 non-null  float64
 3   African-American_population_ratio  97287 non-null  float64
 4   other_race_population_ratio        97287 non-null  float64
 5   non-Hispanic_population_ratio      97287 non-null  float64
 6   hispanic_population_ratio          97287 non-null  float64
 7   other_origin_population_ratio      97287 non-null  float64
 8   male_population_ratio              97287 non-null  float64
 9   female_population_ratio            97287 non-null  float64
 10  age_less_1_population_ratio        97287 non-null  float64
 11  age_1_to_4_population_ratio        97287 non-null  flo

Grouping age group

In [256]:
county_demographics_df_renamed['age_0_to_19_population_ratio'] = county_demographics_df_renamed[[
    'age_less_1_population_ratio',
    'age_1_to_4_population_ratio',
    'age_5_to_9_population_ratio',
    'age_10_to_14_population_ratio',
    'age_15_to_19_population_ratio']].sum(axis=1)

In [257]:
county_demographics_df_renamed['age_20_to_39_population_ratio'] = county_demographics_df_renamed[[
    'age_20_to_24_population_ratio',
    'age_25_to_29_population_ratio',
    'age_30_to_34_population_ratio',
    'age_35_to_39_population_ratio']].sum(axis=1)

In [258]:
county_demographics_df_renamed['age_40_to_59_population_ratio'] = county_demographics_df_renamed[[
    'age_40_to_44_population_ratio',
    'age_45_to_49_population_ratio',
    'age_50_to_54_population_ratio',
    'age_55_to_59_population_ratio']].sum(axis=1)

In [259]:
county_demographics_df_renamed['age_60_to_79_population_ratio'] = county_demographics_df_renamed[[
    'age_60_to_64_population_ratio',
    'age_65_to_69_population_ratio',
    'age_70_to_74_population_ratio',
    'age_75_to_79_population_ratio']].sum(axis=1)

In [260]:
county_demographics_df_renamed['age_80_and_over_population_ratio'] = county_demographics_df_renamed[[
    'age_80_to_84_population_ratio',
    'age_85_and_over_population_ratio']].sum(axis=1)

In [261]:
county_demographics_grouped_df = county_demographics_df_renamed[['year',
                                                                 'fips',
                                                                 'caucasion_population_ratio',
                                                                 'African-American_population_ratio',
                                                                 'other_race_population_ratio',
                                                                 'non-Hispanic_population_ratio',
                                                                 'hispanic_population_ratio',
                                                                 'male_population_ratio',
                                                                 'female_population_ratio',
                                                                 'age_0_to_19_population_ratio',
                                                                 'age_20_to_39_population_ratio',
                                                                 'age_40_to_59_population_ratio',
                                                                 'age_60_to_79_population_ratio',
                                                                 'age_80_and_over_population_ratio'
                                                                ]]

In [262]:
county_demographics_grouped_df.head()

Unnamed: 0,year,fips,caucasion_population_ratio,African-American_population_ratio,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio
0,1990,1025,0.57089,0.42666,0.00128,0.99659,0.00341,0.47829,0.52171,0.33075,0.28386,0.21295,0.14083,0.03162
1,1990,1031,0.81575,0.17249,0.00397,0.98853,0.01147,0.48825,0.51175,0.28436,0.30236,0.2356,0.14849,0.02918
2,1990,1041,0.7404,0.25857,0.00081,0.99838,0.00162,0.4722,0.5278,0.29334,0.2621,0.21275,0.18886,0.04295
3,1990,1053,0.68617,0.28289,0.02942,0.99583,0.00417,0.4913,0.5087,0.29682,0.29283,0.22459,0.15266,0.0331
4,1990,1101,0.57127,0.41929,0.00198,0.99235,0.00765,0.47177,0.52823,0.30814,0.3297,0.20688,0.12899,0.02628


In [263]:
county_demographics_grouped_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 14 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               97287 non-null  int64  
 1   fips                               97287 non-null  int64  
 2   caucasion_population_ratio         97287 non-null  float64
 3   African-American_population_ratio  97287 non-null  float64
 4   other_race_population_ratio        97287 non-null  float64
 5   non-Hispanic_population_ratio      97287 non-null  float64
 6   hispanic_population_ratio          97287 non-null  float64
 7   male_population_ratio              97287 non-null  float64
 8   female_population_ratio            97287 non-null  float64
 9   age_0_to_19_population_ratio       97287 non-null  float64
 10  age_20_to_39_population_ratio      97287 non-null  float64
 11  age_40_to_59_population_ratio      97287 non-null  flo

 ### Handle Duplicates

In [264]:
county_demographics_df = county_demographics_grouped_df.drop_duplicates()
county_demographics_df = county_demographics_df.dropna()
county_demographics_df.head()

Unnamed: 0,year,fips,caucasion_population_ratio,African-American_population_ratio,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio
0,1990,1025,0.57089,0.42666,0.00128,0.99659,0.00341,0.47829,0.52171,0.33075,0.28386,0.21295,0.14083,0.03162
1,1990,1031,0.81575,0.17249,0.00397,0.98853,0.01147,0.48825,0.51175,0.28436,0.30236,0.2356,0.14849,0.02918
2,1990,1041,0.7404,0.25857,0.00081,0.99838,0.00162,0.4722,0.5278,0.29334,0.2621,0.21275,0.18886,0.04295
3,1990,1053,0.68617,0.28289,0.02942,0.99583,0.00417,0.4913,0.5087,0.29682,0.29283,0.22459,0.15266,0.0331
4,1990,1101,0.57127,0.41929,0.00198,0.99235,0.00765,0.47177,0.52823,0.30814,0.3297,0.20688,0.12899,0.02628


In [265]:
county_demographics_df.describe()

Unnamed: 0,year,fips,caucasion_population_ratio,African-American_population_ratio,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio
count,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0,97287.0
mean,2005.009436,30412.85995,0.873184,0.092728,0.02173,0.928248,0.071752,0.497252,0.502748,0.271176,0.252334,0.261087,0.173507,0.041897
std,8.942825,15147.842995,0.162454,0.146201,0.079129,0.12735,0.12735,0.020924,0.020924,0.039014,0.047064,0.032236,0.04607,0.015786
min,1990.0,1001.0,0.02682,0.0,0.0,0.02217,0.0,0.42627,0.26315,0.0,0.09195,0.07102,0.01692,0.0
25%,1997.0,18183.0,0.831025,0.00692,0.00234,0.933745,0.01084,0.48657,0.49795,0.24812,0.22149,0.23975,0.14261,0.03137
50%,2005.0,29179.0,0.94457,0.0233,0.00452,0.97516,0.02484,0.49381,0.50619,0.27019,0.24814,0.2629,0.1689,0.03975
75%,2013.0,45083.0,0.97938,0.10738,0.01052,0.98916,0.066255,0.50205,0.51343,0.292315,0.27856,0.28274,0.19993,0.04988
max,2020.0,99999.0,1.0,0.87153,0.97169,1.0,0.97783,0.73685,0.57373,0.50684,0.5761,0.53763,0.55615,0.25287


## 1.2 Loading Fips Code DataSet




In [266]:
%%capture
!pip instsall kaggle
from google.colab import drive
drive.mount('/content/drive')

!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/

In [267]:
# Loading county df data from google drive
fips_df = pd.read_csv('/content/drive/MyDrive/CIS 5450 Group Project/data_sets/state_and_county_fips_master.csv')

In [268]:
fips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3195 entries, 0 to 3194
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   fips    3195 non-null   int64 
 1   name    3195 non-null   object
 2   state   3143 non-null   object
dtypes: int64(1), object(2)
memory usage: 75.0+ KB


In [269]:
fips_df.head()

Unnamed: 0,fips,name,state
0,0,UNITED STATES,
1,1000,ALABAMA,
2,1001,Autauga County,AL
3,1003,Baldwin County,AL
4,1005,Barbour County,AL


In [270]:
fips_df.describe()

Unnamed: 0,fips
count,3195.0
mean,30358.079499
std,15181.225584
min,0.0
25%,18172.0
50%,29175.0
75%,45076.0
max,56045.0


## 1.3 Merging the county_demographics_df with fips_df

In [271]:
county_demographics_merged_df = county_demographics_df.merge(
    fips_df,
    on='fips',
    how='left'
)
county_demographics_merged_df.head()

Unnamed: 0,year,fips,caucasion_population_ratio,African-American_population_ratio,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio,name,state
0,1990,1025,0.57089,0.42666,0.00128,0.99659,0.00341,0.47829,0.52171,0.33075,0.28386,0.21295,0.14083,0.03162,Clarke County,AL
1,1990,1031,0.81575,0.17249,0.00397,0.98853,0.01147,0.48825,0.51175,0.28436,0.30236,0.2356,0.14849,0.02918,Coffee County,AL
2,1990,1041,0.7404,0.25857,0.00081,0.99838,0.00162,0.4722,0.5278,0.29334,0.2621,0.21275,0.18886,0.04295,Crenshaw County,AL
3,1990,1053,0.68617,0.28289,0.02942,0.99583,0.00417,0.4913,0.5087,0.29682,0.29283,0.22459,0.15266,0.0331,Escambia County,AL
4,1990,1101,0.57127,0.41929,0.00198,0.99235,0.00765,0.47177,0.52823,0.30814,0.3297,0.20688,0.12899,0.02628,Montgomery County,AL


### Sanity Check After Merging

In [272]:
print(county_demographics_merged_df.columns)         # See all columns
print(county_demographics_merged_df[['fips', 'name']].head())  # Confirm correct matches
print(county_demographics_merged_df.isna().sum())    # Check for unmatched FIPS

Index(['year', 'fips', 'caucasion_population_ratio',
       'African-American_population_ratio', 'other_race_population_ratio',
       'non-Hispanic_population_ratio', 'hispanic_population_ratio',
       'male_population_ratio', 'female_population_ratio',
       'age_0_to_19_population_ratio', 'age_20_to_39_population_ratio',
       'age_40_to_59_population_ratio', 'age_60_to_79_population_ratio',
       'age_80_and_over_population_ratio', 'name', 'state'],
      dtype='object')
   fips               name
0  1025      Clarke County
1  1031      Coffee County
2  1041    Crenshaw County
3  1053    Escambia County
4  1101  Montgomery County
year                                   0
fips                                   0
caucasion_population_ratio             0
African-American_population_ratio      0
other_race_population_ratio            0
non-Hispanic_population_ratio          0
hispanic_population_ratio              0
male_population_ratio                  0
female_population_ratio   

## 1.4 Cleaning the Merged Data

Checking for unmatched FIPS — that output tells us 174 rows in the county_demographics_df didn't find a match in fips_df, because name, state, and any other info from fips_df came back as NaN.

In [273]:
unmatched = county_demographics_merged_df[county_demographics_merged_df['name'].isna()]
print(unmatched['fips'].unique()[:10])

[ 2910  4910 51917  8911  2201  8912  8913  2280  2232  2010]


In [274]:
county_demographics_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97287 entries, 0 to 97286
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               97287 non-null  int64  
 1   fips                               97287 non-null  int64  
 2   caucasion_population_ratio         97287 non-null  float64
 3   African-American_population_ratio  97287 non-null  float64
 4   other_race_population_ratio        97287 non-null  float64
 5   non-Hispanic_population_ratio      97287 non-null  float64
 6   hispanic_population_ratio          97287 non-null  float64
 7   male_population_ratio              97287 non-null  float64
 8   female_population_ratio            97287 non-null  float64
 9   age_0_to_19_population_ratio       97287 non-null  float64
 10  age_20_to_39_population_ratio      97287 non-null  float64
 11  age_40_to_59_population_ratio      97287 non-null  flo

Drop the unmatched rows.

In [275]:
county_demographics_merged_df = county_demographics_merged_df[county_demographics_merged_df['name'].notna()].reset_index(drop=True)
county_demographics_merged_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97113 entries, 0 to 97112
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               97113 non-null  int64  
 1   fips                               97113 non-null  int64  
 2   caucasion_population_ratio         97113 non-null  float64
 3   African-American_population_ratio  97113 non-null  float64
 4   other_race_population_ratio        97113 non-null  float64
 5   non-Hispanic_population_ratio      97113 non-null  float64
 6   hispanic_population_ratio          97113 non-null  float64
 7   male_population_ratio              97113 non-null  float64
 8   female_population_ratio            97113 non-null  float64
 9   age_0_to_19_population_ratio       97113 non-null  float64
 10  age_20_to_39_population_ratio      97113 non-null  float64
 11  age_40_to_59_population_ratio      97113 non-null  flo

In [276]:
county_demographics_merged_df = county_demographics_merged_df.rename(columns={'name': 'county'})
county_demographics_merged_df = county_demographics_merged_df.dropna()
county_demographics_merged_df = county_demographics_merged_df.drop_duplicates()
county_demographics_merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97113 entries, 0 to 97112
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               97113 non-null  int64  
 1   fips                               97113 non-null  int64  
 2   caucasion_population_ratio         97113 non-null  float64
 3   African-American_population_ratio  97113 non-null  float64
 4   other_race_population_ratio        97113 non-null  float64
 5   non-Hispanic_population_ratio      97113 non-null  float64
 6   hispanic_population_ratio          97113 non-null  float64
 7   male_population_ratio              97113 non-null  float64
 8   female_population_ratio            97113 non-null  float64
 9   age_0_to_19_population_ratio       97113 non-null  float64
 10  age_20_to_39_population_ratio      97113 non-null  float64
 11  age_40_to_59_population_ratio      97113 non-null  flo

## Result of county demographics df from year 1990 to 2020

In [277]:
county_demographics_merged_df.head()

Unnamed: 0,year,fips,caucasion_population_ratio,African-American_population_ratio,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio,county,state
0,1990,1025,0.57089,0.42666,0.00128,0.99659,0.00341,0.47829,0.52171,0.33075,0.28386,0.21295,0.14083,0.03162,Clarke County,AL
1,1990,1031,0.81575,0.17249,0.00397,0.98853,0.01147,0.48825,0.51175,0.28436,0.30236,0.2356,0.14849,0.02918,Coffee County,AL
2,1990,1041,0.7404,0.25857,0.00081,0.99838,0.00162,0.4722,0.5278,0.29334,0.2621,0.21275,0.18886,0.04295,Crenshaw County,AL
3,1990,1053,0.68617,0.28289,0.02942,0.99583,0.00417,0.4913,0.5087,0.29682,0.29283,0.22459,0.15266,0.0331,Escambia County,AL
4,1990,1101,0.57127,0.41929,0.00198,0.99235,0.00765,0.47177,0.52823,0.30814,0.3297,0.20688,0.12899,0.02628,Montgomery County,AL


##Result of county demographics df for year 2020 alone, used for housing price static anaylsis

In [278]:
# filter demographics data for year of 2020 for static analysis portion
county_demographics_2020_df = county_demographics_merged_df[county_demographics_merged_df['year'] == 2020].reset_index(drop=True)
county_demographics_2020_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141 entries, 0 to 3140
Data columns (total 16 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   year                               3141 non-null   int64  
 1   fips                               3141 non-null   int64  
 2   caucasion_population_ratio         3141 non-null   float64
 3   African-American_population_ratio  3141 non-null   float64
 4   other_race_population_ratio        3141 non-null   float64
 5   non-Hispanic_population_ratio      3141 non-null   float64
 6   hispanic_population_ratio          3141 non-null   float64
 7   male_population_ratio              3141 non-null   float64
 8   female_population_ratio            3141 non-null   float64
 9   age_0_to_19_population_ratio       3141 non-null   float64
 10  age_20_to_39_population_ratio      3141 non-null   float64
 11  age_40_to_59_population_ratio      3141 non-null   float

## 1.5 Merge with real_estate_features_df to add the new features

In [279]:
# Merge with real_estate_features_df
real_estate_df = pd.merge(real_estate_features_df, county_demographics_2020_df, on=['county','state'], how='inner')
real_estate_df.head()

Unnamed: 0,price,bed,bath,acre_lot,city,zip_code,house_size,county,state,population,...,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio
0,215000.0,3,2,0.19,Chicopee,1020,1828.0,Hampden County,MA,30704.0,...,0.00926,0.73248,0.26752,0.48298,0.51702,0.24032,0.26673,0.24718,0.20154,0.04424
1,269900.0,2,1,0.46,South Hadley,1075,1312.0,Hampshire County,MA,18051.0,...,0.0038,0.93769,0.06231,0.46567,0.53433,0.22457,0.30568,0.21829,0.21032,0.04114
2,314900.0,5,2,0.28,Chicopee,1013,2219.0,Hampden County,MA,22709.0,...,0.00926,0.73248,0.26752,0.48298,0.51702,0.24032,0.26673,0.24718,0.20154,0.04424
3,244999.0,4,1,0.23,Springfield,1104,1285.0,Hampden County,MA,23226.0,...,0.00926,0.73248,0.26752,0.48298,0.51702,0.24032,0.26673,0.24718,0.20154,0.04424
4,399900.0,3,3,0.35,Easthampton,1027,1380.0,Hampshire County,MA,17708.0,...,0.0038,0.93769,0.06231,0.46567,0.53433,0.22457,0.30568,0.21829,0.21032,0.04114


In [280]:
# drop year and fips column for static analysis portion
real_estate_df = real_estate_df.drop(columns=['year', 'fips'])

In [281]:
real_estate_df.describe()

Unnamed: 0,price,bed,bath,acre_lot,zip_code,house_size,population,population_density,dist_to_highway,dist_to_airport,...,other_race_population_ratio,non-Hispanic_population_ratio,hispanic_population_ratio,male_population_ratio,female_population_ratio,age_0_to_19_population_ratio,age_20_to_39_population_ratio,age_40_to_59_population_ratio,age_60_to_79_population_ratio,age_80_and_over_population_ratio
count,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,...,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0,338095.0
mean,398538.5,3.162253,2.236244,0.192206,59259.443571,1715.598944,35187.501164,1205.986716,6.330128,78.094985,...,0.016081,0.77889,0.22111,0.492298,0.507702,0.246291,0.274084,0.249106,0.191121,0.039398
std,222794.6,0.754217,0.768542,0.12116,30852.260665,580.196261,19179.561332,1350.889366,12.922563,103.630478,...,0.024112,0.167254,0.167254,0.009854,0.009854,0.029514,0.035871,0.017721,0.043325,0.01316
min,1.0,2.0,1.0,0.0,1001.0,100.0,0.0,0.0,0.000283,0.42442,...,0.00112,0.04908,0.00748,0.45538,0.35768,0.07072,0.10871,0.13596,0.09234,0.01597
25%,235000.0,3.0,2.0,0.12,32506.0,1275.0,21686.0,293.5,1.327312,19.921847,...,0.0057,0.68483,0.0818,0.48745,0.50402,0.23229,0.25305,0.24158,0.16629,0.03158
50%,350000.0,3.0,2.0,0.17,63301.0,1622.0,33096.0,868.1,2.829334,36.951682,...,0.01072,0.82827,0.17173,0.4929,0.5071,0.24369,0.27712,0.24772,0.17886,0.03698
75%,520000.0,4.0,3.0,0.24,89441.0,2071.0,45792.0,1695.2,5.967247,92.14522,...,0.01929,0.9182,0.31517,0.49598,0.51255,0.26406,0.2975,0.26111,0.20965,0.04528
max,1117000.0,5.0,4.0,0.67,99402.0,3418.0,130352.0,49565.3,299.940968,994.951376,...,0.81104,0.99252,0.95092,0.64232,0.54462,0.39407,0.46711,0.31283,0.54643,0.13212


In [282]:
real_estate_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338095 entries, 0 to 338094
Data columns (total 28 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   price                              338095 non-null  float64
 1   bed                                338095 non-null  int64  
 2   bath                               338095 non-null  int64  
 3   acre_lot                           338095 non-null  float64
 4   city                               338095 non-null  object 
 5   zip_code                           338095 non-null  int64  
 6   house_size                         338095 non-null  float64
 7   county                             338095 non-null  object 
 8   state                              338095 non-null  object 
 9   population                         338095 non-null  float64
 10  population_density                 338095 non-null  float64
 11  dist_to_highway                    3380

### Correlation Heatmap

In [None]:
# Correlation heatmap
plt.figure(figsize=(28, 26))
sns.heatmap(real_estate_df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

Some observations:
* Price correlation highlights: For the first column of the heatmap (correlation with price):
All the features do not have |R| > 0.7, which suggests they do not have a strong correlation with housing price. However, we can observe some positive and negative correlation with housing price as follows:
* Notable positive correlation:
  * adjusted_gross_income: Highest positve correlation — richer areas → higher home prices
  * bed: More bedrooms → higher home prices
  * bath: More bathrooms → higher home prices
  * zip_code
  * population density: More densly populated area → higher home prices
  * number of bussiness: More number of bussinesses → higher home prices
  * age_20_to_39_population_ratio: More working age population → higher home prices
  * age_40_to_59_populatiom_ratio: More middle age population → higher home prices
* Notable Negative correlations:
  * dist_to_shore: Closer from coast → higher home prices
  * dist_to_airport: More accessible to highway → higher home prices
  * dist_to_highway: similar trend — accessibility affects price

Other features don't seem to be highly correlated with home prices


### Distribution of Normalized House Price by Dominant Age Group

In [284]:
df = real_estate_df.copy()

# Normalize price by price_per_sqft to get an estimate of home size
df['normalized_price'] = df['price'] / df['price_per_sqft']

# Define age group columns
age_group_cols = [
    'age_0_to_19_population_ratio',
    'age_20_to_39_population_ratio',
    'age_40_to_59_population_ratio',
    'age_60_to_79_population_ratio',
    'age_80_and_over_population_ratio'
]

age_group_labels = {
    'age_0_to_19_population_ratio': '0-19',
    'age_20_to_39_population_ratio': '20-39',
    'age_40_to_59_population_ratio': '40-59',
    'age_60_to_79_population_ratio': '60-79',
    'age_80_and_over_population_ratio': '80+'
}

# Create a dominant age group column
df['dominant_age_group'] = df[age_group_cols].idxmax(axis=1).map(age_group_labels)

# Plot histogram of normalized price (home size) by age group
plt.figure(figsize=(12, 6))
sns.histplot(data=df, x=df['price'], hue='dominant_age_group', element='step', stat='density', common_norm=False, bins=50)
plt.title('Distribution of Normalized House Price (Price per Sqft) by Dominant Age Group')
plt.xlabel('Normalized Price (Price per Sqft)')
plt.ylabel('Density')
plt.xlim(0, df['normalized_price'].quantile(0.99))  # Cut off extreme outliers
plt.tight_layout()
plt.show()


KeyError: 'price_per_sqft'

### Distribution of Price per Sqft by Dominant Racial Group

In [None]:
actual_race_df = real_estate_df.copy()
# Identify dominant racial group
race_columns = [
    'caucasion_population_ratio',
    'African-American_population_ratio',
    'hispanic_population_ratio',
    'other_race_population_ratio'
]

race_labels = {
    'caucasion_population_ratio': 'Caucasian',
    'African-American_population_ratio': 'African-American',
    'hispanic_population_ratio': 'Hispanic',
    'other_race_population_ratio': 'Other'
}

actual_race_df['dominant_race'] = actual_race_df[race_columns].idxmax(axis=1).map(race_labels)

# Plot histogram
plt.figure(figsize=(12, 6))
sns.histplot(
    data=actual_race_df,
    x='price_per_sqft',
    hue='dominant_race',
    element='step',
    stat='density',
    common_norm=False,
    bins=40
)
plt.title('Distribution of Price per Sqft by Dominant Racial Group')
plt.xlabel('Price per Sqft')
plt.ylabel('Density')
plt.xlim(0, actual_race_df['price_per_sqft'].quantile(0.99))
plt.tight_layout()
plt.show()

### Population Distribution by Racial Group

In [None]:
df = real_estate_df.copy()
# Normalize race ratios to sum to 1 per row (to simulate real proportions)
total_ratios = df[['caucasion_population_ratio', 'African-American_population_ratio',
                   'hispanic_population_ratio', 'other_race_population_ratio']].sum(axis=1)
df['caucasion_population_ratio'] /= total_ratios
df['African-American_population_ratio'] /= total_ratios
df['hispanic_population_ratio'] /= total_ratios
df['other_race_population_ratio'] /= total_ratios

# Calculate total population for each racial group
racial_population = pd.Series({
    'Caucasian': (df['population'] * df['caucasion_population_ratio']).sum(),
    'African-American': (df['population'] * df['African-American_population_ratio']).sum(),
    'Hispanic': (df['population'] * df['hispanic_population_ratio']).sum(),
    'Other': (df['population'] * df['other_race_population_ratio']).sum()
})

# Plot pie chart
plt.figure(figsize=(8, 8))
plt.pie(
    racial_population,
    labels=racial_population.index,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops={'edgecolor': 'white'}
)
plt.title('Population Distribution by Racial Group')
plt.tight_layout()
plt.show()

### Racial Population Distribution by State

In [None]:
df = real_estate_df.copy()

# Aggregate racial populations per state
race_cols = ['caucasion_population_ratio', 'African-American_population_ratio', 'hispanic_population_ratio', 'other_race_population_ratio']
racial_pops_by_state = df.groupby('state').apply(
    lambda x: pd.Series({
        'Caucasian': (x['population'] * x['caucasion_population_ratio']).sum(),
        'African-American': (x['population'] * x['African-American_population_ratio']).sum(),
        'Hispanic': (x['population'] * x['hispanic_population_ratio']).sum(),
        'Other': (x['population'] * x['other_race_population_ratio']).sum()
    })
)

# Plot stacked bar chart
ax = racial_pops_by_state.plot(kind='bar', stacked=True, figsize=(12, 6))
plt.title('Racial Population Distribution by State')
plt.ylabel('Total Population')
plt.xlabel('State')
plt.xticks(rotation=0)
plt.legend(title='Racial Group')
plt.tight_layout()
plt.show()


### Population Distribution by Age Group

In [None]:
# Calculate total population by age group across all states
total_population_by_age_group = pd.Series({
    '0-19': (df['population'] * df['age_0_to_19_population_ratio']).sum(),
    '20-39': (df['population'] * df['age_20_to_39_population_ratio']).sum(),
    '40-59': (df['population'] * df['age_40_to_59_population_ratio']).sum(),
    '60-79': (df['population'] * df['age_60_to_79_population_ratio']).sum(),
    '80+': (df['population'] * df['age_80_and_over_population_ratio']).sum()
})

# Plot as a pie chart
plt.figure(figsize=(8, 8))
plt.pie(
    total_population_by_age_group,
    labels=total_population_by_age_group.index,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops={'edgecolor': 'white'}
)
plt.title('Population Distribution by Age Group')
plt.tight_layout()
plt.show()

### Age Group Population Distribution by State

In [None]:
age_group_cols = [
    'age_0_to_19_population_ratio',
    'age_20_to_39_population_ratio',
    'age_40_to_59_population_ratio',
    'age_60_to_79_population_ratio',
    'age_80_and_over_population_ratio'
]

# Add to the simulated df
for col in age_group_cols:
    df[col] = np.random.rand(len(df))

# Normalize so each row sums to 1 across age groups
total_age_ratio = df[age_group_cols].sum(axis=1)
for col in age_group_cols:
    df[col] /= total_age_ratio

# Aggregate total population by age group and state
age_pops_by_state = df.groupby('state').apply(
    lambda x: pd.Series({
        '0-19': (x['population'] * x['age_0_to_19_population_ratio']).sum(),
        '20-39': (x['population'] * x['age_20_to_39_population_ratio']).sum(),
        '40-59': (x['population'] * x['age_40_to_59_population_ratio']).sum(),
        '60-79': (x['population'] * x['age_60_to_79_population_ratio']).sum(),
        '80+': (x['population'] * x['age_80_and_over_population_ratio']).sum()
    })
)

# Plot stacked bar chart
ax = age_pops_by_state.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='tab20')
plt.title('Age Group Population Distribution by State')
plt.ylabel('Total Population')
plt.xlabel('State')
plt.xticks(rotation=0)
plt.legend(title='Age Group')
plt.tight_layout()
plt.show()

# Modeling

## Logistic Regression Classification

In [293]:
# classify houses into 2 classes based on price, we define expensive houses as top 25%
real_estate_df['is_expensive'] = (real_estate_df['price'] > real_estate_df['price'].quantile(0.75)).astype(int)
# real_estate_df.describe()

In [297]:
# Drop target column and irrelevant columns from features
features = real_estate_df.drop(columns=['price', 'zip_code',
                                         'city', 'county', 'state', 'non-Hispanic_population_ratio', 'is_expensive'])
target = real_estate_df['is_expensive']

In [298]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338095 entries, 0 to 338094
Data columns (total 22 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   bed                                338095 non-null  int64  
 1   bath                               338095 non-null  int64  
 2   acre_lot                           338095 non-null  float64
 3   house_size                         338095 non-null  float64
 4   population                         338095 non-null  float64
 5   population_density                 338095 non-null  float64
 6   dist_to_highway                    338095 non-null  float64
 7   dist_to_airport                    338095 non-null  float64
 8   dist_to_shore                      338095 non-null  float64
 9   number_of_business                 338095 non-null  float64
 10  adjusted_gross_income              338095 non-null  float64
 11  caucasion_population_ratio         3380

In [299]:
# Use scikit-learn's train_test_split function to split data for regression
# into train and test sets. The split should be 80-20 meaning 80% for training
# and rest for testing.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state= seed)

In [300]:
# Initialize model with default parameters and fit it on the training set
log_reg = LogisticRegression()
# log_reg.fit(X_train_small, y_train_small)
log_reg.fit(X_train, y_train)

# Use the model to predict on the test set and save these predictions as `y_pred`
# y_pred = log_reg.predict(X_test_small)
y_pred = log_reg.predict(X_test)

# Find the accuracy and store the value in `log_acc`
# log_acc = accuracy_score(y_test_small, y_pred)
log_acc = accuracy_score(y_test, y_pred)



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [301]:
print(f"Accuracy: {log_acc:.4f}")

Accuracy: 0.7852


## Random Forest Classification

In [302]:
# Initialize model with default parameters and fit it on the training set
rf = RandomForestClassifier(class_weight='balanced', n_estimators=120, max_depth=30, random_state=42)
rf.fit(X_train, y_train)

# Use the model to predict on the test set and save these predictions as `y_pred`
y_pred = rf.predict(X_test)

# Find the accuracy and store the value in `rf_acc`
rf_acc = accuracy_score(y_test, y_pred)

# Compute the confusion matrix and save it to `rf_confusion`
rf_confusion = confusion_matrix(y_test, y_pred)

In [None]:
print(rf_confusion)

## Plot Feature Importances

In [None]:
# TO-DO: get feature importnce using built-in feature importance
feature_importance = rf.feature_importances_

# TO-DO: Get the top 10 and Create a DataFrame `feature_importance_df`
feature_importance_df = pd.DataFrame({'Feature': features.columns, 'Feature Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Feature Importance', ascending=False).head(10)
# TO-DO: Visualize the top 10 feature importance using a barplot
feature_importance_df.plot(kind='bar', x='Feature', y='Feature Importance', title='Top 10 Feature Importance')
plt.show()

##Neural Network, batch gradient descent

In [304]:
# Drop target column and irrelevant columns from features
features = real_estate_df[['adjusted_gross_income', 'bed', 'bath', 'zip_code',
                           'population_density', 'number_of_business',
                           'age_20_to_39_population_ratio',
                           'age_40_to_59_population_ratio']]
target = real_estate_df['price']

In [305]:
# Use scikit-learn's train_test_split function to split data for regression
# into train and test sets. The split should be 80-20 meaning 80% for training
# and rest for testing.
seed = 42
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state= seed)

In [None]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='sgd', hidden_layer_sizes=(5, 2), random_state=seed)
clf.fit(X_train, y_train)
# Use the model to predict on the test set and save these predictions as `y_pred`
# y_pred = log_reg.predict(X_test_small)
y_pred = clf.predict(X_test)

# Find the accuracy and store the value in `log_acc`
# log_acc = accuracy_score(y_test_small, y_pred)
log_acc = accuracy_score(y_test, y_pred)


## PCA to Reduce Dimensionality

In [None]:
features = [
    'bed', 'bath', 'acre_lot', 'house_size', 'population', 'density',
    'dist_highway', 'dist2_large_airport', 'dist_to_shore', 'number_of_business',
    'adjusted_gross_income', 'caucasion_population_ratio',
    'African-American_population_ratio', 'other_race_population_ratio',
    'non-Hispanic_population_ratio', 'hispanic_population_ratio',
    'male_population_ratio', 'female_population_ratio',
    'age_0_to_19_population_ratio', 'age_20_to_39_population_ratio',
    'age_40_to_59_population_ratio', 'age_60_to_79_population_ratio',
    'age_80_and_over_population_ratio'
]
X = real_estate_df[features].copy()

In [None]:
# Intermediate step to address fac that PCA is not scale-invariant
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Instantiate and Fit PCA
pca = PCA()
pca.fit(X_train_scaled)

In [None]:
# Save the explained variance ratios into variable called "explained_variance_ratios"
explained_variance_ratios = pca.explained_variance_ratio_

# Save the CUMULATIVE explained variance ratios into variable called "cum_evr"
cum_evr = np.cumsum(explained_variance_ratios)

In [None]:
# Plot the cumulative explained_variance_ratio against the number of components
# to decide the number of components we should keep
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(cum_evr) + 1), cum_evr, marker='o', linestyle='-')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance Ratio')
plt.title('Cumulative Explained Variance Ratio vs. Number of Components')
plt.grid(True)
plt.axhline(y=0.8, color='r', linestyle='--', label='80% Explained Variance Threshold')
plt.legend()
plt.show()

In [None]:
# Get transformed set of principal components on x_test
scaler = StandardScaler()
X_train_scaled2 = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 1. Refit and transform on training with parameter n (as deduced from the last step)
pca = PCA(n_components= 9)
X_train_pca = pca.fit_transform(X_train_scaled2)

# 2. Transform on Testing Set and store it as `X_test_pca`
X_test_pca = pca.transform(X_test)

# Regression

##Linear Regression (Unregularized)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

# Initialize model with default parameters and fit it on the training set

reg = LinearRegression()
reg.fit(X_train, y_train)


# Use the model to predict on the test set and save these predictions as `y_pred`
y_pred = reg.predict(X_test)

# Find the R-squared score and store the value in `lin_reg_score`
lin_reg_score = r2_score(y_test, y_pred)

##Ridge Regression

In [None]:
# Intermediate step to address scale-invariance
scaler = StandardScaler()
X_train_scaled3 = scaler.fit_transform(X_train)
X_test_scaled3 = scaler.transform(X_test)

In [None]:
# Import required libraries
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")


# Initialize model with alpha = 10 (keep other hyperparameters as default values) and fit it on the training set
alpha = 10
reg_ridge = Ridge(alpha=alpha)
reg_ridge.fit(X_train_scaled3, y_train)

# Use the model to predict on the test set and save these predictions as `y_pred`
y_pred = reg_ridge.predict(X_test_scaled3)

# Find the R-squared score and store the value in `ridge_score`
ridge_score = r2_score(y_test, y_pred)

##Random Forest Regression

In [None]:
# Import required libraries
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

# Initialize model with default parameters and fit it on the training set

reg_rf = RandomForestRegressor(random_state=42)
reg_rf.fit(X_train, y_train)

# Use the model to predict on the test set and save these predictions as `y_pred`
y_pred = reg_rf.predict(X_test)

# Find the R-squared score and store the value in `rfr_score`
rfr_score = r2_score(y_test, y_pred)

# K Means clustering

In [None]:
# Import the necessary libraries
from sklearn.cluster import KMeans
from collections import Counter

#   for k ranges from 2 to 10, fit on "features" to generate an elbow plot
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, n_init=5, random_state=0)
    kmeans.fit(features)
    wcss = kmeans.inertia_
    plt.scatter(k, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Elbow Plot')
plt.show()

In [None]:
# choose the best number of clusters (the elbow)
number_of_cluster = 5

## Refit with best number of clusters

In [None]:
# TO-DO: re-run the K-Means clustering with the best number of clusters, save the fitted model to `kmeans`
kmeans = KMeans(n_clusters= number_of_cluster, n_init= 5, random_state= 0)
kmeans.fit(features)