## 1. Import Libraries

In [2]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning libraries
import sklearn
import tensorflow as tf
import torch
import keras

# Deep learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import torch.nn as nn
import torch.optim as optim

# Model evaluation and metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Natural Language Processing (NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# For data preprocessing and feature engineering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# For handling large datasets
import h5py

## Goal: Predict the number of days a home will be on the market

## 1.1 Reading in data

In [10]:
# Load the Zillow sales CSV into the working DataFrame. The path is relative,
# so it is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="zillow_sales.csv")

## 2.1 Exploratory Data Analysis (EDA)

In [11]:
# Show the loaded frame. The saved output is pandas' truncated view: `...`
# marks the elided rows and columns.
# NOTE(review): the `Unnamed: 0` column in that output repeats the row index —
# presumably an index that was written into the CSV; confirm before relying on it.
df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2023-11-30,2023-12-31,2024-01-31,2024-02-29,2024-03-31,2024-04-30,2024-05-31,2024-06-30,2024-07-31,2024-08-31
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,212757.568407,...,492924.222260,493546.021535,494938.632954,497127.041953,499973.049734,502524.796784,503854.585074,504164.982162,503780.870153,504193.640887
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,139171.687163,...,586372.318321,591205.001331,592810.241565,595402.485152,600568.899512,608983.205398,616423.730722,621917.743481,626853.771270,632157.933927
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,103736.197118,...,279008.405418,278716.309332,279071.835326,279773.422935,280871.474377,281832.243548,282366.599919,282326.827673,282092.254904,281910.409452
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,147341.057200,...,452496.954526,448231.491948,445416.508893,443820.281564,446865.699151,451354.559837,455955.115425,456654.443964,456147.872232,455518.141753
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,102296.421002,...,272482.855443,272059.668098,272105.690894,272630.892401,273630.906832,274621.401490,275123.010114,274956.422913,274664.333677,274391.917865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26333,100030,39992,98934,zip,WA,WA,Kittitas,"Ellensburg, WA",Kittitas County,,...,319506.991851,318671.584909,318054.032759,317126.996927,317433.962137,319118.565658,320992.816725,321630.467119,321740.849397,321870.862879
26334,64277,39992,15731,zip,PA,PA,Coral,"Indiana, PA",Indiana County,,...,77451.795142,78534.465320,78991.888014,79206.956832,79571.373968,80111.915583,80824.553046,81921.162534,82143.012916,82423.763150
26335,92085,39992,77661,zip,TX,TX,,"Houston-The Woodlands-Sugar Land, TX",Chambers County,,...,199040.096718,195828.309118,194272.117507,192979.068181,192629.155075,192320.188260,191748.279642,191299.345239,190235.657412,188912.873188
26336,98183,39992,95419,zip,CA,CA,Camp Meeker,"Santa Rosa-Petaluma, CA",Sonoma County,89943.208807,...,459885.690573,458741.060995,455780.732380,455107.875762,456080.569376,459396.264074,458882.065913,454359.453609,447648.612683,442919.055157


In [5]:
# List the column labels of `df`.
# NOTE(review): this cell's saved output (In [5]) predates the current load
# (In [10]); it lists listing-metric columns such as `median_days_on_market`
# rather than the Zillow date columns displayed above — re-run to refresh.
df.columns

Index(['month_date_yyyymm', 'country', 'median_listing_price',
       'median_listing_price_mm', 'median_listing_price_yy',
       'active_listing_count', 'active_listing_count_mm',
       'active_listing_count_yy', 'median_days_on_market',
       'median_days_on_market_mm', 'median_days_on_market_yy',
       'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy',
       'price_increased_count', 'price_increased_count_mm',
       'price_increased_count_yy', 'price_reduced_count',
       'price_reduced_count_mm', 'price_reduced_count_yy',
       'pending_listing_count', 'pending_listing_count_mm',
       'pending_listing_count_yy', 'median_listing_price_per_square_foot',
       'median_listing_price_per_square_foot_mm',
       'median_listing_price_per_square_foot_yy', 'median_square_feet',
       'median_square_feet_mm', 'median_square_feet_yy',
       'average_listing_price', 'average_listing_price_mm',
       'average_listing_price_yy', 'total_listing_count',
       'tot

In [14]:
# Structural summary: (rows, columns), then the row count on its own line.
print(df.shape)
print(len(df))
# Total number of missing cells across the whole frame. Nulls are widespread,
# so dropping every row that contains one is not an option.
print(df.isna().to_numpy().sum())
# Remove exact duplicate rows (first occurrence kept), then report the dtype
# of each column.
df = df.drop_duplicates(keep="first")
print(df.dtypes)

(99, 39)
99
337
month_date_yyyymm                           object
country                                     object
median_listing_price                       float64
median_listing_price_mm                    float64
median_listing_price_yy                    float64
active_listing_count                       float64
active_listing_count_mm                    float64
active_listing_count_yy                    float64
median_days_on_market                      float64
median_days_on_market_mm                   float64
median_days_on_market_yy                   float64
new_listing_count                          float64
new_listing_count_mm                       float64
new_listing_count_yy                       float64
price_increased_count                      float64
price_increased_count_mm                   float64
price_increased_count_yy                   float64
price_reduced_count                        float64
price_reduced_count_mm                     float64
price_reduced_c