## 1. Import Libraries

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Machine learning libraries
import sklearn
import tensorflow as tf
import torch
import keras

# Deep learning
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import torch.nn as nn
import torch.optim as optim

# Model evaluation and metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Natural Language Processing (NLP)
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# For data preprocessing and feature engineering
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

# For handling large datasets
import h5py

## Goal: The number of days a home will be on the market

## Data Dictionary

### Active Listing Count
The count of active listings within the specified geography during the specified month. The active listing count tracks the number of for sale properties on the market, excluding pending listings where a pending status is available. This is a snapshot measure of how many active listings can be expected on any given day of the specified month.

### Active Listing Count M/M
The percentage change in the active listing count from the previous month.

### Active Listing Count Y/Y
The percentage change in the active listing count from the same month in the previous year.

### Avg Listing Price
The average listing price within the specified geography during the specified month.

### Avg Listing Price M/M
The percentage change in the average listing price from the previous month.

### Avg Listing Price Y/Y
The percentage change in the average listing price from the same month in the previous year.

### Days on Market
The median number of days property listings spend on the market within the specified geography during the specified month. Time spent on the market is defined as the time between the initial listing of a property and either its closing date or the date it is taken off the market.

### Days on Market M/M
The percentage change in the median days on market from the previous month.

### Days on Market Y/Y
The percentage change in the median days on market from the same month in the previous year.

### Demand Score
The demand score is an index representing a zip code, county or metro’s unique listing page viewers per property ranking compared to other zip codes, counties, or metros.

### Hotness Rank
The specified zip code, county, or metro area’s Hotness rank, by Hotness score, compared to all other zip codes, counties and metro areas. A rank value of 1 is considered the hottest (highest Hotness score).

### Hotness Rank (Change)
The change in Hotness rank from the previous month. A positive value indicates that the geography’s Hotness has decreased (moved down in ranking), and a negative value indicates that its Hotness has increased (moved up in ranking).

### Hotness Rank (Prev)
The specified zip code, county, or metro area’s Hotness rank in the previous month.

### Hotness Rank Within CBSA
In the case of a zip code or county, this metric represents the zip code or county’s Hotness rank, by Hotness score, compared to all other zip codes or counties within its metro area. A rank value of 1 is considered the hottest (highest Hotness score).

### Hotness Rank Within County
In the case of a zip code, this metric represents the zip code’s Hotness rank, by Hotness score, compared to all other zip codes within its county. A rank value of 1 is considered the hottest (highest Hotness score).

### Hotness Score
The Hotness score is an equally-weighted composite metric of a geography’s supply score and demand score.

### LDP Unique Viewers Per Property (vs US)
The count of unique viewers a typical property receives in the specified geography divided by the count of unique viewers a typical property receives in the US overall during the same month.

### LDP Unique Viewers Per Property M/M
The change in unique viewers a typical property receives in the specified geography from the previous month.

### LDP Unique Viewers Per Property Y/Y
The change in unique viewers a typical property receives in the specified geography from the same month in the previous year.

### Median DOM
The median number of days property listings spend on the market within the specified geography during the specified month. Time spent on the market is defined as the time between the initial listing of a property and either its closing date or the date it is taken off the market.

### Median DOM (vs US)
The median days on market in the specified geography divided by the median days on market for the US overall during the same month.

### Median DOM M/M
The change, in days, in the median days on market from the previous month.

### Median DOM M/M Perc
The percentage change in the median days on market from the previous month.

### Median DOM Y/Y
The change, in days, in the median days on market from the same month in the previous year.

### Median DOM Y/Y Perc
The percentage change in the median days on market from the same month in the previous year.

### Median List Price Per Sqft
The median listing price per square foot within the specified geography during the specified month.

### Median List Price Per Sqft M/M
The percentage change in the median listing price per square foot from the previous month.

### Median List Price Per Sqft Y/Y
The percentage change in the median listing price per square foot from the same month in the previous year.

### Median Listing Price
The median listing price within the specified geography during the specified month.

### Median Listing Price (vs US)
The median listing price within the specified geography divided by the median listing price for the US overall during the same month.

### Median Listing Price M/M
The percentage change in the median listing price from the previous month.

### Median Listing Price Y/Y
The percentage change in the median listing price from the same month in the previous year.

### Median Listing Sqft
The median listing square feet within the specified geography during the specified month.

### Median Listing Sqft M/M
The percentage change in the median listing square feet from the previous month.

### Median Listing Sqft Y/Y
The percentage change in the median listing square feet from the same month in the previous year.

### New Listing Count
The count of new listings added to the market within the specified geography. The new listing count represents a typical week’s worth of new listings in a given month. The new listing count can be multiplied by the number of weeks in a month to produce a monthly new listing count.

### New Listing Count M/M
The percentage change in the new listing count from the previous month.

### New Listing Count Y/Y
The percentage change in the new listing count from the same month in the previous year.

### Nielsen HH Rank
The specified zip code, county, or metro area’s rank by household count compared to other zip codes, counties and metro areas. A rank value of 1 is the highest by household count.

### Pending Listing Count
The count of pending listings within the specified geography during the specified month, if a pending definition is available for that geography. This is a snapshot measure of how many pending listings can be expected on any given day of the specified month.

### Pending Listing Count M/M
The percentage change in the pending listing count from the previous month.

### Pending Listing Count Y/Y
The percentage change in the pending listing count from the same month in the previous year.

### Pending Ratio
The ratio of the pending listing count to the active listing count within the specified geography during the specified month.

### Pending Ratio M/M
The change in the pending ratio from the previous month.

### Pending Ratio Y/Y
The change in the pending ratio from the same month in the previous year.

### Price Decrease Count
The count of listings which have had their price reduced within the specified geography. The price decrease count represents a typical week’s worth of listings which have had their price reduced in a given month. The price decrease count can be multiplied by the number of weeks in a month to produce a monthly price decrease count.

### Price Decrease Count M/M
The percentage change in the price decrease count from the previous month.

### Price Decrease Count Y/Y
The percentage change in the price decrease count from the same month in the previous year.

### Price Increase Count
The count of listings which have had their price increased within the specified geography. The price increase count represents a typical week’s worth of listings which have had their price increased in a given month. The price increase count can be multiplied by the number of weeks in a month to produce a monthly price increase count.

### Price Increase Count M/M
The percentage change in the price increase count from the previous month.

### Price Increase Count Y/Y
The percentage change in the price increase count from the same month in the previous year.

### Quality Flag
Triggered (“1”) when data values are outside of their typical range. While rare, these figures should be reviewed before reporting.

### Supply Score
The supply score is an index representing a zip code, county or metro’s median days on market ranking compared to other zip codes, counties, or metros.

### Total Listing Count
The total of both active listings and pending listings within the specified geography during the specified month. This is a snapshot measure of how many total listings can be expected on any given day of the specified month.

### Total Listing Count M/M
The percentage change in the total listing count from the previous month.

### Total Listing Count Y/Y
The percentage change in the total listing count from the same month in the previous year.


## 1.1 Reading in data

In [3]:
# Load the Realtor.com CSV export into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="Realtor_com_data.csv")

## 2.1 EDA

In [11]:
# Echo the DataFrame so the notebook renders a preview of its rows and columns.
df

Unnamed: 0,month_date_yyyymm,country,median_listing_price,median_listing_price_mm,median_listing_price_yy,active_listing_count,active_listing_count_mm,active_listing_count_yy,median_days_on_market,median_days_on_market_mm,...,average_listing_price,average_listing_price_mm,average_listing_price_yy,total_listing_count,total_listing_count_mm,total_listing_count_yy,pending_ratio,pending_ratio_mm,pending_ratio_yy,quality_flag
0,202408,United States,429990.0,-0.0171,-0.0126,909344.0,0.0286,0.3577,53.0,0.0392,...,736930.0,-0.0214,-0.0309,1330186.0,0.0013,0.2093,0.4766,-0.0306,-0.1688,0.0
1,202407,United States,437450.0,-0.0170,-0.0058,884066.0,0.0525,0.3661,51.0,0.1333,...,753020.0,-0.0383,-0.0295,1328463.0,0.0206,0.2267,0.5072,-0.0465,-0.1725,0.0
2,202406,United States,445000.0,0.0058,0.0000,839992.0,0.0664,0.3673,45.0,0.0227,...,783008.0,-0.0065,-0.0051,1301692.0,0.0367,0.2237,0.5537,-0.0460,-0.1845,0.0
3,202405,United States,442450.0,0.0291,0.0033,787722.0,0.0727,0.3524,44.0,-0.0538,...,788155.0,0.0238,-0.0044,1255645.0,0.0575,0.2092,0.5997,-0.0231,-0.1902,0.0
4,202404,United States,429950.0,0.0119,-0.0001,734318.0,0.0568,0.3044,47.0,-0.0700,...,769863.0,0.0073,-0.0096,1187383.0,0.0672,0.2001,0.6228,0.0169,-0.1412,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,201610,United States,255000.0,,,1407698.0,,,72.0,,...,443573.0,,,1757600.0,,,0.2509,,,
95,201609,United States,255000.0,,,1443081.0,,,71.0,,...,439192.0,,,1807800.0,,,0.2561,,,
96,201608,United States,255271.0,,,1460048.0,,,67.0,,...,439230.0,,,1848445.0,,,0.2690,,,
97,201607,United States,259000.0,,,1463007.0,,,64.0,,...,442832.0,,,1873189.0,,,0.2839,,,


In [5]:
# List every column label in the DataFrame.
df.columns

Index(['month_date_yyyymm', 'country', 'median_listing_price',
       'median_listing_price_mm', 'median_listing_price_yy',
       'active_listing_count', 'active_listing_count_mm',
       'active_listing_count_yy', 'median_days_on_market',
       'median_days_on_market_mm', 'median_days_on_market_yy',
       'new_listing_count', 'new_listing_count_mm', 'new_listing_count_yy',
       'price_increased_count', 'price_increased_count_mm',
       'price_increased_count_yy', 'price_reduced_count',
       'price_reduced_count_mm', 'price_reduced_count_yy',
       'pending_listing_count', 'pending_listing_count_mm',
       'pending_listing_count_yy', 'median_listing_price_per_square_foot',
       'median_listing_price_per_square_foot_mm',
       'median_listing_price_per_square_foot_yy', 'median_square_feet',
       'median_square_feet_mm', 'median_square_feet_yy',
       'average_listing_price', 'average_listing_price_mm',
       'average_listing_price_yy', 'total_listing_count',
       'tot

In [14]:
# Overall dimensions as (rows, columns), followed by the row count on its own.
print(df.shape)
print(len(df.index))
# Nulls are widespread in this dataset, so rows containing missing values
# are kept rather than dropped; report the total number of missing cells.
print(df.isna().sum().sum())
# Remove exact duplicate rows, then show the dtype of each column.
df = df.drop_duplicates()
print(df.dtypes)

(99, 39)
99
337
month_date_yyyymm                           object
country                                     object
median_listing_price                       float64
median_listing_price_mm                    float64
median_listing_price_yy                    float64
active_listing_count                       float64
active_listing_count_mm                    float64
active_listing_count_yy                    float64
median_days_on_market                      float64
median_days_on_market_mm                   float64
median_days_on_market_yy                   float64
new_listing_count                          float64
new_listing_count_mm                       float64
new_listing_count_yy                       float64
price_increased_count                      float64
price_increased_count_mm                   float64
price_increased_count_yy                   float64
price_reduced_count                        float64
price_reduced_count_mm                     float64
price_reduced_c