In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio


In [9]:
import glob


folders = ["data/2019", "data/2020", "data/2021", "data/2022", "data/2023"]


# read all the files in the folders as pandas dataframe
# create an empty list to store the dataframes
dataframes = []
for folder in folders:
    file_name = "geocoded_enriched_merged_sales.csv"
    data = pd.read_csv(f"{folder}/{file_name}")
    # add year column to the dataframe
    data["year"] = folder.split("/")[-1]
    dataframes.append(data)


In [10]:
print(dataframes)

[     unit street_name   sale_date        parcel       category sale_price  \
0      RD   BEACHVIEW   6/20/2019  01-02114-000  single_family    725,000   
1      ST  Bennington   6/27/2019  01-06784-000  single_family    510,000   
2      ST       EUTAW    3/8/2019  01-02831-000  single_family    650,000   
3      ST     Everett   6/21/2019  01-04931-000  single_family    445,000   
4      ST     Everett    6/4/2019  01-05187-010  single_family    725,200   
...   ...         ...         ...           ...            ...        ...   
5968   ST     SHANNON   9/17/2019  22-05357-000     two_family          7   
5969   RD     UPCREST   12/4/2019  22-03256-000     two_family          6   
5970   ST  Washington    1/8/2019  22-05360-000     two_family  1,460,000   
5971   AV     WESTERN   10/4/2019  22-00618-000     two_family          9   
5972   ST     Cottage  10/18/2019  01-04069-000   three_family    900,000   

      street_no price_per_sf living_area  total_room_num  ...  year_built 

In [12]:
# merge all the dataframes
data = pd.concat(dataframes)

# save the merged dataframe
data.to_csv("merged_geocoded_enriched_merged_sales.csv", index=False)



In [16]:
data[data["x"].isna() | data["y"].isna()]

Unnamed: 0,unit,street_name,sale_date,parcel,category,sale_price,street_no,price_per_sf,living_area,total_room_num,...,year_built,exterior_condition,foundation,full_address,owner_name,owner_property_count,x,y,properties,year


In [15]:
# check rows that have None for bedrooms
data[data["bedrooms"].isna() | data["living_area"].isna()][["bedrooms", "living_area"]]


Unnamed: 0,bedrooms,living_area
13,,1580
15,,1145
17,,1152
161,,2488
227,,2085
...,...,...
1027,,",200,000"
1076,,4128
1193,,00000
6089,,3435


In [19]:
# check dtype of living_area
data["living_area"] = data["living_area"].str.replace(',', '').astype(float)


In [20]:
data["living_area"]

0       1532.0
1        976.0
2       1878.0
3        850.0
4       1512.0
         ...  
3156    2814.0
3157    2325.0
3158    2208.0
3159    2268.0
3160    2506.0
Name: living_area, Length: 29855, dtype: float64

In [21]:
data[data["living_area"].isna()]

Unnamed: 0,unit,street_name,sale_date,parcel,category,sale_price,street_no,price_per_sf,living_area,total_room_num,...,year_built,exterior_condition,foundation,full_address,owner_name,owner_property_count,x,y,properties,year


In [23]:
# create a new df that has no None for bedrooms 
data_no_na = data[data["bedrooms"].notna()]

print(data_no_na.shape)

(29613, 29)


In [None]:
# save the new df
data_no_na.to_csv("sales_boston_geocoded.csv", index=False)

