In [78]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [79]:
folder_path = "data/2023"
FILE_NAME = "merged_sales_fixed.csv"
OLD_FILE_NAME = "geocoded_enriched_merged_sales.csv"

In [80]:
df_new = pd.read_csv(f"{folder_path}/{FILE_NAME}")
df_old = pd.read_csv(f"{folder_path}/{OLD_FILE_NAME}")

# remove 'unit' column from df_old
if 'unit' in df_old.columns:
    df_old = df_old.drop(columns=['unit'])

In [81]:
columns_in_old_but_not_in_new = set(df_old.columns) - set(df_new.columns)
print(columns_in_old_but_not_in_new)

{'y', 'full_address', 'x', 'owner_property_count', 'total_room_num', 'half_bathrooms', 'fireplaces', 'bathrooms', 'exterior_condition', 'year_built', 'properties', 'interior_condition', 'heat_type', 'kitchens', 'owner_name', 'bedrooms', 'ac_type', 'parking_spots', 'foundation'}


In [82]:
df_old

Unnamed: 0,sale_date,living_area,category,sale_price,parcel,price_per_sf,street_no,street_name,total_room_num,bedrooms,...,parking_spots,year_built,exterior_condition,foundation,full_address,owner_name,owner_property_count,x,y,properties
0,3/31/2023,1350,single_family,600000,01‐01789‐000,444.44,141,ASHLEY ST,8.0,2.0,...,3.0,1968,Good,Concrete,141 ASHLEY ST BOSTON MA 02128,141 ASHLEY LLC,1.0,-71.010458,42.389219,
1,5/26/2023,1401,single_family,600000,01‐00931‐000,428.27,606,Bennington ST,7.0,4.0,...,2.0,1900,Average,Stone,606-608 BENNINGTON ST BOSTON MA 02128,OLIVEIRA GABRIEL A,1.0,-71.017048,42.383188,
2,4/28/2023,1654,single_family,649900,01‐01041‐000,392.82,682,Bennington ST,7.0,3.0,...,0.0,1900,Average,Brick,682 BENNINGTON ST BOSTON MA 02128,MURALLES INGRID J,1.0,-71.013698,42.384396,
3,5/31/2023,1568,single_family,619000,01‐01521‐001,394.77,1203,Bennington ST,7.0,4.0,...,0.0,1910,Average,Brick,1203 BENNINGTON ST BOSTON MA 02128,MORALES CARDONA DIEGO,1.0,-70.998107,42.389161,
4,3/31/2023,1780,single_family,400000,01‐04320‐000,224.72,172,Cowper ST,8.0,4.0,...,0.0,1910,Poor,Brick,172 COWPER ST BOSTON MA 02128,TEMPO L.J. CORP,1.0,-71.013738,42.382003,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156,11/27/2023,2814,two_family,1000000,22‐04490‐000,355.37,12,NONANTUM ST,14.0,6.0,...,2.0,1910,Good,Stone,12 NONANTUM ST BOSTON MA 02135,12 NONANTUM LLC,1.0,-71.168387,42.350199,
3157,9/28/2023,2325,two_family,875000,22‐03741‐000,376.34,135,NONANTUM ST,10.0,5.0,...,0.0,1890,Good,Stone,135-137 NONANTUM ST BOSTON MA 02135,MCCARTHY PATRICIA,3.0,-71.174358,42.350243,
3158,4/14/2023,2208,two_family,900000,22‐03853‐000,407.61,48,OAK SQUARE AV,12.0,5.0,...,3.0,1890,Average,Stone,48 OAK SQUARE AV BOSTON MA 02135,48 OAK SQUARE AVENUE LLC,1.0,-71.163746,42.351469,
3159,3/30/2023,2268,two_family,875000,22‐04322‐000,385.80,96,Parsons ST,8.0,4.0,...,4.0,1935,Average,Concrete Block,96-98 PARSONS ST BOSTON MA 02135,CHOWDHURY ISMAIL,1.0,-71.154403,42.353620,


In [87]:
# we will create a temp column for unique id
df_old['temp_unique_id'] = df_old["sale_date"] + "_" + df_old["parcel"]
df_new['temp_unique_id'] = df_new["sale_date"] + "_" + df_new["parcel"]

# strip spaces
df_old['temp_unique_id'] = df_old['temp_unique_id'].str.strip()
df_new['temp_unique_id'] = df_new['temp_unique_id'].str.strip()

# change the temp_unique_id to string
df_old['temp_unique_id'] = df_old['temp_unique_id'].astype(str)
df_new['temp_unique_id'] = df_new['temp_unique_id'].astype(str)


df_old['temp_unique_id'].head()

import unicodedata as ud
import re

def normalise_id(s: str) -> str:
    # 1) Unicode normalisation (breaks compatibility glyphs into base chars)
    s = ud.normalize('NFKD', s)
    # 2) Replace any “fancy” dash characters with ASCII hyphen‑minus
    s = re.sub(r'[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D]', '-', s)
    # 3) Strip any stray whitespace
    return s.strip()

df_new['temp_unique_id_norm'] = df_new['temp_unique_id'].map(normalise_id)
df_old['temp_unique_id_norm'] = df_old['temp_unique_id'].map(normalise_id)

In [88]:
df_new['temp_unique_id_norm'].head()

0    3/31/2023_01-01789-000
1    5/26/2023_01-00931-000
2    4/28/2023_01-01041-000
3    5/31/2023_01-01521-001
4    3/31/2023_01-04320-000
Name: temp_unique_id_norm, dtype: object

In [89]:
print(df_new['temp_unique_id_norm'][0], df_old['temp_unique_id_norm'][0])
df_new['temp_unique_id_norm'][0] == df_old['temp_unique_id_norm'][0]

3/31/2023_01-01789-000 3/31/2023_01-01789-000


True

In [None]:

if not 'temp' in columns_in_old_but_not_in_new:
    columns_in_old_but_not_in_new = list(columns_in_old_but_not_in_new) + ['temp_unique_id_norm']
# move columns from df_old to df_new but based on temp_unique_id
df_merge = pd.merge(df_new, df_old[columns_in_old_but_not_in_new], on='temp_unique_id_norm', how='left')

df_merge.head()





ValueError: The column label 'temp_unique_id' is not unique.

In [65]:
print(df_merge[columns_in_old_but_not_in_new])

       y full_address   x  owner_property_count  total_room_num  \
0    NaN          NaN NaN                   NaN             NaN   
1    NaN          NaN NaN                   NaN             NaN   
2    NaN          NaN NaN                   NaN             NaN   
3    NaN          NaN NaN                   NaN             NaN   
4    NaN          NaN NaN                   NaN             NaN   
...   ..          ...  ..                   ...             ...   
4952 NaN          NaN NaN                   NaN             NaN   
4953 NaN          NaN NaN                   NaN             NaN   
4954 NaN          NaN NaN                   NaN             NaN   
4955 NaN          NaN NaN                   NaN             NaN   
4956 NaN          NaN NaN                   NaN             NaN   

      half_bathrooms  fireplaces  bathrooms exterior_condition year_built  \
0                NaN         NaN        NaN                NaN        NaN   
1                NaN         NaN        N

In [49]:
# save as csv
df_merge.to_csv(f"{folder_path}/geocoded_merged_sales_fixed.csv", index=False)