In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
folder_path = 'data/2020'

single_family_filename = 'single_family_sales.csv'
three_family_filename = 'three_family_sales.csv'
condo_filename = 'condo_sales.csv'
two_family_filename = 'two_family_sales.csv'

single_family_sales = pd.read_csv(folder_path + "/" + single_family_filename)
three_family_sales = pd.read_csv(folder_path + "/" + three_family_filename)
condo_sales = pd.read_csv(folder_path + "/" + condo_filename)
two_family_sales = pd.read_csv(folder_path + "/" + two_family_filename)



In [62]:
all_lengths = [len(single_family_sales), len(three_family_sales), len(condo_sales), len(two_family_sales)]
all_lengths.sort()

print(all_lengths)


[575, 604, 815, 4379]


In [63]:
# add category column
single_family_sales['category'] = 'single_family'
three_family_sales['category'] = 'three_family'
condo_sales['category'] = 'condo'
two_family_sales['category'] = 'two_family'

# check if there are any differences between the columns 
print(single_family_sales.columns)
print(three_family_sales.columns)
print(condo_sales.columns)
print(two_family_sales.columns)

# print diff columns 
# find overlapping columns
overlapping_columns = set(single_family_sales.columns) & set(three_family_sales.columns) & set(condo_sales.columns) & set(two_family_sales.columns)
print(overlapping_columns)

overlapping_columns = list(overlapping_columns)
# merge the overlapping columns
# merged_sales = pd.concat([single_family_sales[overlapping_columns], three_family_sales[overlapping_columns], condo_sales[overlapping_columns], two_family_sales[overlapping_columns]])

print(overlapping_columns)
# save the merged sales
merged_sales = pd.concat([
    single_family_sales[overlapping_columns],
    three_family_sales[overlapping_columns],
    condo_sales[overlapping_columns],
    two_family_sales[overlapping_columns]
], ignore_index=True)

# save
merged_sales.to_csv(f"{folder_path}/merged_sales.csv", index=False)





Index(['parcel', 'street_no', 'street_name', 'unit', 'sale_date', 'sale_price',
       'living_area', 'price_per_sf', 'category'],
      dtype='object')
Index(['parcel', 'street_no', 'street_name', 'unit', 'sale_date', 'sale_price',
       'living_area', 'price_per_sf', 'category'],
      dtype='object')
Index(['parcel', 'street_no', 'street_name', 'unit', 'sale_date', 'sale_price',
       'living_area', 'price_per_sf', 'category'],
      dtype='object')
Index(['parcel', 'street_no', 'street_name', 'unit', 'sale_date', 'sale_price',
       'living_area', 'price_per_sf', 'category'],
      dtype='object')
{'unit', 'street_name', 'sale_date', 'parcel', 'category', 'sale_price', 'street_no', 'price_per_sf', 'living_area'}
['unit', 'street_name', 'sale_date', 'parcel', 'category', 'sale_price', 'street_no', 'price_per_sf', 'living_area']


In [64]:
# check if the merged sales are not the same year

# check the sale date and check if it all ends with /2022
# This will return True only if ALL dates end with /2022, which is not what we want
# We should check if ANY dates end with /2022 to catch mismatched years
has_2022 = merged_sales['sale_date'].str.endswith('/2022').any()
if has_2022:
    print("Warning: Found sales from 2022 when all should be from 2023")
    
# Let's check what years we actually have
years = merged_sales['sale_date'].str.extract(r'/(\d{4})$')[0].unique()
print("Years found in data:", sorted(years))


Years found in data: ['2019', '2020']


In [67]:
# now since 2020 has both 2019 and 2020, we check if 2019 values are all included in original 2019 files

original_2019_merged_sales = pd.read_csv(f"data/2019/merged_sales.csv")

# extract 2019 from merged_sales
target_2019_merged_sales = merged_sales[merged_sales['sale_date'].str.endswith('2019')]

print(f"Number of 2019 sales in current dataset: {len(target_2019_merged_sales)}")
print(f"Number of sales in original 2019 file: {len(original_2019_merged_sales)}")

# Create unique identifiers combining parcel and sale_date
original_2019_merged_sales['unique_id'] = original_2019_merged_sales['parcel'] + '_' + original_2019_merged_sales['sale_date']
target_2019_merged_sales['unique_id'] = target_2019_merged_sales['parcel'] + '_' + target_2019_merged_sales['sale_date']

# Check for any 2019 records that appear in current dataset but not in original 2019 file
new_records = target_2019_merged_sales[~target_2019_merged_sales['unique_id'].isin(original_2019_merged_sales['unique_id'])]
if len(new_records) > 0:
    print(f"Warning: Found {len(new_records)} records from 2019 that were not in the original 2019 file")
    print("Sample of new records:")
    print(new_records[['parcel', 'sale_date']].head())



Number of 2019 sales in current dataset: 662
Number of sales in original 2019 file: 5973
Sample of new records:
           parcel  sale_date
899  01-00179-000  8/29/2019
913  01-04909-000   8/2/2019
927  01-04320-001  7/23/2019
930  02-02060-000   8/2/2019
983  07-02540-001  5/31/2019


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_2019_merged_sales['unique_id'] = target_2019_merged_sales['parcel'] + '_' + target_2019_merged_sales['sale_date']


In [68]:
# we should scrutinize the new records and really see if the parcels don't exist in the original 2019 file
new_records_parcels = new_records['parcel'].unique()
original_2019_merged_sales_parcels = original_2019_merged_sales['parcel'].unique()

# find the parcels that are in new_records but not in original_2019_merged_sales
new_records_parcels_not_in_original = set(new_records_parcels) - set(original_2019_merged_sales_parcels)

print(len(new_records_parcels_not_in_original))
print(len(new_records_parcels))

# check the overlapping parcels
overlapping_parcels = set(new_records_parcels) & set(original_2019_merged_sales_parcels)
print(len(overlapping_parcels))
print(overlapping_parcels)





40
40
0
set()


In [69]:
# check the overlapping parcels dataframe
new_records_parcels_not_in_original_df = new_records[new_records['parcel'].isin(overlapping_parcels)]
new_records_parcels_not_in_original_df


Unnamed: 0,unit,street_name,sale_date,parcel,category,sale_price,street_no,price_per_sf,living_area,unique_id


In [70]:
original_2019_merged_sales_parcels_df = original_2019_merged_sales[original_2019_merged_sales['parcel'].isin(overlapping_parcels)]
original_2019_merged_sales_parcels_df



Unnamed: 0,unit,street_name,sale_date,parcel,category,sale_price,street_no,price_per_sf,living_area,unique_id


In [55]:
# add the new records to the original 2019 merged sales
original_2019_merged_sales = pd.concat([original_2019_merged_sales, new_records_parcels_not_in_original_df], ignore_index=True)

# remove the unique_id column
original_2019_merged_sales = original_2019_merged_sales.drop(columns=['unique_id'])

# save the original 2019 merged sales
original_2019_merged_sales.to_csv(f"data/2019/merged_sales.csv", index=False)



In [54]:
print(len(merged_sales["parcel"].unique()))
print(len(merged_sales))

6372
6373
