# Data Collection Notebook

# Imports

In [1]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import pandas as pd
from IPython.display import display


## NYC Rolling Sales Data

### Initial Parsing of the Webpage 

Looking at the HTML layout of the website, we can see that there are `<table>` elements. These contain the rolling sales data. We can parse all the table rows `<tr>` from the webpage, and begin filtering from there.  

In [2]:
# Landing page that links to the annualized sales workbooks for every year and borough.
url = "https://www.nyc.gov/site/finance/property/property-annualized-sales-update.page"

# requests has no default timeout, so one is passed explicitly to keep the cell
# from hanging on a stalled connection. raise_for_status() makes an HTTP error
# fail here instead of letting an error page be parsed as if it were the data.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Every <tr> on the page; later cells walk these rows to find the download links.
rows = soup.find_all("tr")

# Print the raw rows so the table layout can be inspected by eye.
for r in rows:
    print(r)


<tr style="height: 36px;">
<td colspan="3" style="background-color: #00539a; color: #ffffff; height: 36px;"><strong>Neighborhood Sales Summary<br/></strong>Statistical summary of sales for all five boroughs for one-, two-, and three-family homes</td>
</tr>
<tr class="header" style="height: 36px;">
<td style="height: 36px;"><strong>Neighborhood Sales Data <br/></strong></td>
<td style="text-align: center; height: 36px;"><strong><em>Adobe PDF</em></strong></td>
<td style="text-align: center; height: 36px;"><em><strong>MS Excel</strong></em></td>
</tr>
<tr>
<td>Manhattan</td>
<td style="text-align: center;"><a href="/assets/finance/downloads/pdf/rolling_sales/neighborhood_sales/2024/2024_manhattan.pdf" rel="noopener" target="_blank">Download</a></td>
<td style="text-align: center;"><a href="/assets/finance/downloads/pdf/rolling_sales/neighborhood_sales/2024/2024_manhattan.xlsx" rel="noopener" target="_blank">Download</a></td>
</tr>
<tr>
<td>Bronx</td>
<td style="text-align: center;"><a hr

### Web Scraping Script

From the html we parsed previously, there are a few things to note 

1) The title `<td>` differs for the 2016 through 2014 tables: it reads `<yyyy> New York City` instead of `<yyyy> New York City Sales Data`.
2) From 2003 to 2017, the files use the legacy Excel extension `.xls`. From 2018 to 2024, they use `.xlsx`.

The Data Saving Structure is as follows: 
```text
New York City Sales Data/
├── 2003/
│   ├── Manhattan.xls
│   ├── Bronx.xls
│   ├── Brooklyn.xls
│   ├── Queens.xls
│   └── Staten Island.xls
├── 2004/
│   ├── Manhattan.xls
│   ├── Bronx.xls
│   ├── Brooklyn.xls
│   ├── Queens.xls
│   └── Staten Island.xls
...
└── 2018/
    ├── Manhattan.xlsx
    ├── Bronx.xlsx
    ├── Brooklyn.xlsx
    ├── Queens.xlsx
    └── Staten Island.xlsx
```


In [3]:
# Create top-level folder
root_folder = "New York City Sales Data"
os.makedirs(root_folder, exist_ok=True)

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so a stalled download would otherwise block the notebook forever.
download_timeout = 60

rows = soup.find_all("tr")

# Year (and its folder) of the table currently being walked. Both stay None
# until the first year heading has been seen.
current_year = None
year_folder = None

for tr in rows:
    cells = tr.find_all("td")
    if not cells:
        continue

    # Text of the first cell: either a year heading or a borough name.
    label = cells[0].get_text(strip=True)

    # Due to table header change in td, we use regex to find the heading and year
    match = re.search(r"(\d{4})\s+New\s+York\s+City", label)
    if match:
        current_year = match.group(1)
        year_folder = os.path.join(root_folder, current_year)
        os.makedirs(year_folder, exist_ok=True)
        print(f"\nSaving files for {current_year}")
        continue

    # Rows that appear before the first year heading are not sales files.
    if current_year is None:
        continue

    borough = label
    # The download link is read from the third cell; rows without one are skipped.
    link_tag = cells[2].find("a") if len(cells) > 2 else None
    if not link_tag or not link_tag.has_attr("href"):
        continue

    # hrefs on the page are site-relative, so resolve them against the page URL.
    excel_url = urljoin(url, link_tag["href"])

    extension = ".xlsx" if ".xlsx" in excel_url else ".xls"
    filepath = os.path.join(year_folder, f"{borough}{extension}")
    # Stream into a side file and rename only after the transfer completes, so
    # a failed download never leaves a truncated workbook for later cells.
    partial_path = filepath + ".part"

    print(f"Downloading {borough} {current_year} data")
    try:
        with requests.get(excel_url, stream=True, timeout=download_timeout) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
    except BaseException:
        # Clean up the incomplete side file, then let the error surface unchanged.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

print("All files downloaded")


Saving files for 2024
Downloading Manhattan 2024 data
Downloading Bronx 2024 data
Downloading Brooklyn 2024 data
Downloading Queens 2024 data
Downloading Staten Island 2024 data

Saving files for 2023
Downloading Manhattan 2023 data
Downloading Bronx 2023 data
Downloading Brooklyn 2023 data
Downloading Queens 2023 data
Downloading Staten Island 2023 data

Saving files for 2022
Downloading Manhattan 2022 data
Downloading Bronx 2022 data
Downloading Brooklyn 2022 data
Downloading Queens 2022 data
Downloading Staten Island 2022 data

Saving files for 2021
Downloading Manhattan 2021 data
Downloading Bronx 2021 data
Downloading Brooklyn 2021 data
Downloading Queens 2021 data
Downloading Staten Island 2021 data

Saving files for 2020
Downloading Manhattan 2020 data
Downloading Bronx 2020 data
Downloading Brooklyn 2020 data
Downloading Queens 2020 data
Downloading Staten Island 2020 data

Saving files for 2019
Downloading Manhattan 2019 data
Downloading Bronx 2019 data
Downloading Brooklyn 2

### `.xls` to `.xlsx` conversion

Each `.xls` file is read into a pandas `DataFrame` and saved again as `.xlsx`, so that every year uses the same file format.
The original `.xls` file is removed once the `.xlsx` copy has been written.

In [4]:
def clean_string(s):
    """Remove illegal control characters from a string value.

    Strings lose every ASCII control character in the range 0-31 except tab,
    newline and carriage return. Any non-string value is returned unchanged.
    """
    if not isinstance(s, str):
        return s
    return re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F]', '', s)

# Convert every legacy .xls workbook under root_folder into an .xlsx workbook.
for year in os.listdir(root_folder):
    year_folder = os.path.join(root_folder, year)

    # Only year sub-folders hold workbooks; ignore stray files at the top level.
    if not os.path.isdir(year_folder):
        continue

    for file in os.listdir(year_folder):
        file_path = os.path.join(year_folder, file)
        # Only process .xls files
        if not file.lower().endswith(".xls"):
            continue

        # header=None reads the sheet verbatim. With the default header, the
        # first row would become column labels, which are not cleaned below and
        # gain synthetic "Unnamed: N" names that would be written back as data.
        df = pd.read_excel(file_path, header=None)

        # Strip illegal control characters from every text cell before saving.
        for col in df.select_dtypes(include='object'):
            df[col] = df[col].apply(clean_string)

        # Create new filename with .xlsx
        new_file_path = os.path.splitext(file_path)[0] + ".xlsx"

        # Save as .xlsx; header=False because the frame has no real column
        # labels, only the sheet's own rows.
        df.to_excel(new_file_path, index=False, header=False)

        # Remove the old .xls file now that the .xlsx copy exists.
        os.remove(file_path)

        print(f"Converted {file} → {os.path.basename(new_file_path)}")


Converted Queens.xls → Queens.xlsx
Converted Manhattan.xls → Manhattan.xlsx
Converted Brooklyn.xls → Brooklyn.xlsx
Converted Staten Island.xls → Staten Island.xlsx
Converted Bronx.xls → Bronx.xlsx
Converted Queens.xls → Queens.xlsx
Converted Manhattan.xls → Manhattan.xlsx
Converted Brooklyn.xls → Brooklyn.xlsx
Converted Staten Island.xls → Staten Island.xlsx
Converted Bronx.xls → Bronx.xlsx
Converted Queens.xls → Queens.xlsx
Converted Manhattan.xls → Manhattan.xlsx
Converted Brooklyn.xls → Brooklyn.xlsx
Converted Staten Island.xls → Staten Island.xlsx
Converted Bronx.xls → Bronx.xlsx
Converted Queens.xls → Queens.xlsx
Converted Manhattan.xls → Manhattan.xlsx
Converted Brooklyn.xls → Brooklyn.xlsx
Converted Staten Island.xls → Staten Island.xlsx
Converted Bronx.xls → Bronx.xlsx
Converted Queens.xls → Queens.xlsx
Converted Manhattan.xls → Manhattan.xlsx
Converted Brooklyn.xls → Brooklyn.xlsx
Converted Staten Island.xls → Staten Island.xlsx
Converted Bronx.xls → Bronx.xlsx
Converted Queen

### Data Cleaning and Standardization Process

In this step, I standardized all NYC property sales `.xlsx` files to ensure consistent formatting and datatypes across years.

1. **Detect and normalize headers automatically**  
   - Since each dataset starts at a different row, the script dynamically locates the header row by comparing it to an expected list of column names.  
   - Column names are normalized (uppercase, stripped whitespace, removed quotes/newlines) for consistency.

2. **Convert datatypes for each column**  
   - **Integers:** `BLOCK`, `LOT`, `RESIDENTIAL UNITS`, `COMMERCIAL UNITS`, `TOTAL UNITS`, `YEAR BUILT`
   - **Floats:** `LAND SQUARE FEET`, `GROSS SQUARE FEET`, `SALE PRICE`  
   - **Strings:** `BOROUGH`, `NEIGHBORHOOD`, `BUILDING CLASS CATEGORY`, `TAX CLASS AT PRESENT`,
            `EASE-MENT`, `BUILDING CLASS AT PRESENT`, `ADDRESS`, `APARTMENT NUMBER`,
            `ZIP CODE`, `TAX CLASS AT TIME OF SALE`, `BUILDING CLASS AT TIME OF SALE`
   - **Datetime:** `SALE DATE`  

3. **Reformat and overwrite cleaned files**  
   - Each cleaned DataFrame is saved back to its original Excel file location, ensuring all files now have consistent structure, column names, and data types.


In [5]:
# Folder that holds one sub-folder of borough workbooks per year.
root_folder = "New York City Sales Data"

# Columns grouped by the dtype they are converted to during standardisation.
int_cols = [
    "BLOCK",
    "LOT",
    "RESIDENTIAL UNITS",
    "COMMERCIAL UNITS",
    "TOTAL UNITS",
    "YEAR BUILT",
]
float_cols = [
    "LAND SQUARE FEET",
    "GROSS SQUARE FEET",
    "SALE PRICE",
]
str_cols = [
    "BOROUGH",
    "NEIGHBORHOOD",
    "BUILDING CLASS CATEGORY",
    "TAX CLASS AT PRESENT",
    "EASE-MENT",
    "BUILDING CLASS AT PRESENT",
    "ADDRESS",
    "APARTMENT NUMBER",
    "ZIP CODE",
    "TAX CLASS AT TIME OF SALE",
    "BUILDING CLASS AT TIME OF SALE",
]
datetime_cols = [
    "SALE DATE",
]

# Full header that a sheet row is compared against when locating the header row.
expected_columns = [
    "BOROUGH",
    "NEIGHBORHOOD",
    "BUILDING CLASS CATEGORY",
    "TAX CLASS AT PRESENT",
    "BLOCK",
    "LOT",
    "EASE-MENT",
    "BUILDING CLASS AT PRESENT",
    "ADDRESS",
    "APARTMENT NUMBER",
    "ZIP CODE",
    "RESIDENTIAL UNITS",
    "COMMERCIAL UNITS",
    "TOTAL UNITS",
    "LAND SQUARE FEET",
    "GROSS SQUARE FEET",
    "YEAR BUILT",
    "TAX CLASS AT TIME OF SALE",
    "BUILDING CLASS AT TIME OF SALE",
    "SALE PRICE",
    "SALE DATE",
]

def normalize(col):
    """Return a canonical, comparable form of a column label.

    The value is converted to a string, newlines become spaces, double quotes
    are dropped, surrounding whitespace is trimmed and the result is upper-cased.
    """
    label = str(col)
    label = label.replace('\n', ' ')
    label = label.replace('"', '')
    return label.strip().upper()

normalized_expected = [normalize(c) for c in expected_columns]

# Share of the expected column names a row must contain to be taken as the header.
header_match_ratio = 0.7


def _find_header_row(raw):
    """Return the index of the first row that looks like the header, else None.

    `raw` is a sheet read with header=None. A row qualifies when the number of
    its normalized cells found in `normalized_expected` reaches
    `header_match_ratio` of the expected column count.
    """
    needed = len(normalized_expected) * header_match_ratio
    for i, row in raw.iterrows():
        normalized_row = [normalize(c) for c in row.values]
        matches = sum(col in normalized_expected for col in normalized_row)
        if matches >= needed:
            return i
    return None


def _as_text(series):
    """Convert a column to strings, rendering missing values as "".

    A column of whole numbers that contains blanks is read as float, so a plain
    astype(str) would turn 10001 into "10001.0". Whole-number floats are
    therefore written without the trailing ".0". All other values are
    stringified as before.
    """
    def _fix_float(value):
        if isinstance(value, float):
            if value != value:  # NaN is the only float that differs from itself
                return ""
            if value.is_integer():
                return str(int(value))
        return value

    return series.map(_fix_float).astype(str).replace("nan", "")


for year in os.listdir(root_folder):
    year_folder = os.path.join(root_folder, year)
    if not os.path.isdir(year_folder):
        continue
    print(f"\nEntering year: {year}")

    for file in os.listdir(year_folder):
        # Skip temp files, and anything that is not an .xlsx workbook: the
        # openpyxl engine used below cannot open other file types.
        if file.startswith("~") or not file.lower().endswith(".xlsx"):
            continue

        file_path = os.path.join(year_folder, file)

        # Detect header row
        raw = pd.read_excel(file_path, header=None, engine="openpyxl")
        header_row_idx = _find_header_row(raw)
        if header_row_idx is None:
            # Without a header the columns cannot be mapped. Saving anyway would
            # write a synthetic 0,1,2,... header row into the file, so leave
            # the workbook untouched and report it instead.
            print(f"Skipped {file_path}: no header row found")
            continue

        df = pd.read_excel(file_path, header=header_row_idx, engine="openpyxl")
        df.columns = [normalize(c) for c in df.columns]

        # Convert cols
        for col in int_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")  # allows NaN

        for col in float_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        for col in str_cols:
            if col in df.columns:
                df[col] = _as_text(df[col])

        for col in datetime_cols:
            if col in df.columns:
                df[col] = pd.to_datetime(df[col], errors="coerce")

        # Resave neatly (overwrites the workbook in place)
        df.to_excel(file_path, index=False, engine="openpyxl")
        print(f"Processed and saved {file_path}")


Entering year: 2013
Processed and saved New York City Sales Data/2013/Manhattan.xlsx
Processed and saved New York City Sales Data/2013/Brooklyn.xlsx
Processed and saved New York City Sales Data/2013/Bronx.xlsx
Processed and saved New York City Sales Data/2013/Staten Island.xlsx
Processed and saved New York City Sales Data/2013/Queens.xlsx

Entering year: 2014
Processed and saved New York City Sales Data/2014/Manhattan.xlsx
Processed and saved New York City Sales Data/2014/Brooklyn.xlsx
Processed and saved New York City Sales Data/2014/Bronx.xlsx
Processed and saved New York City Sales Data/2014/Staten Island.xlsx
Processed and saved New York City Sales Data/2014/Queens.xlsx

Entering year: 2022
Processed and saved New York City Sales Data/2022/Manhattan.xlsx
Processed and saved New York City Sales Data/2022/Brooklyn.xlsx
Processed and saved New York City Sales Data/2022/Bronx.xlsx
Processed and saved New York City Sales Data/2022/Staten Island.xlsx
Processed and saved New York City Sa