In [2]:
import pandas as pd
import sys
import os

In [4]:
def load_redfin_data(filepath: str) -> pd.DataFrame:
    """
    Read the subset of Redfin housing market columns used by this pipeline.

    The file is parsed as tab-separated text even though callers may give it
    a '.csv' name. Columns not listed below are skipped while reading.

    Parameters:
    filepath (str): Location of the tab-separated Redfin export.

    Returns:
    pd.DataFrame: The 'period_end', 'region_type', 'region_name', 'duration'
                  and 'median_sale_price' columns of the file.
    """
    wanted_columns = [
        'period_end',
        'region_type',
        'region_name',
        'duration',
        'median_sale_price',
    ]
    frame = pd.read_csv(filepath, sep='\t', usecols=wanted_columns)
    return frame

In [5]:
def filter_data(df):
    """
    Reduce Redfin market data to the latest 12-week county median sale prices.

    Processing order:
    1. Take the maximum period_end over the whole input, before any filtering.
    2. Keep rows whose region_type is 'county' and whose duration is '12 weeks'.
    3. Discard rows with a missing median_sale_price.
    4. Keep only rows whose period_end equals the maximum found in step 1.
    5. Remove the substrings ' County', ' Parish', ' Census Area' and
       ' Borough' wherever they occur in region_name.
    6. Rename the 'region_name' column to 'county'.

    Because the maximum in step 1 is computed on the unfiltered input, the
    result is empty when no priced 12-week county row carries that date.
    The input DataFrame itself is left unmodified.

    Parameters:
    df (pd.DataFrame): Redfin data with 'period_end', 'region_type',
                       'region_name', 'duration' and 'median_sale_price'.

    Returns:
    pd.DataFrame: Columns 'county' and 'median_sale_price' for the selected
                  rows, keeping their original index labels.
    """
    # NOTE: this local must stay named `max_dt`; the query string below
    # looks it up by name through the `@max_dt` reference.
    max_dt = df['period_end'].max()

    # Row selection, one condition at a time
    is_county = df['region_type'] == 'county'
    is_twelve_week = df['duration'] == '12 weeks'
    candidates = df[is_county & is_twelve_week]
    priced = candidates.dropna(subset=['median_sale_price'])
    latest = priced.query("period_end == @max_dt")
    selected = latest[['region_name', 'median_sale_price']]

    # Strip the administrative-unit words from the names, in a fixed order
    county_names = selected['region_name']
    for unit_word in (' County', ' Parish', ' Census Area', ' Borough'):
        county_names = county_names.str.replace(unit_word, '', regex=False)

    # assign() works on a copy, so neither `selected` nor `df` is altered
    return (
        selected
        .assign(region_name=county_names)
        .rename(columns={"region_name": "county"})
    )


In [6]:
def main():
    """
    Run the Redfin cleaning pipeline end to end, reporting progress on stdout.

    Steps:
    1. Append the absolute path of the current directory to sys.path.
    2. Build the input and output paths under '../data'.
    3. Read the raw Redfin file with load_redfin_data.
    4. Reduce it with filter_data.
    5. Write the result to CSV without the index column.

    Returns:
    None
    """

    # Step 1: make modules in the working directory importable
    current_dir = os.path.abspath('.')
    sys.path.append(current_dir)
    print("Added current directory to system path.")

    # Step 2: both files live in the same sibling 'data' directory
    data_dir = os.path.join('..', 'data')
    input_path = os.path.join(data_dir, 'redfin_raw.csv')
    output_path = os.path.join(data_dir, 'redfin_clean.csv')
    print(f"Input file path: {input_path}")
    print(f"Output file path: {output_path}")

    # Step 3: read the raw export
    print("Loading Redfin data...")
    df = load_redfin_data(input_path)
    print("Data loaded successfully.")

    # Step 4: keep only the latest 12-week county prices
    print("Filtering and cleaning data...")
    df_filtered = filter_data(df)
    print("Data filtering complete.")

    # Step 5: persist the result
    print(f"Saving cleaned data to {output_path}...")
    df_filtered.to_csv(output_path, index=False)
    print("File saved successfully.")

In [7]:
# Run the pipeline only when executed directly (a script or a notebook cell,
# where __name__ is "__main__"), not when this code is imported as a module.
if __name__ == "__main__":
    main()

Added current directory to system path.
Input file path: ../data/redfin_raw.csv
Output file path: ../data/redfin_clean.csv
Loading Redfin data...
Data loaded successfully.
Filtering and cleaning data...
Data filtering complete.
Saving cleaned data to ../data/redfin_clean.csv...
File saved successfully.
