In [8]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [9]:

def preprocess_data(df: pd.DataFrame, cont_vars: list, target: str, cat_vars=('state_name',)):
    """
    One-hot encodes categorical features, builds the predictor list, and splits the rows
    into training, validation, and imputation sets.

    Rows whose target is present are split 80/20 into training and validation subsets
    (fixed random_state=42). Rows whose target is missing form the imputation set.

    Parameters:
    df (pd.DataFrame): The input DataFrame. It is not modified; encoding happens on a new frame.
    cont_vars (list): List of continuous variables used as predictors.
    target (str): Target variable to predict.
    cat_vars (sequence of str, optional): Categorical columns to one-hot encode.
        Defaults to ('state_name',), which reproduces the previous hard-coded behaviour.

    Returns:
    tuple: (df, predictors, X_train_split, X_val, y_train_split, y_val, X_impute_data)
        - df is the input DataFrame as passed in (not the encoded frame).
        - predictors is cont_vars followed by the generated dummy column names.
        - X_impute_data holds the predictor values of the rows whose target is null.
    """
    original_columns = set(df.columns)

    # One-hot encode categorical variables (first level dropped for each variable)
    df_encoded = pd.get_dummies(df, columns=list(cat_vars), drop_first=True)

    # Dummy columns are exactly the columns get_dummies added. Comparing against the
    # original column set avoids sweeping in an unrelated pre-existing column that merely
    # shares a prefix such as 'state_name_'.
    dummy_cols = [col for col in df_encoded.columns if col not in original_columns]
    predictors = list(cont_vars) + dummy_cols

    # Rows with a known target are used for fitting; the rest are to be imputed
    has_target = df_encoded[target].notnull()
    X_train = df_encoded.loc[has_target, predictors]
    y_train = df_encoded.loc[has_target, target]
    X_impute_data = df_encoded.loc[~has_target, predictors]

    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    return df, predictors, X_train_split, X_val, y_train_split, y_val, X_impute_data

def train_and_impute(df: pd.DataFrame, predictors: list, X_train_split, X_val, y_train_split, y_val, X_impute_data, target: str):
    """
    Trains a RandomForestRegressor, prints its validation R² score, and fills the
    missing target values with the model's predictions.

    Parameters:
    df (pd.DataFrame): The original DataFrame. It is not modified; imputation is done on a copy.
    predictors (list): List of predictor variables (accepted for interface compatibility;
        not used inside this function).
    X_train_split (pd.DataFrame): Training predictors.
    X_val (pd.DataFrame): Validation predictors.
    y_train_split (pd.Series): Training target.
    y_val (pd.Series): Validation target.
    X_impute_data (pd.DataFrame): Predictors for the rows with a missing target. Its rows
        must be in the same order as the rows of `df` whose target is null, because the
        predictions are assigned positionally.
    target (str): The target variable to predict.

    Returns:
    pd.DataFrame: A copy of `df` in which the null entries of `target` are replaced by predictions.
    """
    # Train the model (fixed seed for reproducible results)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train_split, y_train_split)

    # Evaluate model performance on the held-out validation split
    y_val_pred_rf = rf_model.predict(X_val)
    print(f"Random Forest R² Score: {r2_score(y_val, y_val_pred_rf):.4f}")

    # Work on a copy so the caller's DataFrame is never mutated as a side effect
    df_imputed = df.copy()

    # Nothing to impute: predicting on a zero-row input would raise, so return early
    if len(X_impute_data) == 0:
        return df_imputed

    # Impute missing values
    missing_mask = df_imputed[target].isnull()
    df_imputed.loc[missing_mask, target] = rf_model.predict(X_impute_data)

    return df_imputed

def execute_pipeline(df: pd.DataFrame):
    """
    Runs the imputation pipeline end to end: preprocessing, model training, and
    filling in the missing values of the target column.

    The target is 'median_sale_price'; the continuous predictors are fixed inside
    this function.

    Parameters:
    df (pd.DataFrame): The input DataFrame.

    Returns:
    pd.DataFrame: The DataFrame returned by `train_and_impute`, with missing target values imputed.
    """
    target = 'median_sale_price'
    cont_vars = [
        'median_household_income',
        'population',
        'median_age',
        'intersection_density',
        'population_density',
        'pct_bachelor',
    ]

    # preprocess_data returns its results in exactly the positional order that
    # train_and_impute expects, so the tuple is forwarded as-is, followed by the target.
    prepared = preprocess_data(df, cont_vars, target)
    return train_and_impute(*prepared, target)



In [10]:
def compute_composite_score(df, cont_vars, adjustments, scale_range=(0, 100)):
    """
    Computes a composite score from continuous variables using PCA.

    This function:
    1. Standardizes the selected variables using `StandardScaler()`.
    2. Multiplies each standardized variable by its adjustment factor (weight).
    3. Performs Principal Component Analysis (PCA) and extracts the first principal component.
    4. Min-max scales the principal component to a specified range (default: 0-100).
    5. Returns a copy of the DataFrame with the composite score, plus the PCA loadings.

    The factors are applied AFTER standardization on purpose: multiplying a column by a
    positive constant before standardizing is cancelled out by the division by the
    column's standard deviation, so a factor such as 0.75 would otherwise have no effect
    (only the sign of a factor would survive).

    Parameters:
    df (pd.DataFrame): The input DataFrame containing continuous variables.
    cont_vars (list): List of continuous variable column names to be used in PCA.
    adjustments (dict): Dictionary where keys are variable names and values are adjustment factors.
                        If a variable is not in the dictionary, it defaults to a factor of 1.
                        Keys that are not in `cont_vars` are ignored with a warning.
                        `None` is treated as an empty dictionary.
    scale_range (tuple, optional): The target range for the composite score scaling. Default is (0, 100).

    Returns:
    tuple:
        - pd.DataFrame: A copy of the input DataFrame with an added 'composite_score' column.
        - np.ndarray: The loadings of the first principal component, one per entry of
          `cont_vars`, expressed on the weighted, standardized variables.
    """
    import warnings  # local import: only needed for the unknown-key diagnostic below

    adjustments = adjustments or {}

    # An adjustment whose key matches no variable is almost certainly a typo; surface it
    # instead of dropping it silently.
    unknown_keys = [key for key in adjustments if key not in cont_vars]
    if unknown_keys:
        warnings.warn(
            f"Ignoring adjustments for variables not in cont_vars: {unknown_keys}",
            stacklevel=2,
        )

    # Standardize first so every variable starts on a comparable scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df[cont_vars].values)

    # Apply the adjustment factors as per-variable weights (default weight is 1)
    weights = np.array([adjustments.get(var, 1) for var in cont_vars], dtype=float)
    X_weighted = X_scaled * weights

    # Perform PCA to reduce dimensionality to one component
    pca = PCA(n_components=1)
    pc1 = pca.fit_transform(X_weighted).ravel()

    # Scale the principal component to the specified range
    low, high = scale_range
    min_val, max_val = np.min(pc1), np.max(pc1)
    if max_val == min_val:
        scaled_pc1 = np.full_like(pc1, low)  # Avoid division by zero
    else:
        scaled_pc1 = (pc1 - min_val) / (max_val - min_val) * (high - low) + low

    # Add the composite score to a copy so the caller's DataFrame is left untouched
    df_out = df.copy()
    df_out['composite_score'] = scaled_pc1

    # Extract PCA loadings (weight of each variable in the first principal component)
    loadings = pca.components_[0]

    return df_out, loadings


In [11]:
def main():
    """
    Main function to execute the full data processing pipeline.

    Steps:
    1. Add the current directory to the system path.
    2. Define input and output file paths.
    3. Load Redfin and ACS datasets.
    4. Merge ACS data with Redfin data (left join on 'county').
    5. Drop duplicate (state_name, town) rows and run the imputation pipeline.
    6. Compute the composite score.
    7. Round specific columns for better readability.
    8. Save the final processed dataset to a CSV file.

    Returns:
    None
    """

    # Ensure the script can access necessary modules
    sys.path.append(os.path.abspath('.'))
    print("Added current directory to system path.")

    # Define file paths
    redfin_path = os.path.join('..', 'data', 'redfin_clean.csv')
    acs_path = os.path.join('..', 'data', 'acs_with_county.csv')
    output_path = os.path.join('..', 'data', 'final_data.csv')

    print(f"Loading data from:\n  - {redfin_path}\n  - {acs_path}")

    # Load datasets
    df_redfin = pd.read_csv(redfin_path)
    df_acs_with_county = pd.read_csv(acs_path)
    print("Datasets loaded successfully.")

    # Merge datasets
    # NOTE(review): the join key is 'county' alone; if county names repeat across states
    # this could match rows from different states — confirm the key is unique.
    print("Merging ACS and Redfin data...")
    df_merged = df_acs_with_county.merge(df_redfin, on='county', how='left')

    # Drop duplicates, keeping the last occurrence
    df_final = df_merged.drop_duplicates(subset=['state_name', 'town'], keep='last')
    print("Duplicates removed.")

    # Execute the data processing pipeline
    print("Executing data pipeline...")
    df_final = execute_pipeline(df_final)
    print("Pipeline execution completed.")

    # Compute composite score
    # Adjustment keys must match the names in cont_vars exactly; a key that matches no
    # variable is not applied. The column is 'median_age' (not 'age').
    print("Computing composite score...")
    df_out, loadings = compute_composite_score(
        df=df_final,
        cont_vars=['median_household_income', 'median_age', 'intersection_density',
                   'pct_bachelor', 'median_sale_price'],
        adjustments={
            'median_age': -1,
            'median_sale_price': 0.75
        }
    )
    print("Composite score computed.")

    # Round selected columns
    cols_to_round = ['intersection_density', 'population_density',
                     'pct_bachelor', 'median_sale_price', 'composite_score']
    df_out[cols_to_round] = df_out[cols_to_round].round(2)

    # Save final processed data
    print(f"Saving final dataset to {output_path}...")
    df_out.to_csv(output_path, index=False)
    print("Final dataset saved successfully.")


In [12]:
# Run the full pipeline: load the Redfin and ACS CSVs, merge them, impute the missing
# sale prices, compute the composite score, and write ../data/final_data.csv.
main()

Added current directory to system path.
Loading data from:
  - ../data/redfin_clean.csv
  - ../data/acs_with_county.csv
Datasets loaded successfully.
Merging ACS and Redfin data...
Duplicates removed.
Executing data pipeline...
Random Forest R² Score: 0.6255
Pipeline execution completed.
Computing composite score...
Composite score computed.
Saving final dataset to ../data/final_data.csv...
Final dataset saved successfully.
