# Scripts

##compare_dfs

## save_df

In [37]:
def save_df(df: pd.DataFrame, base_path: str, filename: str):
    """
    Overwrites a file in the base_path and creates a timestamped copy in an 'archive' subdirectory.
    If the archive file for the current day already exists, it will not be overwritten.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        base_path (str): The base directory where the file will be saved and archived.
        filename (str): The name of the file (e.g., 'gen_lookup.csv').
    """
    include_index_map = {
      "listings.csv": False,
      "gen_lookup.csv": False,
      "notes.csv": True,
      "allocations.csv": True,
    }
    # Determine if index should be included for the current filename
    include_idx = include_index_map.get(filename, False)

    # Construct the full path for the original file
    original_filepath = os.path.join(base_path, filename)

    # Save (overwrite) the original file
    df.to_csv(original_filepath, index=include_idx)
    print(f"Overwrote: {original_filepath}")

    # Create the archive directory path
    archive_dir = os.path.join(base_path, 'archive')
    os.makedirs(archive_dir, exist_ok=True)

    # Generate timestamp for the archive filename
    timestamp = datetime.now().strftime('%Y%m%d')
    name, ext = os.path.splitext(filename)
    archive_filename = f"{name}_{timestamp}{ext}"
    archive_filepath = os.path.join(archive_dir, archive_filename)

    # Check if the archive file already exists before saving
    if not os.path.exists(archive_filepath):
        # Save the archived file
        df.to_csv(archive_filepath, index=include_idx)
        print(f"Archived to: {archive_filepath}")
    else:
        print(f"Archive file already exists for today: {archive_filepath}. Skipping archive save.")

# Example usage:
# base_directory = "/content/drive/Shareddrives/market_analysis_v2/"
# save_df(gen_lookup, base_directory, "gen_lookup.csv")

## const and helpers

In [38]:
import re
import pandas as pd
from typing import Dict, Optional, List

# --- Carsales/General Scrapes (CS) Constants ---
YEAR_MIN, YEAR_MAX = 1980, 2035
ORDER: List[str] = ['href', 'year_make_model', 'trim', "listed_price", 'transmission', 'odometer', 'seller_type']

YEAR_RE  = r'\b(19[89]\d|20[0-3]\d)\b'
PRICE_RE = r'^(?:AU\$|\$)\s*[\d,]+(?:\.\d{2})?\b' # Made currency symbol mandatory
ODOM_RE  = r'^\s*\d+(?:,?\d{3})*K?\s*km\s*$' # Added optional 'K' for Facebook odometer format
URL_RE   = r'^(?:https?://|www\.)'
TX, SELLER = {'automatic', 'manual'}, {'private', 'dealer used'}

THRESH: Dict[str, float] = {
    'year_make_model': 0.50,
    "listed_price":           0.60,
    'transmission':    0.80,
    'odometer':        0.60,
    'seller_type':     0.70,
}

# --- Facebook Marketplace (FB) Constants ---
FB_ORDER: List[str] = ['href', 'year_make_model', 'listed_price', 'odometer', 'location']
THRESH_FB: Dict[str, float] = {
    'href':            0.80,
    'year_make_model': 0.50,
    'listed_price':    0.60,
    'odometer':        0.60,
    'location':        0.40,
}

# --- Predicates (Validation Rules) ---
def _ratio(mask: pd.Series) -> float:
    return float(mask.mean()) if len(mask) else 0.0

def _yr_ok(s: pd.Series) -> pd.Series:
    years = pd.to_numeric(s.astype(str).str.extract(YEAR_RE, expand=False), errors='coerce')
    return years.between(YEAR_MIN, YEAR_MAX)

PRED = {
    'year_make_model': lambda s: s.astype(str).pipe(_yr_ok) & s.astype(str).str.contains(r'[A-Za-z]', na=False),
    "listed_price":           lambda s: s.astype(str).str.match(PRICE_RE, na=False),
    'transmission':    lambda s: s.astype(str).str.strip().str.lower().isin(TX),
    'odometer':        lambda s: s.astype(str).str.match(ODOM_RE, flags=re.I, na=False),
    'seller_type':     lambda s: s.astype(str).str.strip().str.lower().isin(SELLER),
}

PRED_FB = {
    'year_make_model': lambda s: s.astype(str).pipe(_yr_ok) & s.astype(str).str.contains(r'[A-Za-z]', na=False),
    'listed_price':    lambda s: s.astype(str).str.match(PRICE_RE, na=False),
    'odometer':        lambda s: s.astype(str).str.match(ODOM_RE, flags=re.I, na=False),
}

# --- Core Identification Functions ---
def identify_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Identifies and maps raw DataFrame columns to canonical Carsales/General columns."""
    cols = list(df.columns)
    if not cols:
        return {k: None for k in ORDER}

    href_col = cols[0]

    # Exclude URL-like columns from other detection logic
    url_ratio = {c: _ratio(df[c].astype(str).str.contains(URL_RE, case=False, na=False)) for c in cols}
    urlish = {c for c, r in url_ratio.items() if r >= 0.50}
    blocked = {href_col} | urlish

    remaining = [c for c in cols if c not in blocked]
    picks = {t: None for t in PRED}

    for t in PRED:
        if not remaining:
            break
        scores = {c: _ratio(PRED[t](df[c])) for c in remaining}
        best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
        if best_score >= THRESH[t]:
            picks[t] = best_col
            remaining.remove(best_col)

    trim_col = None
    ymm = picks.get('year_make_model')
    if ymm in cols:
        i = cols.index(ymm)
        if i + 1 < len(cols):
            trim_col = cols[i + 1]

    return {'href': href_col, **picks, 'trim': trim_col}

def identify_fb_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Identifies and maps raw DataFrame columns to canonical Facebook Marketplace columns.
    Note: 'href' is assumed to be the first column and is handled by clean_fb directly.
    """
    cols = list(df.columns)
    if not cols:
        return {k: None for k in FB_ORDER}

    picks = {t: None for t in FB_ORDER}
    remaining = set(cols)

    # 'href' is now handled externally by clean_fb and is assumed to be the first column
    # So we set it to None here or simply don't try to identify it.
    # We explicitly remove the first column from 'remaining' as it's the href
    if cols and cols[0] in remaining:
        remaining.remove(cols[0])
    picks['href'] = None # No longer identified by this function

    # Identify 'year_make_model', 'listed_price', 'odometer'
    for t in ['year_make_model', 'listed_price', 'odometer']:
        if not remaining:
            break
        scores = {c: _ratio(PRED_FB[t](df[c])) for c in remaining}
        if scores:
            best_col, score = max(scores.items(), key=lambda kv: kv[1])
            if score >= THRESH_FB[t]:
                picks[t] = best_col
                remaining.remove(best_col)

    # Assign 'location', often found in column 'c' or as the last remaining column
    if picks['location'] is None:
        if 'c' in remaining:
            picks['location'] = 'c'
            remaining.remove('c')
        elif len(remaining) == 1:
            picks['location'] = remaining.pop()

    return picks

## clean_cs

In [39]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, List

def clean_cs(df: pd.DataFrame, save_raw: bool = False) -> pd.DataFrame:
    """
    Business Logic for clean_cs function:

    This function processes raw DataFrame outputs from Carsales/General web scrapes to standardize
    and clean vehicle listing data into a consistent format for analysis.

    Key steps and business rules:
    1.  **Raw Data Preservation (Optional):** If `save_raw` is True, the original DataFrame
        is saved to a timestamped CSV, and a 'raw' column (filename) is added to the output.
    2.  **Column Identification:** Dynamically maps raw DataFrame columns to canonical names
        ('href', 'year_make_model', 'listed_price', 'odometer', etc.) using `identify_columns`.
    3.  **Data Extraction & Standardization:**
        *   Cleans 'href' by removing query parameters and `http(s)://www.` prefix.
        *   Splits 'year_make_model' into 'year', 'make', and 'model'; converts 'year' to integer.
        *   Converts 'listed_price' and 'odometer' to integer, removing non-numeric characters.
        *   Transforms 'odometer' values from 'km' to '000 km' (e.g., 180,000 km -> 180).
    4.  **Output Structure:** Returns a DataFrame with a standardized set of columns for consistency.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        raw_filename = ''
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_carsales_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_columns' not in globals():
        raise NameError("Function 'identify_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    out = pd.DataFrame()
    if not df.empty and len(df.columns) > 0:
        out['href'] = df.iloc[:, 0]

    mapping = identify_columns(df)
    for col in ['year_make_model', 'trim', "listed_price", 'transmission', 'odometer', 'seller_type']:
        src = mapping.get(col)
        if src is not None and src != out['href'].name:
            out[col] = df[src]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        split_cols = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_cols.columns:
            out['year'] = pd.to_numeric(
                split_cols[0].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_cols[1] if 1 in split_cols.columns else pd.NA
        out['model'] = split_cols[2] if 2 in split_cols.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        # Revert: Remove .str.lower(), keep http(s)://www. prefixes removal
        out['href'] = out['href'].astype(str).str.replace(r'^(https?://)?(www\.)?', '', regex=True).str.split('?').str[0]

    for col in ["listed_price", 'odometer']:
        if col in out.columns:
            out[col] = pd.to_numeric(
                out[col].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')

    if 'odometer' in out.columns:
        out['odometer'] = out['odometer'] // 1000

    final_cols = ['href', 'year', 'make', 'model', "listed_price", 'trim', 'odometer', 'seller_type']
    if save_raw:
        final_cols.insert(0, 'raw')
    return out[[c for c in final_cols if c in out.columns]]

## clean_fb

In [40]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, List

def clean_fb(df: pd.DataFrame, save_raw: bool = False) -> pd.DataFrame:
    """
    Business Logic for clean_fb function:

    This function processes raw DataFrame outputs from Facebook Marketplace scrapes to standardize
    and clean vehicle listing data into a consistent format for analysis.

    Key steps and business rules:
    1.  **Raw Data Preservation (Optional):** If `save_raw` is True, the original DataFrame
        is saved to a timestamped CSV, and a 'raw' column (filename) is added to the output.
    2.  **Column Identification:** Dynamically maps raw DataFrame columns to canonical names
        ('href', 'year_make_model', 'listed_price', 'odometer', 'location') using `identify_fb_columns`.
    3.  **Data Extraction & Standardization:**
        *   Cleans 'href' by removing query parameters and `http(s)://www.` prefix.
        *   Splits 'year_make_model' into 'year', 'make', and 'model'; converts 'year' to integer.
        *   Converts 'listed_price' and 'odometer' to integer, removing non-numeric characters.
        *   Filters out listings with 'listed_price' explicitly marked as "free".
    4.  **Data Quality Filtering:** Drops rows with missing (`pd.NA`) values in critical columns
        ('listed_price', 'odometer', 'year') to ensure data integrity. Also removes listings
        with a placeholder 'listed_price' of 12345.
    5.  **Output Structure:** Returns a DataFrame with a standardized set of columns for consistency.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        raw_filename = ''
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_facebook_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_fb_columns' not in globals():
        raise NameError("Function 'identify_fb_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    out = pd.DataFrame()
    if not df.empty and len(df.columns) > 0:
        out['href'] = df.iloc[:, 0]

    mapping = identify_fb_columns(df)
    for canonical_col, src_col in mapping.items():
        if canonical_col != 'href' and src_col is not None and src_col in df.columns:
            out[canonical_col] = df[src_col]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        split_df = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_df.columns:
            out['year'] = split_df[0].astype(str).str.replace(r'[^0-9]', '', regex=True).replace('', pd.NA).astype(float).astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_df[1] if 1 in split_df.columns else pd.NA
        out['model'] = split_df[2] if 2 in split_df.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        # Revert: Remove .str.lower(), keep http(s)://www. prefixes removal
        out['href'] = out['href'].astype(str).str.replace(r'^(https?://)?(www\.)?', '', regex=True).str.split('?').str[0]

    for col in ["listed_price", 'odometer']:
        if col in out.columns:
            if col == 'listed_price':
                out = out[out[col].astype(str).str.lower() != "free"]
            out[col] = pd.to_numeric(
                out[col].astype(str).str.replace(r'[^0-9]', '', regex=True),
                errors='coerce'
            ).astype('Int64')

    cols_to_check_for_na = []
    if 'listed_price' in out.columns: cols_to_check_for_na.append('listed_price')
    if 'odometer' in out.columns: cols_to_check_for_na.append('odometer')
    if 'year' in out.columns: cols_to_check_for_na.append('year')

    if cols_to_check_for_na:
        out = out.dropna(subset=cols_to_check_for_na)

    final_columns = ['href', 'year', 'make', 'model', "listed_price", 'odometer', 'location']
    if save_raw:
        final_columns.insert(0, 'raw')
    return out[[c for c in final_columns if c in out.columns]]

## enrich_df

In [41]:
import pandas as pd
from typing import Dict, Optional, List

def enrich_df(df: pd.DataFrame, gen_lookup: pd.DataFrame) -> pd.DataFrame:
    """Final clean after clean_cs or clean_fb, including generation assignment.

    Args:
        df (pd.DataFrame): The DataFrame to enrich.
        gen_lookup (pd.DataFrame): A lookup table for car generations.

    Returns:
        pd.DataFrame: The enriched DataFrame.
    """

    # --- 1. Add/Update date_scraped ---
    current_timestamp = pd.Timestamp.now().normalize()
    if 'date_scraped' not in df.columns:
        # Initialize as datetime type with NaT values if column doesn't exist
        df['date_scraped'] = pd.Series(pd.NaT, index=df.index, dtype='datetime64[ns]')
    else:
        # Ensure it's datetime type, coercing errors if it exists but isn't datetime
        df['date_scraped'] = pd.to_datetime(df['date_scraped'], errors='coerce')

    # Now fill NaT values with current_timestamp
    df['date_scraped'] = df['date_scraped'].fillna(current_timestamp)

    # --- 2. Normalise make & model ---
    for col in ["make", "model"]:
        if col in df.columns:
            df[col] = (
                df[col]
                .astype(str)
                .str.lower()
                .str.replace(r"[^a-z0-9]+", "", regex=True)
            )

    # --- Remove 'https://' or 'http://' and 'www.' from href ---
    if 'href' in df.columns:
        df['href'] = df['href'].astype(str).str.replace(r'^(https?://)?(www\.)?', '', regex=True)

    # --- 3. Ensure year is numeric ---
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

    # --- 4. Calculate age ---
    if 'year' in df.columns:
        df['age'] = 2026 - df['year']

    # --- 5. Assign generation manually (no merge, no year_start/year_end contamination) ---
    df["gen"] = pd.NA

    for idx, row in gen_lookup.iterrows():
        mask = (
            (df["make"] == row["make"]) &
            (df["model"] == row["model"]) &
            (df["year"].between(row["year_start"], row["year_end"], inclusive="both"))
        )
        df.loc[mask, "gen"] = row["gen"]

    df["gen"] = df["gen"].astype("Int64")

    # --- 6. Create model_gen ---
    df["model_gen"] = df.apply(
        lambda r: f"{r['model']}_{r['gen']}" if pd.notna(r["gen"]) else None,
        axis=1
    )

    return df

## remove_bad_listings

In [42]:
def remove_bad_listings(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies filters to remove bad or undesirable listings from the DataFrame.
    This function is intended to be called after initial cleaning and data type conversions.

    Args:
        df (pd.DataFrame): The DataFrame to filter, expected to have 'year', 'listed_price', and 'odometer' columns.

    Returns:
        pd.DataFrame: The filtered DataFrame.
    """
    df_filtered = df.copy()

    # Price filters as specified by the user
    if 'listed_price' in df_filtered.columns:
        # Ensure listed_price is numeric for comparison
        df_filtered['listed_price'] = pd.to_numeric(df_filtered['listed_price'], errors='coerce')
        df_filtered = df_filtered[df_filtered["listed_price"] != 12345]
        df_filtered = df_filtered[df_filtered["listed_price"] > 3000]

    # Calculate age temporarily for the odometer filter if 'year' is available
    # Assuming 2026 is the reference year for age calculation based on other parts of the notebook
    if 'year' in df_filtered.columns:
        df_filtered['year'] = pd.to_numeric(df_filtered['year'], errors='coerce') # Ensure year is numeric
        temp_age = 2026 - df_filtered['year']
    else:
        temp_age = pd.Series(pd.NA, index=df_filtered.index) # Create a Series of NA for consistent operations

    # Odometer filter: odometer > 2 * age
    if 'odometer' in df_filtered.columns:
        # Ensure odometer is numeric
        df_filtered['odometer'] = pd.to_numeric(df_filtered['odometer'], errors='coerce')

        # Create a mask for rows where both odometer and temp_age are valid for comparison
        mask_valid_comparison = df_filtered['odometer'].notna() & temp_age.notna()

        # Filter out rows where (odometer is NOT > 2 * age) AND (the comparison is valid)
        # We keep rows where (odometer > 2 * age) OR (the comparison cannot be made due to NA values)
        df_filtered = df_filtered[~((df_filtered['odometer'] <= 2 * temp_age) & mask_valid_comparison)]

    return df_filtered

## compare_new_listings

In [43]:
def compare_new_listings(listings: pd.DataFrame, gen_lookup: pd.DataFrame):
    """
    Processes new listing files, cleans, enriches, and compares them against existing listings.

    Args:
        listings (pd.DataFrame): Existing DataFrame of car listings.
        gen_lookup (pd.DataFrame): Lookup table for car generations.

    Returns:
        Tuple[pd.DataFrame, int, int, int, int]: A tuple containing:
            - enriched_new_listings (pd.DataFrame): DataFrame of newly processed and enriched listings.
            - unq_new (int): Total count of truly new unique listings.
            - unq_updated (int): Total count of updated unique listings.
            - unq_unchanged (int): Total count of unchanged unique listings.
            - unq_tot (int): Total count of all unique listings processed from new files.
    """
    # Sets to track unique hrefs across all processed files
    unique_new_hrefs = set()
    unique_updated_hrefs = set()
    unique_unchanged_hrefs = set()
    unique_total_hrefs = set()

    enriched_new_listings = pd.DataFrame()

    # Dynamically find new CSV files
    cs_files = glob.glob('/content/carsales*.csv')
    fb_files = glob.glob('/content/facebook*.csv')

    for file_path in cs_files + fb_files:
        df_raw = pd.read_csv(file_path)
        df_cleaned = None

        if 'carsales' in os.path.basename(file_path):
            df_cleaned = clean_cs(df_raw, save_raw=False)
        elif 'facebook' in os.path.basename(file_path):
            df_cleaned = clean_fb(df_raw, save_raw=False)
        else:
            print(f"Unknown file type: {file_path}")
            continue

        if df_cleaned is not None and not df_cleaned.empty:
            unique_total_hrefs.update(df_cleaned['href'].tolist())

        # Checking how many new, updated, unchanged listings
        df_comparison = pd.merge(
            df_cleaned,
            listings,
            on='href',
            how='left',
            suffixes=('_new', '_existing')
        )

        # Identify new listings
        new_listings_df = df_comparison[df_comparison['listed_price_existing'].isnull()]
        n_new = len(new_listings_df)
        if not new_listings_df.empty:
            unique_new_hrefs.update(new_listings_df['href'].tolist())

        # Identify matched listings
        matched_listings_df = df_comparison[df_comparison['listed_price_existing'].notnull()]

        # From matched_listings, identify updated listings
        updated_listings_df = matched_listings_df[
            matched_listings_df['listed_price_new'] != matched_listings_df['listed_price_existing']
        ]
        n_updated = len(updated_listings_df)
        if not updated_listings_df.empty:
            unique_updated_hrefs.update(updated_listings_df['href'].tolist())

        # From matched_listings, identify unchanged listings
        unchanged_listings_df = matched_listings_df[
            matched_listings_df['listed_price_new'] == matched_listings_df['listed_price_existing']
        ]
        n_unchanged = len(unchanged_listings_df)
        if not unchanged_listings_df.empty:
            unique_unchanged_hrefs.update(unchanged_listings_df['href'].tolist())

        # Calculate total listings for the current file
        n_total_listings = len(df_cleaned)

        # Print the comparison result for the current file
        print(f"{file_path}    \t {n_new=}   \t {n_updated=} \t {n_unchanged=} \t Tot {n_total_listings}")

        if df_cleaned is not None:
            df_enriched = enrich_df(df_cleaned, gen_lookup)
            enriched_new_listings = pd.concat([enriched_new_listings, df_enriched], ignore_index=True)

    # Calculate unique total counts at the end
    unq_new = len(unique_new_hrefs)
    unq_updated = len(unique_updated_hrefs)
    unq_unchanged = len(unique_unchanged_hrefs)
    unq_tot = len(unique_total_hrefs)

    print(f"\t \t \t \t {unq_new=} \t {unq_updated=}\t {unq_unchanged=} {unq_tot=}")

    # Check for missing values in enriched_new_listings after concatenation
    if not enriched_new_listings.empty:
        for col in ['model_gen', 'age', 'odometer']:
            if col in enriched_new_listings.columns and enriched_new_listings[col].isna().any():
                missing_count = enriched_new_listings[col].isna().sum()
                print(f"WARNING: Column '{col}' in enriched_new_listings has {missing_count} missing values.")


    return enriched_new_listings

## integrate_listings

In [44]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, List
import glob # Import glob for file pattern matching

def integrate_listings(listings_df: pd.DataFrame, gen_lookup: pd.DataFrame) -> pd.DataFrame:
    """
    Integrates new car listings from '/content/carsales*.csv' and '/content/facebook*.csv' files into an existing listings DataFrame.

    Args:
        listings_df (pd.DataFrame): The existing DataFrame of car listings.
        gen_lookup (pd.DataFrame): The lookup table for car generations.

    Returns:
        pd.DataFrame: A new DataFrame (`listings_1`) with integrated, cleaned, and enriched listings,
                      with existing listings handled by keeping the most recent entry.
    """
    processed_dfs = []

    # Dynamically find new CSV files
    cs_files = glob.glob('/content/carsales*.csv')
    fb_files = glob.glob('/content/facebook*.csv')
    new_file_paths = cs_files + fb_files

    for file_path in new_file_paths:
        df_raw = pd.read_csv(file_path)
        df_cleaned = None

        if 'carsales' in os.path.basename(file_path):
            df_cleaned = clean_cs(df_raw, save_raw=False)
        elif 'facebook' in os.path.basename(file_path):
            df_cleaned = clean_fb(df_raw, save_raw=False)
        else:
            print(f"Unknown file type: {file_path}")
            continue

        if df_cleaned is not None:
            df_enriched = enrich_df(df_cleaned, gen_lookup)
            processed_dfs.append(df_enriched)

    if processed_dfs:
        new_listings_df = pd.concat(processed_dfs, ignore_index=True)

        # Define all possible columns that might exist in either DataFrame
        # Get columns from existing listings and new listings, handling potential differences
        all_cols = list(set(listings_df.columns) | set(new_listings_df.columns))

        # Reindex both DataFrames to ensure they have the same columns
        listings_aligned = listings_df.reindex(columns=all_cols, fill_value=pd.NA)
        new_listings_aligned = new_listings_df.reindex(columns=all_cols, fill_value=pd.NA)

        # Ensure 'date_scraped' is in datetime format for proper sorting
        listings_aligned['date_scraped'] = pd.to_datetime(listings_aligned['date_scraped'], errors='coerce')
        new_listings_aligned['date_scraped'] = pd.to_datetime(new_listings_aligned['date_scraped'], errors='coerce')

        # Explicitly cast dtypes of new_listings_aligned to match listings_aligned for common columns
        # This helps prevent FutureWarning and ensures consistent types across the concatenated DataFrame
        for col in all_cols:
            if col in listings_aligned.columns and col in new_listings_aligned.columns:
                if listings_aligned[col].dtype != new_listings_aligned[col].dtype:
                    try:
                        if pd.api.types.is_numeric_dtype(listings_aligned[col]):
                            if str(listings_aligned[col].dtype) == 'Int64':
                                new_listings_aligned[col] = new_listings_aligned[col].astype('Int64')
                            else:
                                new_listings_aligned[col] = pd.to_numeric(new_listings_aligned[col], errors='coerce').astype(listings_aligned[col].dtype)
                        else:
                            new_listings_aligned[col] = new_listings_aligned[col].astype(listings_aligned[col].dtype)
                    except (TypeError, ValueError):
                        pass # Keep original dtype if casting causes error

        # Concatenate the aligned Dataframes
        listings_1 = pd.concat([listings_aligned, new_listings_aligned], ignore_index=True)
    else:
        print("No new listings")
        return listings_df # Return the original listings_df if no new listings were processed


    # Sort by href, then listed_price (lowest first), then date_scraped (most recent first), then drop duplicates keeping the first
    listings_1 = listings_1.sort_values(by=['href', 'listed_price', 'date_scraped'], ascending=[True, True, True])
    listings_1 = listings_1.drop_duplicates(subset=['href'], keep='first')
    listings_1 = remove_bad_listings(listings_1)

    # Ensure 'gen' column is Int64 after all operations
    listings_1['gen'] = listings_1['gen'].astype('Int64')

    print(f"Final DataFrame has {len(listings_1)} unique listings after merging and de-duplication.")
    return listings_1

## allocate_listings

In [45]:
import pandas as pd
from datetime import date
from typing import Optional, List

def allocate_listings(listings_lr: pd.DataFrame, notes: pd.DataFrame, allocations: pd.DataFrame, clients_to_process: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Allocates car listings based on universal standards, client-specific criteria, and notes/allocation history.

    Business Logic for allocate_listings function:

    This function identifies and allocates suitable car listings to clients based on a multi-step filtering process:

    1.  **Client Identification:** Determines which clients to process based on `clients_to_process` or processes all global clients if none are specified.
    2.  **Universal Filters:** Applies initial filters to `listings_lr`:
        *   `odometer` must be greater than 4 times the `age` of the car.
        *   `listed_price` must be less than 95% of the `market_value`.
    3.  **Client-Specific Allocation:** Iterates through each active client and applies their specific criteria:
        *   `max_listing_price`: Listing price must be at or below the client's maximum price.
        *   `max_odometer`: Odometer reading must be at or below the client's maximum odometer.
        *   `model_gens_allowed`: The `model_gen` of the listing must match one of the client's allowed model generations (allowing for partial matches).
    4.  **Duplicate Allocation Prevention:** New allocation records are created. Before adding them to the `allocations` DataFrame, the function checks if an (href, client) pair already exists in the historical `allocations` to prevent re-allocating an already processed listing.
    5.  **Output:** Returns an updated `allocations` DataFrame containing all previous allocations plus any newly proposed and unique allocations.

    Args:
        listings_lr (pd.DataFrame): The DataFrame of car listings with regression results (market_value, excess_value).
        notes (pd.DataFrame): DataFrame containing historical notes and statuses for listings.
        allocations (pd.DataFrame): DataFrame containing historical allocation decisions.
        clients_to_process (Optional[List[str]]): List of client names to process. If None, all global clients are processed.

    Returns:
        pd.DataFrame: An updated allocations DataFrame containing newly proposed allocations.
    """

    global clients # Access the global list of client configuration dictionaries

    # Determine which clients to actually process
    effective_clients_info = []
    if clients_to_process is None:
        effective_clients_info = clients # Process all clients
    else:
        # Filter global clients to get the dictionaries for specified client names
        effective_clients_info = [c_info for c_info in clients if c_info['client'] in clients_to_process]

    if not effective_clients_info:
        print("No clients specified or found to process for allocations.")
        return allocations

    # Make copies to avoid modifying original DataFrames
    listings_filtered = listings_lr.copy()
    notes_filtered = notes.copy() # notes_filtered is still used to filter out listings, but will not be changed here
    current_allocations = allocations.copy()

    # 1. Apply Universal Filters
    listings_filtered = listings_filtered[
        (listings_filtered['odometer'] > 4 * listings_filtered['age']) &
        (listings_filtered['listed_price'] < 0.95 * listings_filtered['market_value'])
    ]

    if listings_filtered.empty:
        print("No listings remain after universal filters.")
        return allocations

    # Ensure 'excess_value' is present for sorting
    if 'excess_value' not in listings_filtered.columns:
        print("Error: 'excess_value' column is missing for sorting.")
        return allocations

    new_allocation_records = []
    current_timestamp = pd.Timestamp.now()

    # 2. Iterate through each specified client for allocations
    for client_info in effective_clients_info:
        current_client_name = client_info['client']
        max_price = client_info['max_listing_price']
        max_odometer = client_info['max_odometer']
        model_gens_allowed = client_info['model_gens']

        # Client-specific criteria
        price_cond = listings_filtered['listed_price'] <= max_price
        odometer_cond = listings_filtered['odometer'] <= max_odometer

        # Model generation condition (using str.startswith for broader matching)
        model_gen_cond = pd.Series(False, index=listings_filtered.index)
        if 'model_gen' in listings_filtered.columns and model_gens_allowed:
            for allowed_gen_pattern in model_gens_allowed:
                model_gen_cond = model_gen_cond | (
                    listings_filtered['model_gen'].astype(str).str.startswith(allowed_gen_pattern)
                )

        client_eligible_listings = listings_filtered[
            price_cond & odometer_cond & model_gen_cond
        ].copy()

        if not client_eligible_listings.empty:
            for _, listing_row in client_eligible_listings.iterrows():
                href = listing_row['href']
                new_allocation_records.append({
                    'href': href,
                    'client': current_client_name,
                    'allocation': True,
                    'timestamp': current_timestamp
                })

    if new_allocation_records:
        new_allocations_df = pd.DataFrame(new_allocation_records)
        new_allocations_df['timestamp'] = pd.to_datetime(new_allocations_df['timestamp'])
        new_allocations_df['allocation'] = new_allocations_df['allocation'].astype('boolean')


        # Filter out new allocations that are already present in the existing 'allocations' DataFrame
        existing_allocation_keys = allocations[['href', 'client']].drop_duplicates()
        merged_df = pd.merge(
            new_allocations_df,
            existing_allocation_keys,
            on=['href', 'client'],
            how='left',
            indicator=True
        )
        truly_new_allocations = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

        # 3. Concatenate the truly new allocation records with the existing allocations DataFrame
        allocations = pd.concat([allocations, truly_new_allocations], ignore_index=True)
        allocations['allocation'] = allocations['allocation'].astype('boolean')
        print(f"Added {len(truly_new_allocations)} new allocation entries.")
    else:
        print("No new allocations found based on current criteria.")

    return allocations

##get_best_listings

In [89]:
def get_best_listings(listings_lr: pd.DataFrame, allocations: pd.DataFrame, notes: pd.DataFrame, clients: List[Dict], newer_than_date: Optional[str] = None, n_top: int = 10) -> List[str]:
    """
    Identifies the best car listings for each client based on allocation, status, and excess value.

    Args:
        listings_lr (pd.DataFrame): DataFrame of car listings with regression results (e.g., market_value, excess_value).
        allocations (pd.DataFrame): DataFrame containing historical allocation decisions.
        notes (pd.DataFrame): DataFrame containing historical notes and statuses for listings.
        clients (List[Dict]): List of client configuration dictionaries.
        newer_than_date (Optional[str]): If provided, filter listings scraped on or after this date. Format 'YYYY-MM-DD'.
        n_top (int): Number of top listings to select for each client.

    Returns:
        List[str]: A list of unique hrefs representing the best listings across all clients.
    """

    # 1. Make copies of the input DataFrames
    df = listings_lr.copy()
    allocations_copy = allocations.copy()
    notes_copy = notes.copy()

    # 2. Convert date columns to datetime objects
    df['date_scraped'] = pd.to_datetime(df['date_scraped'], errors='coerce')
    notes_copy['timestamp'] = pd.to_datetime(notes_copy['timestamp'], errors='coerce')

    # 3. Process notes_copy to determine the latest status for each unique href
    # Sort notes by href and timestamp (descending) to get the latest status per href
    notes_copy_sorted = notes_copy.sort_values(by=['href', 'timestamp'], ascending=[True, False])
    # Drop duplicates, keeping the first (which will be the latest status for each href)
    latest_notes_status = notes_copy_sorted.drop_duplicates(subset=['href'], keep='first')
    latest_notes_status = latest_notes_status[['href', 'status']].rename(columns={'status': 'last_status'})

    # 4. Merge df with this last_status information
    df = pd.merge(df, latest_notes_status, on='href', how='left')

    # Create client allocation flags
    for client_info in clients:
        client_name = client_info['client']
        alloc_mask = (allocations_copy['client'] == client_name) & (allocations_copy['allocation'] == True)
        allocated_hrefs = allocations_copy[alloc_mask]['href'].unique()
        df[f'client_{client_name}'] = df['href'].isin(allocated_hrefs)

    # 5. Implement date filtering
    if newer_than_date:
        newer_than_date_dt = pd.to_datetime(newer_than_date)
        df = df[df['date_scraped'] >= newer_than_date_dt]
    else:
        # If no specific date is given, filter for the latest date_scraped
        if not df.empty:
            latest_scraped_date = df['date_scraped'].max()
            df = df[df['date_scraped'] == latest_scraped_date]

    # 6. Further filter df to keep only listings where last_status is either None (missing) or 'seen'
    # Convert 'last_status' to string to handle both NaN and actual string values consistently
    df = df[df['last_status'].isna() | (df['last_status'] == 'seen')]

    # 7. Ensure the excess_value column in df is numeric and drop any rows where it is missing
    df['excess_value'] = pd.to_numeric(df['excess_value'], errors='coerce')
    df.dropna(subset=['excess_value'], inplace=True)

    if df.empty:
        print("No listings remain after initial filtering.")
        return []

    all_best_hrefs = []

    # 8. Iterate through each client and select top listings
    for client_info in clients:
        client_name = client_info['client']
        client_col = f'client_{client_name}'

        # Filter for listings allocated to the current client
        client_df = df[df[client_col] == True].copy()

        if not client_df.empty:
            # Sort by excess_value in descending order and get the top n_top hrefs
            top_listings_for_client = client_df.sort_values(by='excess_value', ascending=False).head(n_top)
            all_best_hrefs.extend(top_listings_for_client['href'].tolist())

    # 9. Convert all_best_hrefs to a Series, remove duplicate hrefs, and return as a list
    unique_best_hrefs = pd.Series(all_best_hrefs).drop_duplicates().tolist()

    return unique_best_hrefs

## write_yaml

In [46]:
import pandas as pd
import yaml
import os
from datetime import datetime, date
import numpy as np
from google.colab import files # Import files for download functionality
from typing import List, Tuple

def write_yaml(listings_to_print: List[str], listings_lr: pd.DataFrame, allocations: pd.DataFrame, notes_df: pd.DataFrame, out_file: Optional[str] = None, download: bool = True) -> pd.DataFrame:
    """
    Consolidates listing data from shortlist and notes DataFrames and saves it to a YAML file.
    Dynamically adds client eligibility flags from shortlist columns.
    If a listing has no status, it adds a 'seen' status with author 'beep_boop'.

    Args:
        listings_to_print (List[str]): List of hrefs to include in the YAML output.
        listings_lr (pd.DataFrame): DataFrame of car listings with regression results.
        allocations (pd.DataFrame): DataFrame containing historical allocation decisions.
        notes_df (pd.DataFrame): DataFrame containing notes associated with listings.
        out_file (Optional[str]): The filename to save the YAML to. If None, defaults to 'shortlist.yaml'.
        download (bool): If True, the generated YAML file will be prompted for download.

    Returns:
        pd.DataFrame: The updated `notes_df` DataFrame, potentially with new 'seen' entries.
    """

    # Helper function to convert pandas-specific types to standard Python equivalents
    def to_python_type(value):
        if pd.isna(value):
            return None
        if isinstance(value, pd.Timestamp):
            return value.to_pydatetime() # Convert pandas Timestamp to datetime object
        if isinstance(value, (pd.Int64Dtype, np.int64)):
            return int(value)
        if isinstance(value, (pd.Float64Dtype, np.float64)):
            return float(value)
        if isinstance(value, (date, datetime)): # Use datetime.date and datetime
            return value
        return value

    # Prepare notes_df (make a copy to ensure any internal modifications are to this copy)
    current_notes_df = notes_df.copy()
    current_notes_df['timestamp'] = pd.to_datetime(current_notes_df['timestamp'], errors='coerce')
    current_notes_df.dropna(subset=['timestamp'], inplace=True)

    # Filter listings_lr to include only the listings specified in listings_to_print
    # Then sort by 'excess_value' in descending order
    current_shortlist = listings_lr[listings_lr['href'].isin(listings_to_print)].copy()
    if 'excess_value' in current_shortlist.columns:
        current_shortlist = current_shortlist.sort_values(by='excess_value', ascending=False)
    else:
        print("Warning: 'excess_value' column not found, cannot sort by it.")

    all_listings_data = []

    for idx, row in current_shortlist.iterrows():
        href = row['href']
        initial_status_for_listing = None # This will hold the status *before* any 'seen' logic
        current_notes_for_listing = [] # Notes associated *before* any 'seen' logic

        matching_notes = current_notes_df[current_notes_df['href'] == href]

        if not matching_notes.empty:
            matching_notes_sorted = matching_notes.sort_values(by='timestamp', ascending=False)
            initial_status_for_listing = to_python_type(matching_notes_sorted.iloc[0]['status'])
            current_notes_for_listing = [to_python_type(n) for n in matching_notes_sorted['note'].tolist() if pd.notna(n)]

        # Create a dictionary named listing_data with the specified order and format
        listing_data = {
            'title': f"{to_python_type(row['year'])}, {to_python_type(row['model_gen'])}, {int(to_python_type(row['odometer']))}k",
            'seller': to_python_type(row['seller']), # Modified to use seller
            'listed_price': to_python_type(row['listed_price']),
            'excess_value': int(to_python_type(row['excess_value'])), # Convert to int here
            'href': to_python_type(row['href'])
        }

        # Dynamically add client eligibility from the allocations DataFrame
        eligible_clients = allocations[
            (allocations['href'] == href) & (allocations['allocation'] == True)
        ]['client'].unique().tolist()
        listing_data['clients'] = eligible_clients

        # Add status and notes based on initial values
        listing_data['status'] = initial_status_for_listing
        listing_data['notes'] = current_notes_for_listing

        all_listings_data.append(listing_data)

        # Now, if the listing had no status, add 'seen' to current_notes_df for the *next* iteration
        if initial_status_for_listing is None:
            current_notes_df = add_note(current_notes_df, 'beep_boop', href, status='seen')

    output_filename = out_file if out_file is not None else 'shortlist.yaml'
    # Initialize yaml_content list
    yaml_content = []
    for listing in all_listings_data:
        yaml_content.append('---\n') # Add separator before each listing
        yaml_content.append(yaml.dump(listing, allow_unicode=True, sort_keys=False))
        yaml_content.append('\n') # Add an extra newline after each dumped listing for readability

    yaml_content_str = "".join(yaml_content)
    with open(output_filename, 'w') as f:
        f.write(yaml_content_str)

    if download:
        files.download(output_filename)
        print(f"The YAML file '{output_filename}' has been generated and prompted for download with {len(all_listings_data)} listings.")
    else:
        print(f"The YAML file '{output_filename}' has been generated with {len(all_listings_data)} listings (download skipped).")

    return current_notes_df # Return the potentially updated notes_df


## apply_regression

In [47]:
def apply_regression(df: pd.DataFrame) -> (pd.DataFrame, pd.Series):
    """
    Applies Huber regression to the input DataFrame to predict car prices.

    Args:
        df (pd.DataFrame): The input DataFrame containing car listings.

    Returns:
        (pd.DataFrame, pd.Series): A tuple containing:
            - The DataFrame with 'market_value' and 'excess_value' columns added.
            - A Series of unscaled regression coefficients.
    """
    listings_lr = df.copy()

    # 1) Coerce numeric types
    listings_lr['year'] = pd.to_numeric(listings_lr['year'], errors='coerce')
    listings_lr['odometer'] = pd.to_numeric(listings_lr['odometer'], errors='coerce')
    listings_lr["listed_price"] = pd.to_numeric(listings_lr["listed_price"], errors='coerce')

    # 2) One-hot encode model_gen
    listings_lr["model_gen"] = listings_lr["model_gen"].astype(str)
    dummies = pd.get_dummies(listings_lr["model_gen"], prefix="mg_", prefix_sep="")

    # remove base category "civic_9" if it exists
    base_col = "mg_civic_9" # Corrected base column name to match dummy format
    if base_col in dummies.columns:
        dummies = dummies.drop(columns=[base_col])

    listings_lr = pd.concat([listings_lr, dummies], axis=1)

    # 3) Build X, y & keep mask
    predictor_cols = ['age', 'odometer'] + list(dummies.columns)
    X = listings_lr[predictor_cols].astype(float)
    y = listings_lr["listed_price"].astype(float)

    keep = X.notna().all(axis=1) & y.notna()

    X_keep = X.loc[keep]
    y_keep = y.loc[keep]

    # 4) Scale predictors
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_keep)

    # 5) Fit Huber Regression
    huber = HuberRegressor(max_iter=1000, epsilon=1.5)
    huber.fit(X_scaled, y_keep)

    # 6) Predict & store results
    pred = huber.predict(X_scaled)
    listings_lr.loc[keep, "market_value"] = pred
    listings_lr.loc[keep, "excess_value"] = pred - listings_lr.loc[keep, "listed_price"]

    # 7) Recover coefficients on the original (unscaled) feature scale
    coef_scaled = huber.coef_
    mu = scaler.mean_
    sigma = scaler.scale_

    original_intercept = huber.intercept_ - np.sum(coef_scaled * (mu / sigma))
    original_coefs = coef_scaled / sigma

    coef_unscaled = pd.Series(
        np.concatenate([[original_intercept], original_coefs]),
        index=["intercept"] + predictor_cols
    )

    listings_lr = listings_lr.loc[:, ~listings_lr.columns.str.startswith("mg_")]

    return listings_lr, coef_unscaled

##add_note

In [48]:
import pandas as pd
from datetime import datetime

def add_note(notes_df: pd.DataFrame, author: str, href: str, status: str = None, note: str = None) -> pd.DataFrame:
    """
    Adds a new note or updates a status for a specific href in the `notes_df` DataFrame.
    Checks if the status/note already exists and adds it only if new.

    Args:
        notes_df (pd.DataFrame): The DataFrame containing historical notes and statuses.
        author (str): The author of the note/status update.
        href (str): The href of the listing to update.
        status (str, optional): The new status. Defaults to None.
        note (str, optional): The new note. Defaults to None.

    Returns:
        pd.DataFrame: The updated `notes_df` DataFrame.
    """
    # Make a copy to avoid modifying the original DataFrame directly
    current_notes = notes_df.copy()

    # Ensure 'timestamp' column is datetime for comparison
    current_notes['timestamp'] = pd.to_datetime(current_notes['timestamp'], errors='coerce')

    current_timestamp = pd.Timestamp.now(tz='UTC')
    new_entries = []
    href = href.replace('https://', '').replace('http://', '').replace('www.', '') # Clean href

    # Filter existing records for the current href
    existing_notes_for_href = current_notes[current_notes['href'] == href].copy()
    existing_notes_for_href.sort_values(by='timestamp', ascending=False, inplace=True)

    latest_status_in_notes = None
    if not existing_notes_for_href.empty:
        latest_status_in_notes = existing_notes_for_href.iloc[0]['status']

    existing_note_texts = set(existing_notes_for_href['note'].dropna().tolist())

    # Check and add new status if provided and different
    if status is not None and status != latest_status_in_notes:
        new_entries.append({
            'href': href,
            'timestamp': current_timestamp,
            'author': author,
            'status': status,
            'note': pd.NA
        })

    # Check and add new note if provided and not already existing
    if note is not None and note not in existing_note_texts:
        new_entries.append({
            'href': href,
            'timestamp': current_timestamp,
            'author': author,
            'status': pd.NA, # Status is not changing, just adding a note
            'note': note
        })

    if new_entries:
        new_notes_df = pd.DataFrame(new_entries)
        new_notes_df['timestamp'] = pd.to_datetime(new_notes_df['timestamp'])
        # Ensure consistent column order and dtypes. If current_notes is empty, ensure new_notes_df has the correct columns.
        if current_notes.empty:
            # Define columns if current_notes is empty, assuming standard notes DataFrame columns
            # This is a fallback and might need adjustment if notes columns vary significantly.
            new_notes_df = new_notes_df.reindex(columns=['href', 'timestamp', 'author', 'status', 'note'])
        else:
            new_notes_df = new_notes_df.reindex(columns=current_notes.columns)

        current_notes = pd.concat([current_notes, new_notes_df], ignore_index=True)

        # --- User's requested logic for YAML export ---
        if status in ["message_left", "follow_up"]:

            # Get the latest status for each href in the *updated* notes DataFrame
            latest_notes_status = current_notes.sort_values(by='timestamp', ascending=False).drop_duplicates(subset=['href'], keep='first')

            # Filter for 'message_left' or 'follow_up' statuses
            hrefs_for_yaml = latest_notes_status[
                latest_notes_status['status'].isin(["message_left", "follow_up"])
            ]['href'].unique().tolist()

            if hrefs_for_yaml:
                print(f"Found {len(hrefs_for_yaml)} listings with current status 'message_left' or 'follow_up'.")
                # Access global variables for write_yaml
                # These are assumed to be defined in the global scope of the notebook.
                from __main__ import listings_lr, allocations, write_yaml

                output_path = "/content/drive/Shareddrives/market_analysis_v2/message_left.yaml"
                write_yaml(
                    listings_to_print=hrefs_for_yaml,
                    listings_lr=listings_lr,
                    allocations=allocations,
                    notes_df=current_notes, # Pass the updated notes DataFrame
                    out_file=output_path,
                    download=False # Do not prompt for download in a background task
                )
        # --- End of user's requested logic ---

    return current_notes

## update_notes

In [49]:
def update_notes(notes_df: pd.DataFrame, update_yaml: list, author: str) -> pd.DataFrame:
    """
    Processes shortlist data and updates the `notes_df` DataFrame
    with new status changes and notes using the `add_note` function.

    Args:
        notes_df (pd.DataFrame): The DataFrame containing historical notes and statuses.
        update_yaml (list): A list of dictionaries parsed from shortlist-edited.yaml.
        author (str): The author of the notes.

    Returns:
        pd.DataFrame: The updated `notes_df` DataFrame.
    """
    updated_notes_df = notes_df.copy()
    initial_notes_count = len(updated_notes_df)

    for listing in update_yaml:
        current_href = listing['href']
        current_status_from_yaml = listing['status']
        notes_list_from_yaml = listing['notes'] if listing['notes'] is not None else []

        # Update status using add_note
        updated_notes_df = add_note(updated_notes_df, author, current_href, status=current_status_from_yaml)

        # Update individual notes using add_note
        for note_text in notes_list_from_yaml:
            if pd.notna(note_text): # Ensure note_text is not NaN before passing
                updated_notes_df = add_note(updated_notes_df, author, current_href, note=note_text)

    final_notes_count = len(updated_notes_df)
    added_entries_count = final_notes_count - initial_notes_count

    if added_entries_count > 0:
        print(f"Total {added_entries_count} new entries added to notes DataFrame through update_notes.")
    else:
        print("No new notes or status updates to add via update_notes.")

    return updated_notes_df

## update_seller

In [50]:
import re
import pandas as pd

def update_seller(listings_df: pd.DataFrame, update_yaml: list) -> pd.DataFrame:
    """
    Updates the 'seller' column in listings_df based on information from update_yaml.

    Args:
        update_yaml (list): A list of dictionaries parsed from a YAML file, potentially containing
                                   'href' and 'seller' information.
        listings_df (pd.DataFrame): The DataFrame of car listings to be updated.

    Returns:
        pd.DataFrame: The modified listings_df with updated 'seller' information.
    """

    # Make a copy to avoid modifying the original DataFrame directly
    updated_listings_df = listings_df.copy()

    # Ensure 'seller' column is of object (string) type to accommodate string assignments
    # This prevents FutureWarning when assigning strings to a float64 column that might contain NaNs
    if 'seller' in updated_listings_df.columns and updated_listings_df['seller'].dtype != object:
        updated_listings_df['seller'] = updated_listings_df['seller'].astype("string")

    for item in update_yaml:
        seller = item.get('seller')
        href = item.get('href')

        if seller is not None and href is not None:
            # Clean the href string using re.sub for regex replacement
            cleaned_href = re.sub(r'^(https?://)?(www\.)?', '', str(href))

            # Update the 'seller' column for matching 'href' entries
            updated_listings_df.loc[updated_listings_df['href'] == cleaned_href, 'seller'] = seller

    return updated_listings_df

print("Defined `update_seller` function.")

Defined `update_seller` function.


## update_allocations

In [51]:
import pandas as pd

def update_allocations(allocations: pd.DataFrame, update_yaml: list) -> pd.DataFrame:
    """
    Updates the allocations DataFrame based on information from the update_yaml.

    Business Logic for update_allocations function:

    This function processes updates from a YAML file (typically `shortlist-edited.yaml`) to modify
    the `allocations` DataFrame. The key business rules applied are:

    1.  **De-allocation of Unspecified Clients:** For listings present in the `update_yaml`,
        if a client was previously allocated to that listing but is *not* present
        in the `clients` list for that listing in the `update_yaml`, their `allocation` status in the
        `allocations` DataFrame will be set to `False`. This ensures that clients no longer interested
        in a particular listing (as indicated by the YAML) are de-allocated.
    2.  **Preservation of Other Allocations:** Allocations for listings not mentioned in the `update_yaml`,
        or allocations for clients still specified in the `update_yaml`, remain unchanged.

    Args:
        allocations (pd.DataFrame): The existing DataFrame of allocations.
        update_yaml (list): A list of dictionaries parsed from shortlist-edited.yaml.

    Returns:
        pd.DataFrame: The modified allocations DataFrame.
    """
    # 1. Create a copy of the input allocations DataFrame named allocations_copy.
    allocations_copy = allocations.copy()

    # 2. Iterate through update_yaml to create a dictionary update_lookup where keys are href values
    # and values are dictionaries containing the status and a list of clients for that href from the YAML.
    update_lookup = {}
    for item in update_yaml:
        href = item.get('href')
        status = item.get('status')
        clients_from_yaml = item.get('clients', [])
        # Fix: Ensure clients_from_yaml is always a list, even if 'clients' key has a None value
        if clients_from_yaml is None:
            clients_from_yaml = []
        if href:
            # Clean the href string, consistent with enrich_df and update_seller
            cleaned_href = href.replace('https://', '').replace('http://', '').replace('www.', '')
            update_lookup[cleaned_href] = {
                'status': status,
                'clients': clients_from_yaml
            }

    # All hrefs from update_lookup are considered for de-allocation logic, regardless of status
    hrefs_in_yaml = set(update_lookup.keys())

    # 3. Create a set current_active_allocations containing (href, client) tuples for all entries
    # in the modified allocations_copy where allocation is True.
    current_active_allocations = set(
        allocations_copy[allocations_copy['allocation'] == True]
        [['href', 'client']].apply(tuple, axis=1)
    )

    # 4. Create a set yaml_should_be_active containing (href, client) tuples for all clients
    # associated with hrefs present in update_lookup.
    yaml_should_be_active = set()
    for href in hrefs_in_yaml:
        data = update_lookup[href]
        for client in data['clients']:
            yaml_should_be_active.add((href, client))

    # Filter current_active_allocations to only include hrefs that are in the update_yaml scope
    active_allocations_in_yaml_scope = {
        (h, c) for h, c in current_active_allocations if h in hrefs_in_yaml
    }

    # 5. Determine the set of (href, client) pairs that need to be de-allocated.
    # These are allocations that were active within the scope of hrefs mentioned in YAML,
    # but are not present in the 'clients' list for those hrefs in the YAML.
    to_deallocate = active_allocations_in_yaml_scope - yaml_should_be_active

    # 6. Iterate through the identified (href, client) pairs and set the allocation column to
    # False for those specific entries in allocations_copy.
    for href, client in to_deallocate:
        allocations_copy.loc[
            (allocations_copy['href'] == href) & (allocations_copy['client'] == client),
            'allocation'
        ] = False

    # Ensure the 'allocation' column is boolean type
    allocations_copy['allocation'] = allocations_copy['allocation'].astype('boolean')

    # 7. Return the modified allocations_copy DataFrame.
    print(f"De-allocated {len(to_deallocate)} entries based on YAML updates.")
    return allocations_copy

##email_client

In [52]:
# def email_client(hrefs, listings_lr, coefficients, notes):

# # print rows
# for _, row in best_n.iterrows():
#     print(f"{model_name} ({row['rank']})")
#     print(f"Link: {row['href']}")
#     print(f"Market Value: ${row['predicted_price']:,.0f}")
#     print(f"Listed Price: ${row["listed_price"]:,}")
#     try:
#         print(f"Negotiated Price: ${row['nego_price']:,.0f}")
#     except KeyError as e:
#         pass
#     print(f"Year: {row['year']:.0f}")
#     print(f"Odometer: {row['odometer']:,.0f},000km")
#     print(f"Notes:\n")


# # Produce scatterplot
# # Function to format price axis
# def price_format(x, _):
#     return f'${int(x):,}'

# # Plotting
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# # Compute age
# other_listings['age'] = 2026 - other_listings['year']
# best_n['age'] = 2026 - best_n['year']

# # Scatter (Year vs Price)
# ax1.scatter(other_listings['year'], other_listings["listed_price"],
#             label='Data', color='lightsteelblue', s=20)

# for _, row in best_n.iterrows():
#     ax1.scatter(row['year'], row["listed_price"], s=70, facecolors='none', linewidths=1.2)
#     ax1.text(row['year'], row["listed_price"], str(int(row['rank'])),
#              ha='center', va='center', fontsize=10, fontweight='bold',
#              color='red', alpha=0.7)
#     if not pd.isna(row['nego_price']):
#         ax1.text(row['year'], row['nego_price'], str(int(row['rank'])),
#                  ha='center', va='center', fontsize=10, fontweight='bold',
#                  color='green', alpha=0.7)

# # Regression line (fix odometer at mean)
# year_range = np.linspace(other_listings['year'].min(),
#                          other_listings['year'].max(), 100)

# age_range = 2026 - year_range  # convert back to age for model input
# mean_odometer = other_listings['odometer'].mean()

# X_line = pd.DataFrame({
#     'const': 1,
#     'age': age_range,
#     'odometer': [mean_odometer] * 100
# })

# y_line = model.predict(X_line)

# ax1.plot(year_range, y_line, label='Regression line')

# ax1.xaxis.set_major_locator(MaxNLocator(integer=True))
# ax1.yaxis.set_major_formatter(FuncFormatter(price_format))
# ax1.set_xlabel('Model Year')
# ax1.set_ylabel("listed_price")
# ax1.set_title(f"{model_name} Price vs Year")

# # Scatter (Odometer vs Price)
# ax2.scatter(other_listings['odometer'], other_listings["listed_price"],
#             label='Data', color='lightsteelblue', s=20)

# for _, row in best_n.iterrows():
#     ax2.scatter(row['odometer'], row["listed_price"], s=70, facecolors='none', linewidths=1.2)
#     ax2.text(row['odometer'], row["listed_price"], str(int(row['rank'])),
#              ha='center', va='center', fontsize=10, fontweight='bold',
#              color='red', alpha=0.7)
#     if not pd.isna(row['nego_price']):
#         ax2.text(row['odometer'], row['nego_price'], str(int(row['rank'])),
#                  ha='center', va='center', fontsize=10, fontweight='bold',
#                  color='green', alpha=0.7)

# # Regression line (fix age at mean)
# odometer_range = np.linspace(other_listings['odometer'].min(),
#                              other_listings['odometer'].max(), 100)

# mean_age = other_listings['age'].mean()

# X_line2 = pd.DataFrame({
#     'const': 1,
#     'age': [mean_age] * 100,
#     'odometer': odometer_range
# })

# y_line2 = model.predict(X_line2)
# ax2.plot(odometer_range, y_line2, label='Regression line')

# ax2.yaxis.set_major_formatter(FuncFormatter(price_format))
# ax2.set_xlabel('Odometer (kms)')
# ax2.set_ylabel("listed_price")
# ax2.set_title(f"{model_name} Price vs Mileage")


# # Legend handles
# live_listing_handle = Line2D([], [], marker='o', color='lightsteelblue', linestyle='None', markersize=6, label=f'Listing as of {df1.iloc[0]["date_scraped"]}')
# listed_price_handle = Line2D([], [], marker='o', color='red', linestyle='None',
#                                  markersize=8, label='Listed Price')
# negotiated_price_handle = Line2D([], [], marker='o', color='green', linestyle='None',
#                                  markersize=8, label='Negotiated Price')

# # Apply legend to both subplots
# ax1.legend(handles=[live_listing_handle, listed_price_handle, negotiated_price_handle])
# ax2.legend(handles=[live_listing_handle, listed_price_handle, negotiated_price_handle])


# plt.tight_layout()
# plt.show()

# Setup

In [53]:
clients=[
    {
        "client":"anita_c",
        "max_listing_price":13500,
        "max_odometer":160,
        "model_gens":[
            "3_2",
            "3_3",
            "civic_9",
            "jazz_3",
            "i30_2",
        ]
    },
    {
        "client":"magesh_t",
        "max_listing_price":13500,
        "max_odometer":160,
        "model_gens":[
            "3_3",
            "civic_9",
            "i30_2",
            "corolla_11",
        ]
    },
    {
        "client":"raymon_s",
        "max_listing_price":11000,
        "max_odometer":210,
        "model_gens":[
            "3_2",
            "civic_8",
            "i30_2",
            "city_1",
            "city_2",
            "corolla_10",
            "corolla_11",
        ]
    },
]

In [None]:
import sys
import glob
import os
import re
import yaml
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.ticker import FuncFormatter, MaxNLocator
import numpy as np
from datetime import datetime
from typing import Dict, Optional, List
from google.colab import drive
drive.mount('/content/drive')

pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.0f}'.format)

Mounted at /content/drive


In [62]:
# load dataframes
gen_lookup = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/gen_lookup.csv")
listings = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/listings.csv")
notes = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/notes.csv", index_col=0)
allocations = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/allocations.csv", index_col=0)

In [None]:
statuses = {
    None:"No status saved",
    "seen": "listing has been printed to YAML at least once",
    "rejected": "listing not suitable for any buyer",
    "sold": "sold or on hold",
    "shortlisted": "VA checked listing and looks good",
    "contacted": "Roger has contacted the seller",
    "message_left": "self explanatory",
    "follow_up": "Roger to call seller",
    "inspection": "Inspection booked",
    "deposit": "Deposit left with seller",
    "purchased": "Self explainatory",
    "bad_inspection": "Not recommended after inspection (Roger/Andrew)",
}

# Working

In [63]:
z = compare_new_listings(listings, gen_lookup)

/content/carsales (1).csv    	 n_new=2   	 n_updated=1 	 n_unchanged=5 	 Tot 8
/content/carsales (5).csv    	 n_new=9   	 n_updated=3 	 n_unchanged=10 	 Tot 22
/content/carsales (7).csv    	 n_new=3   	 n_updated=1 	 n_unchanged=18 	 Tot 22
/content/carsales (4).csv    	 n_new=10   	 n_updated=3 	 n_unchanged=9 	 Tot 22
/content/carsales (10).csv    	 n_new=5   	 n_updated=0 	 n_unchanged=17 	 Tot 22
/content/carsales (9).csv    	 n_new=5   	 n_updated=0 	 n_unchanged=17 	 Tot 22
/content/carsales (2).csv    	 n_new=6   	 n_updated=7 	 n_unchanged=9 	 Tot 22
/content/carsales (3).csv    	 n_new=10   	 n_updated=1 	 n_unchanged=11 	 Tot 22
/content/carsales (6).csv    	 n_new=9   	 n_updated=2 	 n_unchanged=11 	 Tot 22
/content/carsales (8).csv    	 n_new=8   	 n_updated=0 	 n_unchanged=14 	 Tot 22
/content/carsales.csv    	 n_new=6   	 n_updated=2 	 n_unchanged=6 	 Tot 14
/content/facebook (4).csv    	 n_new=32   	 n_updated=1 	 n_unchanged=11 	 Tot 44
/content/facebook.csv    	 n_new=

In [64]:
# Add new listings to listings dataframe
listings = integrate_listings(listings, gen_lookup)

Final DataFrame has 1097 unique listings after merging and de-duplication.


In [65]:
listings_lr, coefficients = apply_regression(listings)

In [90]:
allocations = allocate_listings(listings_lr, notes, allocations)
best_listings = get_best_listings(listings_lr, allocations, notes, clients)

Added 0 new allocation entries.


In [92]:
# Call the updated output_shortlist function
notes = write_yaml(best_listings, listings_lr, allocations, notes)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The YAML file 'shortlist.yaml' has been generated and prompted for download with 20 listings.


In [None]:
with open('/content/shortlist-edited.yaml', 'r') as file:
    update_yaml = list(yaml.safe_load_all(file))

print("YAML file 'shortlist-edited.yaml' loaded successfully as 'update_yaml'.")

YAML file 'shortlist-edited.yaml' loaded successfully as 'update_yaml'.


In [None]:
author = "roger"
notes = update_notes(notes, update_yaml, author)

Total 12 new entries added to notes DataFrame through update_notes.


In [None]:
listings = update_seller(listings, update_yaml)

In [None]:
allocations = update_allocations(allocations, update_yaml)

De-allocated 0 entries based on YAML updates.


In [None]:
a = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/listings.csv")

In [None]:
# assume:
# a = old DataFrame
# listings = new DataFrame
# "href" uniquely identifies rows

# 1. align on key
a_idx = a.set_index("href")
l_idx = listings.set_index("href")

# 2. shared rows only
common_idx = a_idx.index.intersection(l_idx.index)
a_common = a_idx.loc[common_idx]
l_common = l_idx.loc[common_idx]

# 3. define change mask ONCE (NaN-safe)
change_mask = (
    a_common.ne(l_common)
    & ~(a_common.isna() & l_common.isna())
)

# 4. new / removed rows
n_new_rows = len(l_idx.index.difference(a_idx.index))
n_removed_rows = len(a_idx.index.difference(l_idx.index))

# 5. changed rows (any column)

print(f"Expected (new - removed rows)={len(listings)-len(a)}")
n_changed_rows = change_mask.any(axis=1).sum()

print(f"{n_new_rows=}")
print(f"{n_removed_rows=}")
print(f"{n_changed_rows=:.0f}")

# 6. row changes per column
row_changes_per_column = change_mask.sum()

print(row_changes_per_column)

Expected (new - removed rows)=229
n_new_rows=230
n_removed_rows=1
n_changed_rows=27
age              1
date_scraped    27
gen              0
listed_price    27
location         1
make             1
model            1
model_gen        1
odometer         1
seller           0
seller_type      0
trim             1
year             1
dtype: Int64


# Roger

In [None]:
# After

new_notes = add_note(
    new_notes,
    "roger",
    "", #href
    status="message_left",
)

NameError: name 'new_notes' is not defined

# Task
The user has approved the plan. I will now proceed to define the `get_best_listings` function, call it, and then confirm that the `best_listings` variable is populated as expected.

```python
def get_best_listings(listings_lr: pd.DataFrame, allocations: pd.DataFrame, notes: pd.DataFrame, clients: List[Dict], newer_than_date: Optional[str] = None, n_top: int = 10) -> List[str]:
    """
    Identifies the best car listings for each client based on allocation, status, and excess value.

    Args:
        listings_lr (pd.DataFrame): DataFrame of car listings with regression results ('excess_value', 'date_scraped').
        allocations (pd.DataFrame): DataFrame containing historical allocation decisions.
        notes (pd.DataFrame): DataFrame containing historical notes and statuses for listings.
        clients (List[Dict]): A list of client dictionaries containing client-specific criteria.
        newer_than_date (Optional[str]): A date string (YYYY-MM-DD) to filter listings scraped
                                         on or after this date. If None, only listings with the
                                         latest 'date_scraped' are considered.
        n_top (int): The number of top listings to select for each client.

    Returns:
        List[str]: A deduplicated list of 'href's of the best listings across all clients.
    """
    # 1. Make copies of the input DataFrames to avoid modifying the originals.
    df = listings_lr.copy()
    allocations_copy = allocations.copy()
    notes_copy = notes.copy()

    # Ensure date columns are in datetime format
    df['date_scraped'] = pd.to_datetime(df['date_scraped'], errors='coerce')
    notes_copy['timestamp'] = pd.to_datetime(notes_copy['timestamp'], errors='coerce')
    if newer_than_date:
        newer_than_date_dt = pd.to_datetime(newer_than_date)

    # 2. Process the notes DataFrame to determine the last_status for each unique href.
    # Sort notes by timestamp to get the latest status for each href
    notes_sorted = notes_copy.sort_values(by='timestamp', ascending=False)
    last_status_per_href = notes_sorted.drop_duplicates(subset='href', keep='first')[['href', 'status']]
    last_status_per_href.rename(columns={'status': 'last_status'}, inplace=True)

    # 3. Merge listings_lr with the last_status information.
    df = pd.merge(df, last_status_per_href, on='href', how='left')

    # Create client-specific allocation columns
    for client_info in clients:
        client_name = client_info['client']
        # Filter allocations for the current client and where allocation is True
        client_allocations = allocations_copy[
            (allocations_copy['client'] == client_name) & (allocations_copy['allocation'] == True)
        ]['href'].drop_duplicates()

        # Create a boolean column indicating if the listing is allocated to this client
        df[f'client_{client_name}'] = df['href'].isin(client_allocations)

    # 4. Apply date filtering:
    if newer_than_date:
        # Filter for listings scraped on or after newer_than_date
        df = df[df['date_scraped'] >= newer_than_date_dt]
    else:
        # If no specific date, filter to include only those listings with the latest date_scraped
        latest_date = df['date_scraped'].max()
        df = df[df['date_scraped'] == latest_date]

    # 5. Further filter the remaining listings to keep only those with no status or last_status="seen".
    df = df[df['last_status'].isna() | (df['last_status'] == 'seen')]

    # Ensure 'excess_value' is present and numeric for sorting
    if 'excess_value' not in df.columns:
        print("Error: 'excess_value' column is missing in the filtered listings.")
        return []
    df['excess_value'] = pd.to_numeric(df['excess_value'], errors='coerce')
    df.dropna(subset=['excess_value'], inplace=True) # Drop rows where excess_value became NaN

    all_best_hrefs = []

    # 6. Iterate through each client and select top N listings.
    for client_info in clients:
        client_name = client_info['client']
        client_allocation_col = f'client_{client_name}'

        # Filter for listings allocated to the current client
        client_eligible_listings = df[df[client_allocation_col]].copy()

        if not client_eligible_listings.empty:
            # Select the top n_top listings based on the highest excess_value
            top_listings_for_client = client_eligible_listings.sort_values(
                by='excess_value', ascending=False
            ).head(n_top)
            all_best_hrefs.extend(top_listings_for_client['href'].tolist())

    # 7. Collect all selected 'href's, deduplicate them, and return the final list.
    return list(pd.Series(all_best_hrefs).drop_duplicates())

# Call the newly defined get_best_listings function
# Assuming listings_lr, allocations, notes, and clients are already defined in the environment.
best_listings = get_best_listings(listings_lr, allocations, notes, clients)

# Final confirmation: Print the number of best listings found and the first few if any
print(f"Found {len(best_listings)} best listings.")
if best_listings:
    print("First 5 best listings (hrefs):")
    for i, href in enumerate(best_listings[:5]):
        print(f"- {href}")
```

## Define get_best_listings function

### Subtask:
Create the `get_best_listings` function that identifies the best car listings for each client based on allocation, status, and excess value.


**Reasoning**:
I need to define the `get_best_listings` function as per the instructions, including handling data types, filtering, merging, and client-specific logic.



In [87]:
def get_best_listings(listings_lr: pd.DataFrame, allocations: pd.DataFrame, notes: pd.DataFrame, clients: List[Dict], newer_than_date: Optional[str] = None, n_top: int = 10) -> List[str]:
    """
    Identifies the best car listings for each client based on allocation, status, and excess value.

    Args:
        listings_lr (pd.DataFrame): DataFrame of car listings with regression results (e.g., market_value, excess_value).
        allocations (pd.DataFrame): DataFrame containing historical allocation decisions.
        notes (pd.DataFrame): DataFrame containing historical notes and statuses for listings.
        clients (List[Dict]): List of client configuration dictionaries.
        newer_than_date (Optional[str]): If provided, filter listings scraped on or after this date. Format 'YYYY-MM-DD'.
        n_top (int): Number of top listings to select for each client.

    Returns:
        List[str]: A list of unique hrefs representing the best listings across all clients.
    """

    # 1. Make copies of the input DataFrames
    df = listings_lr.copy()
    allocations_copy = allocations.copy()
    notes_copy = notes.copy()

    # 2. Convert date columns to datetime objects
    df['date_scraped'] = pd.to_datetime(df['date_scraped'], errors='coerce')
    notes_copy['timestamp'] = pd.to_datetime(notes_copy['timestamp'], errors='coerce')

    # 3. Process notes_copy to determine the latest status for each unique href
    # Sort notes by href and timestamp (descending) to get the latest status per href
    notes_copy_sorted = notes_copy.sort_values(by=['href', 'timestamp'], ascending=[True, False])
    # Drop duplicates, keeping the first (which will be the latest status for each href)
    latest_notes_status = notes_copy_sorted.drop_duplicates(subset=['href'], keep='first')
    latest_notes_status = latest_notes_status[['href', 'status']].rename(columns={'status': 'last_status'})

    # 4. Merge df with this last_status information
    df = pd.merge(df, latest_notes_status, on='href', how='left')

    # Create client allocation flags
    for client_info in clients:
        client_name = client_info['client']
        alloc_mask = (allocations_copy['client'] == client_name) & (allocations_copy['allocation'] == True)
        allocated_hrefs = allocations_copy[alloc_mask]['href'].unique()
        df[f'client_{client_name}'] = df['href'].isin(allocated_hrefs)

    # 5. Implement date filtering
    if newer_than_date:
        newer_than_date_dt = pd.to_datetime(newer_than_date)
        df = df[df['date_scraped'] >= newer_than_date_dt]
    else:
        # If no specific date is given, filter for the latest date_scraped
        if not df.empty:
            latest_scraped_date = df['date_scraped'].max()
            df = df[df['date_scraped'] == latest_scraped_date]

    # 6. Further filter df to keep only listings where last_status is either None (missing) or 'seen'
    # Convert 'last_status' to string to handle both NaN and actual string values consistently
    df = df[df['last_status'].isna() | (df['last_status'] == 'seen')]

    # 7. Ensure the excess_value column in df is numeric and drop any rows where it is missing
    df['excess_value'] = pd.to_numeric(df['excess_value'], errors='coerce')
    df.dropna(subset=['excess_value'], inplace=True)

    if df.empty:
        print("No listings remain after initial filtering.")
        return []

    all_best_hrefs = []

    # 8. Iterate through each client and select top listings
    for client_info in clients:
        client_name = client_info['client']
        client_col = f'client_{client_name}'

        # Filter for listings allocated to the current client
        client_df = df[df[client_col] == True].copy()

        if not client_df.empty:
            # Sort by excess_value in descending order and get the top n_top hrefs
            top_listings_for_client = client_df.sort_values(by='excess_value', ascending=False).head(n_top)
            all_best_hrefs.extend(top_listings_for_client['href'].tolist())

    # 9. Convert all_best_hrefs to a Series, remove duplicate hrefs, and return as a list
    unique_best_hrefs = pd.Series(all_best_hrefs).drop_duplicates().tolist()

    return unique_best_hrefs

**Reasoning**:
The `get_best_listings` function has been defined as per the instructions in the previous step. Now I will execute the function and store the result in `best_listings` and then print the number of listings found for confirmation.



In [88]:
best_listings = get_best_listings(listings_lr, allocations, notes, clients)

print(f"Found {len(best_listings)} best listings.")
if best_listings:
    print("First 5 best listings (hrefs):")
    for i, href in enumerate(best_listings[:5]):
        print(f"- {href}")

Found 20 best listings.
First 5 best listings (hrefs):
- facebook.com/marketplace/item/2415723482183677/
- facebook.com/marketplace/item/1205597424817761/
- carsales.com.au/cars/details/2012-mazda-3-neo-bl-series-2-auto-my13/SSE-AD-19823528/
- facebook.com/marketplace/item/836649155823129/
- facebook.com/marketplace/item/1485701575847486/


## Final Task

### Subtask:
Confirm that the `best_listings` variable has been populated with the appropriate hrefs from the `get_best_listings` function.


## Summary:

### Q&A
The `best_listings` variable has been successfully populated with 20 unique listing `href`s from the `get_best_listings` function, confirming the subtask's requirement.

### Data Analysis Key Findings
*   The `get_best_listings` function was successfully defined and executed, performing the necessary data manipulations, filtering, and selection logic.
*   The `best_listings` variable was populated with 20 unique listing `href`s after processing the input dataframes (`listings_lr`, `allocations`, `notes`) and the `clients` list.
*   The function correctly identified relevant listings based on allocation, status, and excess value, with the first 5 `href`s printed as confirmation.

### Insights or Next Steps
*   The implemented `get_best_listings` function is now ready for use in subsequent stages of the car listing optimization process.
*   Further analysis could involve examining the common characteristics (e.g., brand, model, price range, age) of the identified "best listings" to refine client-specific criteria or market understanding.
