In [54]:
import sys
import glob
import os
import re
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.ticker import FuncFormatter, MaxNLocator
import numpy as np
from datetime import datetime
from typing import Dict, Optional, List
# Colab-only setup: mount Google Drive so the shared-drive paths used in this
# notebook resolve under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Make the shared scripts folder importable so `enrich` can be found.
sys.path.append('/content/drive/Shareddrives/market_analysis_v2/scripts')
# from clean_cs import *
# from clean_fb import *
# from constants_and_helpers import *
# NOTE(review): wildcard import hides which names come from `enrich`. This
# notebook also defines `enrich_df` in a later cell; if `enrich` exports a
# function with that name, whichever definition ran last wins.
from enrich import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Script Functions

In [151]:
import re
import pandas as pd
from typing import Dict, Optional, List

# --- Carsales/General Scrapes (CS) Constants ---
# Inclusive bounds applied to a year extracted from listing text.
YEAR_MIN = 1980
YEAR_MAX = 2035

# Canonical Carsales column names.
ORDER: List[str] = [
    'href',
    'year_make_model',
    'trim',
    'listed_price',
    'transmission',
    'odometer',
    'seller_type',
]

# Patterns used by the column-detection predicates.
YEAR_RE = r'\b(19[89]\d|20[0-3]\d)\b'
PRICE_RE = r'^\s*(?:AU\$|\$)?\s*[\d,]+(?:\.\d{2})?\b'
ODOM_RE = r'^\s*\d+(?:,?\d{3})*(?:K)?\s*km?\s*$'
URL_RE = r'^(?:https?://|www\.)'

# Accepted (lower-cased, stripped) values for the categorical columns.
TX = {'automatic', 'manual'}
SELLER = {'private', 'dealer used'}

# Minimum share of rows that must satisfy a predicate before a raw column
# is accepted as that canonical Carsales field.
THRESH: Dict[str, float] = dict(
    year_make_model=0.50,
    listed_price=0.60,
    transmission=0.80,
    odometer=0.60,
    seller_type=0.70,
)

# --- Facebook Marketplace (FB) Constants ---
# Canonical Facebook Marketplace column names.
FB_ORDER: List[str] = [
    'href',
    'year_make_model',
    'listed_price',
    'odometer',
    'location',
]

# Per-field acceptance thresholds for Facebook Marketplace scrapes.
THRESH_FB: Dict[str, float] = dict(
    href=0.80,
    year_make_model=0.50,
    listed_price=0.60,
    odometer=0.60,
    location=0.40,
)

# --- Predicates (Validation Rules) ---
def _ratio(mask: pd.Series) -> float:
    return float(mask.mean()) if len(mask) else 0.0

def _yr_ok(s: pd.Series) -> pd.Series:
    """Flag rows whose text contains a year within [YEAR_MIN, YEAR_MAX].

    The first match of YEAR_RE in each value is converted to a number; rows
    without a match become NaN and are therefore reported as False.
    """
    as_text = s.astype(str)
    first_year = as_text.str.extract(YEAR_RE, expand=False)
    year_number = pd.to_numeric(first_year, errors='coerce')
    return year_number.between(YEAR_MIN, YEAR_MAX)

def _pred_year_make_model(s: pd.Series) -> pd.Series:
    """True where the value holds a plausible year and at least one letter."""
    as_text = s.astype(str)
    return _yr_ok(as_text) & as_text.str.contains(r'[A-Za-z]', na=False)


def _pred_listed_price(s: pd.Series) -> pd.Series:
    """True where the value starts with something shaped like a price."""
    return s.astype(str).str.match(PRICE_RE, na=False)


def _pred_transmission(s: pd.Series) -> pd.Series:
    """True where the stripped, lower-cased value is a known transmission."""
    return s.astype(str).str.strip().str.lower().isin(TX)


def _pred_odometer(s: pd.Series) -> pd.Series:
    """True where the value looks like an odometer reading (case-insensitive)."""
    return s.astype(str).str.match(ODOM_RE, flags=re.I, na=False)


def _pred_seller_type(s: pd.Series) -> pd.Series:
    """True where the stripped, lower-cased value is a known seller type."""
    return s.astype(str).str.strip().str.lower().isin(SELLER)


# Row-level validators keyed by canonical Carsales field. The key order is
# the order in which identify_columns tries to claim columns.
PRED = {
    'year_make_model': _pred_year_make_model,
    'listed_price': _pred_listed_price,
    'transmission': _pred_transmission,
    'odometer': _pred_odometer,
    'seller_type': _pred_seller_type,
}

# Row-level validators keyed by canonical Facebook Marketplace field.
# The three non-URL checks are the same rules used for Carsales scrapes,
# so they are taken from PRED rather than being spelled out a second time.
PRED_FB = {
    'href': lambda s: s.astype(str).str.contains(URL_RE, case=False, na=False),
    **{field: PRED[field] for field in ('year_make_model', 'listed_price', 'odometer')},
}

# --- Core Identification Functions ---
def identify_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Map raw scrape columns to the canonical Carsales/General field names.

    The first column is always taken as ``href``. Every other column that is
    not URL-like is scored against the predicates in ``PRED`` (in ``PRED``
    key order); for each field the best-scoring column is claimed when its
    score meets the threshold in ``THRESH``, and a claimed column is not
    considered for later fields.

    ``trim`` has no predicate of its own: it is taken to be the column
    immediately after the ``year_make_model`` column, but only when that
    neighbour is still unclaimed and not URL-like.

    Args:
        df: Raw scrape output with arbitrary column names.

    Returns:
        A dict with the keys ``href``, every key of ``PRED`` and ``trim``.
        Each value is the raw column name or ``None`` when no column
        qualified. For a frame without columns every key of ``ORDER`` maps
        to ``None``.
    """
    cols = list(df.columns)
    if not cols:
        return {k: None for k in ORDER}

    href_col = cols[0]

    # Columns that mostly hold URLs are never candidates for the other fields.
    url_ratio = {
        c: _ratio(df[c].astype(str).str.contains(URL_RE, case=False, na=False))
        for c in cols
    }
    urlish = {c for c, r in url_ratio.items() if r >= 0.50}
    blocked = {href_col} | urlish

    remaining = [c for c in cols if c not in blocked]
    picks = {t: None for t in PRED}

    for t in PRED:
        if not remaining:
            break
        scores = {c: _ratio(PRED[t](df[c])) for c in remaining}
        # On ties max() keeps the first candidate, i.e. the left-most column.
        best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
        if best_score >= THRESH[t]:
            picks[t] = best_col
            remaining.remove(best_col)

    trim_col = None
    ymm = picks.get('year_make_model')
    if ymm is not None:
        neighbour_pos = cols.index(ymm) + 1
        # `remaining` now holds only columns that are neither blocked nor
        # claimed, so this check keeps trim from aliasing e.g. the price column.
        if neighbour_pos < len(cols) and cols[neighbour_pos] in remaining:
            trim_col = cols[neighbour_pos]

    return {'href': href_col, **picks, 'trim': trim_col}

def identify_fb_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Map raw scrape columns to the canonical Facebook Marketplace fields.

    ``href`` is resolved first, then ``year_make_model``, ``listed_price``
    and ``odometer``. For each field the best-scoring unclaimed column under
    the matching ``PRED_FB`` predicate is claimed when its score meets the
    ``THRESH_FB`` threshold. ``location`` has no predicate: it is the column
    named ``'c'`` when that is still unclaimed, otherwise the single
    remaining column if exactly one is left.

    Candidates are scored in DataFrame column order, so equal scores are
    resolved the same way on every run. For ``href`` the column named
    ``'x1i10hfl href'`` is scored first and therefore wins a tie.

    Args:
        df: Raw scrape output with arbitrary column names.

    Returns:
        A dict keyed by ``FB_ORDER``; each value is the raw column name or
        ``None`` when no column qualified.
    """
    cols = list(df.columns)
    if not cols:
        return {k: None for k in FB_ORDER}

    picks = {t: None for t in FB_ORDER}
    # A list (not a set) keeps column order, which makes max() tie-breaking
    # deterministic instead of dependent on string hash order.
    remaining = list(cols)

    # Stable sort: the known href column name first, everything else in order.
    known_href = 'x1i10hfl href'
    href_candidates = sorted(remaining, key=lambda c: c != known_href)
    href_scores = {c: _ratio(PRED_FB['href'](df[c])) for c in href_candidates}
    best_href_col, best_href_score = max(href_scores.items(), key=lambda kv: kv[1])
    if best_href_score >= THRESH_FB['href']:
        picks['href'] = best_href_col
        remaining.remove(best_href_col)

    for t in ('year_make_model', 'listed_price', 'odometer'):
        if not remaining:
            break
        scores = {c: _ratio(PRED_FB[t](df[c])) for c in remaining}
        best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
        if best_score >= THRESH_FB[t]:
            picks[t] = best_col
            remaining.remove(best_col)

    # Location is assigned by name or by elimination, never by content.
    if 'c' in remaining:
        picks['location'] = 'c'
        remaining.remove('c')
    elif len(remaining) == 1:
        picks['location'] = remaining.pop()

    return picks

In [152]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, List

def clean_cs(df: pd.DataFrame, save_raw: bool = False) -> pd.DataFrame:
    """Standardise a raw Carsales/General scrape into the canonical listing layout.

    Steps:
    1.  **Raw data preservation (optional):** when ``save_raw`` is True the
        input frame is written to ``data/raws/raw_carsales_data_<timestamp>.csv``
        (the timestamp is bumped by one second until the name is unused) and
        the file's base name is stored in a ``raw`` column of the output.
    2.  **Column identification:** raw columns are mapped to canonical names
        with ``identify_columns``.
    3.  **Extraction and standardisation:**
        *   ``href``: everything from the first ``?`` onwards is removed.
        *   ``year_make_model`` is split on whitespace into ``year``, ``make``
            and ``model`` (the model keeps any remaining words); ``year`` is
            converted to a nullable integer.
        *   ``listed_price``: a trailing cents part (``.00``) is dropped, then
            all non-digits are removed and the value becomes a nullable integer.
        *   ``odometer``: non-digits are removed; a ``K`` suffix on the number
            ("180K km") is read as thousands; the result is expressed in
            thousands of km (180,000 km -> 180).
    4.  **Output:** columns are returned in the order
        ``[raw,] href, year, make, model, listed_price, trim, odometer,
        seller_type``; columns that could not be derived are omitted.
        ``transmission`` is identified but is not part of the returned layout.

    Args:
        df: Raw scrape output.
        save_raw: Persist ``df`` to disk and tag the output with the file name.

    Returns:
        The cleaned listings frame.

    Raises:
        NameError: If ``identify_columns`` has not been defined in this session.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_carsales_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_columns' not in globals():
        raise NameError("Function 'identify_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    mapping = identify_columns(df)
    out = pd.DataFrame()

    if mapping['href'] is not None:
        out['href'] = df[mapping['href']]
    for col in ['year_make_model', 'trim', "listed_price", 'transmission', 'odometer', 'seller_type']:
        src = mapping.get(col)
        if src is not None:
            out[col] = df[src]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        # At most three parts: year, make, and the rest of the text as model.
        split_cols = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_cols.columns:
            out['year'] = pd.to_numeric(
                split_cols[0].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_cols[1] if 1 in split_cols.columns else pd.NA
        out['model'] = split_cols[2] if 2 in split_cols.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        out['href'] = out['href'].astype(str).str.split('?').str[0]

    if 'listed_price' in out.columns:
        # PRICE_RE accepts an optional ".dd" cents part. Remove it before the
        # digit strip, otherwise "$12,500.00" would be read as 1250000.
        price_text = out['listed_price'].astype(str).str.replace(
            r'\.\d{1,2}(\D|$)', r'\1', regex=True
        )
        out['listed_price'] = pd.to_numeric(
            price_text.str.replace(r'[^\d]', '', regex=True),
            errors='coerce'
        ).astype('Int64')

    if 'odometer' in out.columns:
        odo_text = out['odometer'].astype(str)
        odo_digits = pd.to_numeric(
            odo_text.str.replace(r'[^\d]', '', regex=True),
            errors='coerce'
        )
        # ODOM_RE accepts "180K km". A 'k' right after the number that is not
        # the start of "km" means thousands; without this the floor division
        # below would turn 180 into 0.
        in_thousands = odo_text.str.contains(
            r'\d\s*k(?:[^m]|$)', case=False, regex=True, na=False
        )
        odo_km = odo_digits.where(~in_thousands, odo_digits * 1000)
        out['odometer'] = odo_km.astype('Int64') // 1000

    final_cols = ['href', 'year', 'make', 'model', "listed_price", 'trim', 'odometer', 'seller_type']
    if save_raw:
        final_cols.insert(0, 'raw')
    return out[[c for c in final_cols if c in out.columns]]

In [153]:

def clean_fb(df: pd.DataFrame, save_raw: bool = False, price_cap: Optional[int] = 3000) -> pd.DataFrame:
    """Standardise a raw Facebook Marketplace scrape into the canonical listing layout.

    Steps:
    1.  **Raw data preservation (optional):** when ``save_raw`` is True the
        input frame is written to ``data/raws/raw_facebook_data_<timestamp>.csv``
        (the timestamp is bumped by one second until the name is unused) and
        the file's base name is stored in a ``raw`` column of the output.
    2.  **Column identification:** raw columns are mapped to canonical names
        with ``identify_fb_columns``.
    3.  **Extraction and standardisation:**
        *   ``href``: everything from the first ``?`` onwards is removed.
        *   ``year_make_model`` is split on whitespace into ``year``, ``make``
            and ``model``; ``year`` is converted to a nullable integer, with
            unparseable values becoming missing.
        *   Rows whose ``listed_price`` text is "free" are removed.
        *   ``listed_price``: a trailing cents part (``.00``) is dropped, then
            all non-digits are removed and the value becomes a nullable integer.
        *   ``odometer``: non-digits are removed and the value becomes a
            nullable integer (no unit conversion is applied here).
    4.  **Quality filtering:** rows with a missing ``listed_price``,
        ``odometer`` or ``year`` are dropped, as are rows with the placeholder
        price 12345 and rows priced at or above ``price_cap``.
    5.  **Output:** columns are returned in the order
        ``[raw,] href, year, make, model, listed_price, odometer, location``;
        columns that could not be derived are omitted.

    Args:
        df: Raw scrape output.
        save_raw: Persist ``df`` to disk and tag the output with the file name.
        price_cap: Exclusive upper bound on ``listed_price``; ``None`` disables
            the cap. The default of 3000 reproduces the previously hard-coded
            filter.

    Returns:
        The cleaned, filtered listings frame.

    Raises:
        NameError: If ``identify_fb_columns`` has not been defined in this session.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_facebook_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_fb_columns' not in globals():
        raise NameError("Function 'identify_fb_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    mapping = identify_fb_columns(df)
    out = pd.DataFrame()
    for canonical_col, src_col in mapping.items():
        if src_col is not None and src_col in df.columns:
            out[canonical_col] = df[src_col]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        # At most three parts: year, make, and the rest of the text as model.
        split_df = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_df.columns:
            # to_numeric(errors='coerce') maps an empty digit string to a
            # missing value; casting pd.NA through float would raise instead.
            out['year'] = pd.to_numeric(
                split_df[0].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_df[1] if 1 in split_df.columns else pd.NA
        out['model'] = split_df[2] if 2 in split_df.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        out['href'] = out['href'].astype(str).str.split('?').str[0]

    if 'listed_price' in out.columns:
        not_free = out['listed_price'].astype(str).str.lower() != "free"
        # Copy so the assignments below write to an independent frame rather
        # than to a filtered view of the previous one.
        out = out[not_free].copy()
        # PRICE_RE accepts an optional ".dd" cents part. Remove it before the
        # digit strip, otherwise "$2,500.00" would be read as 250000.
        price_text = out['listed_price'].astype(str).str.replace(
            r'\.\d{1,2}(\D|$)', r'\1', regex=True
        )
        out['listed_price'] = pd.to_numeric(
            price_text.str.replace(r'[^\d]', '', regex=True),
            errors='coerce'
        ).astype('Int64')

    if 'odometer' in out.columns:
        out['odometer'] = pd.to_numeric(
            out['odometer'].astype(str).str.replace(r'[^\d]', '', regex=True),
            errors='coerce'
        ).astype('Int64')

    required = [c for c in ('listed_price', 'odometer', 'year') if c in out.columns]
    if required:
        out = out.dropna(subset=required)

    if 'listed_price' in out.columns:
        # 12345 is treated as a placeholder price.
        out = out[out["listed_price"] != 12345]
        # NOTE(review): the original code hard-coded `listed_price < 3000`
        # with no documentation, which also makes the 12345 check redundant.
        # Confirm whether this cap is intended; pass price_cap=None to disable.
        if price_cap is not None:
            out = out[out["listed_price"] < price_cap]

    final_columns = ['href', 'year', 'make', 'model', "listed_price", 'odometer', 'location']
    if save_raw:
        final_columns.insert(0, 'raw')
    return out[[c for c in final_columns if c in out.columns]]

In [154]:
import pandas as pd
from typing import Dict, Optional, List

def enrich_df(df: pd.DataFrame, gen_lookup: pd.DataFrame) -> pd.DataFrame:
    """Final clean after clean_cs or clean_fb, including generation assignment.

    The frame is modified in place and also returned. The following columns
    are added or overwritten:

    * ``date_scraped``: today's date (midnight timestamp).
    * ``make`` / ``model`` (when present): lower-cased with every character
      outside ``a-z0-9`` removed. Missing values stay missing rather than
      becoming the text ``'nan'``.
    * ``year`` (when present): nullable integer; unparseable values become
      missing.
    * ``gen``: nullable integer taken from the ``gen_lookup`` row whose
      ``make`` and ``model`` equal the listing's and whose
      ``[year_start, year_end]`` range (inclusive) contains the listing's
      year. When several lookup rows match, the last one wins. If ``make``,
      ``model`` or ``year`` is absent from ``df``, ``gen`` is all-missing.
    * ``model_gen``: ``"<model>_<gen>"`` where ``gen`` is known, else ``None``.

    Args:
        df (pd.DataFrame): The DataFrame to enrich (mutated in place).
        gen_lookup (pd.DataFrame): Lookup table with the columns ``make``,
            ``model``, ``year_start``, ``year_end`` and ``gen``. Values are
            compared for exact equality with the normalised listing values.

    Returns:
        pd.DataFrame: The same ``df`` object, enriched.
    """

    # --- 1. Add date_scraped ---
    df["date_scraped"] = pd.Timestamp.now().normalize()

    # --- 2. Normalise make & model ---
    for col in ("make", "model"):
        if col in df.columns:
            present = df[col].notna()
            normalised = (
                df[col]
                .astype(str)
                .str.lower()
                .str.replace(r"[^a-z0-9]+", "", regex=True)
            )
            # astype(str) stringifies missing values; mask them back out.
            df[col] = normalised.where(present)

    # --- 3. Ensure year is numeric ---
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

    # --- 4. Assign generation manually (no merge, no year_start/year_end contamination) ---
    df["gen"] = pd.NA

    if {"make", "model", "year"}.issubset(df.columns):
        for _, row in gen_lookup.iterrows():
            # A missing year yields <NA> from between(); treat it as "no match".
            in_range = (
                df["year"]
                .between(row["year_start"], row["year_end"], inclusive="both")
                .fillna(False)
                .astype(bool)
            )
            mask = (df["make"] == row["make"]) & (df["model"] == row["model"]) & in_range
            df.loc[mask, "gen"] = row["gen"]

    df["gen"] = df["gen"].astype("Int64")

    # --- 5. Create model_gen ---
    # Built as a plain list so it also works for an empty frame, where a
    # row-wise DataFrame.apply would not return a Series.
    if "model" in df.columns:
        df["model_gen"] = [
            f"{model}_{gen}" if pd.notna(gen) else None
            for model, gen in zip(df["model"].tolist(), df["gen"].tolist())
        ]
    else:
        df["model_gen"] = [None] * len(df)

    return df

# Working

In [113]:
# Per-client search criteria: price and odometer ceilings plus the
# model/generation keys the client is interested in.
clients = [
    dict(
        client="anita_c",
        max_listing_price=13500,
        max_odometer=160000,
        model_gens=["3_2", "civic_", "jazz_3", "i30_"],
    ),
    dict(
        client="magesh_t",
        max_listing_price=13500,
        max_odometer=160000,
        model_gens=["3_2", "civic_", "i30_"],
    ),
]

In [150]:
# How many listings are new, updated, or duplicate?
#
# Requires `listings`, which is read from listings.csv in a separate cell of
# this notebook; that cell must have been executed before this one.

total_new, total_updated, total_dups = 0, 0, 0

cs_files = glob.glob('/content/carsales*.csv')
fb_files = glob.glob('/content/facebook*.csv')
# glob returns files in arbitrary order; sort so the report is reproducible.
for file_path in sorted(cs_files) + sorted(fb_files):
    file_name = os.path.basename(file_path)

    # a. Load the CSV file into a pandas DataFrame
    df_raw = pd.read_csv(file_path)

    # b. Clean the loaded DataFrame using the clean_cs() function
    # NOTE(review): Facebook files are also cleaned with clean_cs() here
    # although clean_fb() exists - confirm this is intended.
    df_cleaned = clean_cs(df_raw)

    # c. Left-merge with the stored listings. The indicator column records
    #    whether an href was found, so a stored listing with a missing price
    #    is not mistaken for a new one.
    df_comparison = pd.merge(
        df_cleaned,
        listings,
        on='href',
        how='left',
        suffixes=('_new', '_existing'),
        indicator='_href_match'
    )

    # d. New listings: href not present in `listings`
    is_new = df_comparison['_href_match'] == 'left_only'
    n_new = int(is_new.sum())

    # e. Matched listings: href present in `listings`
    matched_listings_df = df_comparison[~is_new]
    price_new = matched_listings_df['listed_price_new']
    price_existing = matched_listings_df['listed_price_existing']

    # f./g. Split matches by whether the price changed. A comparison that
    #       yields <NA> (unparsed new price) counts towards neither bucket.
    price_changed = (price_new != price_existing).fillna(False).astype(bool)
    price_same = (price_new == price_existing).fillna(False).astype(bool)
    n_updated = int(price_changed.sum())
    n_duplicate = int(price_same.sum())

    # h. Calculate total listings for the current file
    n_total_listings = len(df_cleaned)

    # i. Print the comparison result for the current file
    print(f"{file_name}    \t New {n_new}   \t Update {n_updated} \t Dups {n_duplicate} \t Tot {n_total_listings}")

    # j. Add counts to total counters
    total_new += n_new
    total_updated += n_updated
    total_dups += n_duplicate

print(f"Total New: {total_new} \t Total Updated: {total_updated} \t Total Duplicate: {total_dups}")


carsales (1).csv    	 New 2   	 Update 12 	 Dups 0 	 Tot 14
carsales (3).csv    	 New 3   	 Update 5 	 Dups 0 	 Tot 8
carsales (4).csv    	 New 7   	 Update 4 	 Dups 0 	 Tot 11
carsales.csv    	 New 0   	 Update 8 	 Dups 0 	 Tot 8
carsales (2).csv    	 New 18   	 Update 3 	 Dups 0 	 Tot 21
carsales (6).csv    	 New 4   	 Update 11 	 Dups 0 	 Tot 15
carsales (5).csv    	 New 6   	 Update 16 	 Dups 0 	 Tot 22
facebook (1).csv    	 New 21   	 Update 0 	 Dups 4 	 Tot 25
facebook.csv    	 New 6   	 Update 2 	 Dups 25 	 Tot 33
facebook (3).csv    	 New 18   	 Update 0 	 Dups 11 	 Tot 29
facebook (2).csv    	 New 43   	 Update 1 	 Dups 9 	 Tot 53
Total New: 128 	 Total Updated: 62 	 Total Duplicate: 49


In [140]:
# Reference tables read from the shared drive: the generation lookup used by
# enrich_df and the stored listings that new scrapes are compared against.
gen_lookup_path = "/content/drive/Shareddrives/market_analysis_v2/gen_lookup.csv"
listings_path = "/content/drive/Shareddrives/market_analysis_v2/listings.csv"

gen_lookup = pd.read_csv(gen_lookup_path)
listings = pd.read_csv(listings_path)