In [22]:
import sys
import glob
import os
import re
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.ticker import FuncFormatter, MaxNLocator
import numpy as np
from datetime import datetime
from typing import Dict, Optional, List
# Colab-only: mount Google Drive so the /content/drive/... paths used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

# Make the project's scripts folder on the shared drive importable.
sys.path.append('/content/drive/Shareddrives/market_analysis_v2/scripts')
# These module imports are disabled; the corresponding helpers are defined inline in later cells.
# from clean_cs import *
# from clean_fb import *
# from constants_and_helpers import *
# NOTE(review): a star import hides which names `enrich` contributes — consider importing them explicitly.
from enrich import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# constants_and_helpers.py

import re
import pandas as pd
from typing import Dict, Optional, List

# ---------- Constants for CS (Carsales/General Scrapes) ----------
# Inclusive bounds applied to an extracted year token.
YEAR_MIN = 1980
YEAR_MAX = 2035

# Canonical Carsales column names.
ORDER: List[str] = [
    'href',
    'year_make_model',
    'trim',
    'listed_price',
    'transmission',
    'odometer',
    'seller_type',
]

# Regular expressions consumed by the column predicates.
YEAR_RE = r'\b(19[89]\d|20[0-3]\d)\b'         # stand-alone 4-digit token, 1980-2039
PRICE_RE = r'^\s*\$\s*[\d,]+(?:\.\d{2})?\b'   # "$", digits/commas, optional two-digit cents
ODOM_RE = r'^\s*\d{1,3}(?:,\d{3})+\s*km\s*$'  # comma-grouped number followed by "km"
URL_RE = r'^(?:https?://|www\.)'              # value starts like a URL

# Accepted (lower-cased, stripped) values for the categorical columns.
TX = {'automatic', 'manual'}
SELLER = {'private', 'dealer used'}

# Minimum fraction of rows that must satisfy a predicate before a column is
# accepted for that canonical name.
THRESH: Dict[str, float] = dict(
    year_make_model=0.50,
    listed_price=0.60,
    transmission=0.80,
    odometer=0.60,
    seller_type=0.70,
)

# ---------- Constants for FB (Facebook Marketplace Scrapes) ----------
# Canonical Facebook Marketplace column names.
FB_ORDER: List[str] = [
    'href',
    'year_make_model',
    'listed_price',
    'odometer',
    'location',
]

# Acceptance thresholds for the FB-specific column detection.
# 'href' and 'year_make_model' are critical for data structuring, while
# 'listed_price' and 'odometer' are numeric and generally have clear patterns.
# 'location' is the hardest to recognise by content: PRED_FB has no predicate
# for it, and identify_fb_columns assigns it by column name ('c') or by
# elimination, so the value below is not consulted by the visible code.
THRESH_FB: Dict[str, float] = dict(
    href=0.80,
    year_make_model=0.50,
    listed_price=0.60,
    odometer=0.60,
    location=0.40,
)

# ---------- Predicates ----------
def _ratio(mask: pd.Series) -> float:
    return float(mask.mean()) if len(mask) else 0.0

def _yr_ok(s: pd.Series) -> pd.Series:
    """Flag entries whose first YEAR_RE match lies within [YEAR_MIN, YEAR_MAX].

    Values are compared as text; entries with no year-like token yield False.
    """
    as_text = s.astype(str)
    year_tokens = as_text.str.extract(YEAR_RE, expand=False)
    year_values = pd.to_numeric(year_tokens, errors='coerce')
    return year_values.between(YEAR_MIN, YEAR_MAX)

# Per-column content tests for Carsales/General scrapes. Each entry maps a
# canonical name to a callable returning a boolean mask over the column.
# Insertion order matters: identify_columns claims columns in this order.
PRED = dict(
    # A year in the accepted range plus at least one letter.
    year_make_model=lambda col: _yr_ok(col.astype(str)) & col.astype(str).str.contains(r'[A-Za-z]', na=False),
    # Text that starts like a dollar amount.
    listed_price=lambda col: col.astype(str).str.match(PRICE_RE, na=False),
    # Exactly one of the known transmission labels (case/whitespace-insensitive).
    transmission=lambda col: col.astype(str).str.strip().str.lower().isin(TX),
    # Comma-grouped distance followed by "km" (case-insensitive).
    odometer=lambda col: col.astype(str).str.match(ODOM_RE, flags=re.I, na=False),
    # Exactly one of the known seller labels (case/whitespace-insensitive).
    seller_type=lambda col: col.astype(str).str.strip().str.lower().isin(SELLER),
)

# Per-column content tests for Facebook Marketplace scrapes. Each entry maps a
# canonical name to a callable returning a boolean mask over the column.
# There is deliberately no entry for 'location'.
PRED_FB = dict(
    # Value starts like a URL (case-insensitive).
    href=lambda col: col.astype(str).str.contains(URL_RE, case=False, na=False),
    # A year in the accepted range plus at least one letter.
    year_make_model=lambda col: _yr_ok(col.astype(str)) & col.astype(str).str.contains(r'[A-Za-z]', na=False),
    # Text that starts like a dollar amount.
    listed_price=lambda col: col.astype(str).str.match(PRICE_RE, na=False),
    # Comma-grouped distance followed by "km" (case-insensitive).
    odometer=lambda col: col.astype(str).str.match(ODOM_RE, flags=re.I, na=False),
)

# ---------- Core Identification Functions ----------
def identify_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Identify and map each canonical Carsales/General column.

    The first column is taken as ``href`` by position. Every other column that
    is not URL-like is scored against each predicate in ``PRED`` (in ``PRED``'s
    order); the best-scoring column is claimed for a canonical name when its
    score reaches the matching ``THRESH`` value. A claimed column is not
    considered again. ``trim`` has no predicate: it is the column directly to
    the right of ``year_make_model``, provided that column is still unclaimed.

    Args:
        df: Raw scrape whose column names are arbitrary.

    Returns:
        A dict keyed by canonical name whose values are column names of ``df``
        or ``None`` when nothing qualified. For a frame without columns every
        name in ``ORDER`` maps to ``None``.
    """
    cols = list(df.columns)
    if not cols:
        return {k: None for k in ORDER}

    # The listing link is identified purely by position.
    href_col = cols[0]

    # Columns that are mostly URLs are excluded from all other detection.
    urlish = {
        c for c in cols
        if _ratio(df[c].astype(str).str.contains(URL_RE, case=False, na=False)) >= 0.50
    }
    blocked = {href_col} | urlish

    remaining = [c for c in cols if c not in blocked]
    picks: Dict[str, Optional[str]] = {t: None for t in PRED}

    for t, predicate in PRED.items():
        if not remaining:
            break
        scores = {c: _ratio(predicate(df[c])) for c in remaining}
        # max() keeps the first column on ties, so the result follows column order.
        best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
        if best_score >= THRESH[t]:
            picks[t] = best_col
            remaining.remove(best_col)

    trim_col = None
    ymm = picks.get('year_make_model')
    if ymm is not None:
        i = cols.index(ymm)
        # Only accept the right-hand neighbour if it is still unclaimed and not
        # URL-like; otherwise 'trim' would silently duplicate another canonical
        # column (for example the price when the scrape has no trim column).
        if i + 1 < len(cols) and cols[i + 1] in remaining:
            trim_col = cols[i + 1]

    return {'href': href_col, **picks, 'trim': trim_col}

def identify_fb_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Identify and map each canonical Facebook Marketplace column.

    ``href``, ``year_make_model``, ``listed_price`` and ``odometer`` are claimed
    in that order: every unclaimed column is scored with the matching
    ``PRED_FB`` predicate and the best one is accepted when its score reaches
    the ``THRESH_FB`` value. Ties are resolved by column order so the mapping
    is reproducible between runs. For ``href`` only, the column named
    ``'x1i10hfl href'`` wins a tie for the best score.

    ``location`` has no predicate: it is the column named ``'c'`` if that is
    still unclaimed, otherwise the single leftover column if exactly one remains.

    Args:
        df: Raw scrape whose column names are arbitrary.

    Returns:
        A dict with one key per name in ``FB_ORDER`` whose values are column
        names of ``df`` or ``None`` when nothing qualified.
    """
    cols = list(df.columns)
    if not cols:
        return {k: None for k in FB_ORDER}

    picks: Dict[str, Optional[str]] = {t: None for t in FB_ORDER}
    # A list (not a set) keeps column order, which makes tie-breaking deterministic.
    remaining = list(cols)
    known_link_col = 'x1i10hfl href'

    for t in ('href', 'year_make_model', 'listed_price', 'odometer'):
        if not remaining:
            break
        scores = {c: _ratio(PRED_FB[t](df[c])) for c in remaining}
        # max() over the ordered list returns the left-most column on ties.
        best_col = max(remaining, key=scores.__getitem__)
        # Several columns can be entirely URLs; when the known link column
        # scores as well as the best candidate, prefer it by name.
        if t == 'href' and known_link_col in scores and scores[known_link_col] >= scores[best_col]:
            best_col = known_link_col
        if scores[best_col] >= THRESH_FB[t]:
            picks[t] = best_col
            remaining.remove(best_col)

    # Assign 'location' by name, or by elimination when one column is left.
    if 'c' in remaining:  # The common FB Marketplace location column name
        picks['location'] = 'c'
    elif len(remaining) == 1:
        picks['location'] = remaining[0]

    return picks

In [28]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, List

# Import necessary functions from constants_and_helpers.py
# Assuming constants_and_helpers.py is loaded and its functions are available
# from constants_and_helpers import identify_columns, identify_fb_columns

def clean_cs(df: pd.DataFrame, save_raw: bool = False) -> pd.DataFrame:
    """Standardise a Carsales/General scrape into the canonical listing columns.

    Columns are located with ``identify_columns``, copied under their canonical
    names and then normalised:

    * ``year_make_model`` is split on whitespace into ``year`` (nullable
      ``Int64``), ``make`` and ``model`` (the remainder of the text).
    * ``href`` is cut at the first ``?``.
    * ``listed_price`` and ``odometer`` become nullable ``Int64``. A decimal
      part is discarded rather than merged into the number, and values without
      any digits become ``<NA>``.
    * ``odometer`` is then floor-divided by 1000.

    Args:
        df: Raw scrape with arbitrary column names.
        save_raw: When True, ``df`` is first written unchanged to
            ``data/raws/raw_carsales_data_<YYYYmmdd_HHMMSS>.csv`` (the timestamp
            is advanced one second at a time until the name is unused) and the
            file's base name is stored in a ``raw`` column of the result.

    Returns:
        A DataFrame holding whichever of ``raw``, ``href``, ``year``, ``make``,
        ``model``, ``listed_price``, ``trim``, ``odometer`` and ``seller_type``
        are available, in that order.

    Raises:
        NameError: If ``identify_columns`` is not defined in this module's globals.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_carsales_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            # Name already taken: step the timestamp forward until it is unique.
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_columns' not in globals():
        raise NameError("Function 'identify_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    mapping = identify_columns(df)
    out = pd.DataFrame()

    if mapping['href'] is not None:
        out['href'] = df[mapping['href']]
    for col in ('year_make_model', 'trim', 'listed_price', 'transmission', 'odometer', 'seller_type'):
        src = mapping.get(col)
        if src is not None:
            out[col] = df[src]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        # At most three parts: year, make, and everything else as the model.
        split_cols = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_cols.columns:
            year_digits = split_cols[0].astype(str).str.replace(r'\D', '', regex=True)
            # to_numeric(errors='coerce') maps '' to NaN; casting an object
            # Series that contains pd.NA with astype(float) raises TypeError.
            out['year'] = pd.to_numeric(year_digits, errors='coerce').astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_cols[1] if 1 in split_cols.columns else pd.NA
        out['model'] = split_cols[2] if 2 in split_cols.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        out['href'] = out['href'].astype(str).str.split('?').str[0]

    # Clean numeric columns: listed_price, odometer.
    for col in ('listed_price', 'odometer'):
        if col in out.columns:
            digits = (
                out[col].astype(str)
                # Drop a decimal part first so "$1,234.50" is 1234, not 123450,
                # and a float-typed "85000.0" is 85000, not 850000.
                .str.replace(r'\.\d+', '', regex=True)
                .str.replace(r'\D', '', regex=True)  # Keep only digits
            )
            out[col] = pd.to_numeric(digits, errors='coerce').astype('Int64')

    if 'odometer' in out.columns:
        # Floor-divide by 1000 (presumably km -> thousands of km; confirm against consumers).
        out['odometer'] = out['odometer'] // 1000

    # NOTE(review): 'transmission' is detected and copied above but is not listed
    # here, so it is dropped from the result — confirm this is intended.
    final_cols = ['href', 'year', 'make', 'model', 'listed_price', 'trim', 'odometer', 'seller_type']
    if save_raw:
        final_cols.insert(0, 'raw')
    return out[[c for c in final_cols if c in out.columns]]


def clean_fb(
    df: pd.DataFrame,
    save_raw: bool = False,
    min_price: int = 3000,
    placeholder_prices: tuple = (12345,),
) -> pd.DataFrame:
    """Standardise a Facebook Marketplace scrape into the canonical listing columns.

    Columns are located with ``identify_fb_columns``, copied under their
    canonical names and then normalised:

    * Rows whose price text is ``free`` (any case, surrounding whitespace
      ignored) are removed.
    * ``year_make_model`` is split on whitespace into ``year`` (nullable
      ``Int64``), ``make`` and ``model`` (the remainder of the text).
    * ``href`` is cut at the first ``?``.
    * ``listed_price`` and ``odometer`` become nullable ``Int64``. A decimal
      part is discarded rather than merged into the number, and values without
      any digits become ``<NA>``.
    * Rows missing any of ``listed_price``, ``odometer`` or ``year`` (among the
      columns that exist) are dropped.
    * Rows priced below ``min_price`` or at one of ``placeholder_prices`` are
      dropped.

    Args:
        df: Raw scrape with arbitrary column names.
        save_raw: When True, ``df`` is first written unchanged to
            ``data/raws/raw_facebook_data_<YYYYmmdd_HHMMSS>.csv`` (the timestamp
            is advanced one second at a time until the name is unused) and the
            file's base name is stored in a ``raw`` column of the result.
        min_price: Lowest ``listed_price`` that is kept.
        placeholder_prices: Exact prices treated as dummy values and removed.

    Returns:
        A DataFrame holding whichever of ``raw``, ``href``, ``year``, ``make``,
        ``model``, ``listed_price``, ``odometer`` and ``location`` are
        available, in that order.

    Raises:
        NameError: If ``identify_fb_columns`` is not defined in this module's globals.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_facebook_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            # Name already taken: step the timestamp forward until it is unique.
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_fb_columns' not in globals():
        raise NameError("Function 'identify_fb_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    mapping = identify_fb_columns(df)
    out = pd.DataFrame()
    for canonical_col, src_col in mapping.items():
        if src_col is not None and src_col in df.columns:
            out[canonical_col] = df[src_col]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'listed_price' in out.columns:
        is_free = out['listed_price'].astype(str).str.strip().str.lower() == 'free'
        # .copy() gives an independent frame so the column assignments below do
        # not write into a slice of the previous one (SettingWithCopyWarning).
        out = out.loc[~is_free].copy()

    if 'year_make_model' in out.columns:
        # At most three parts: year, make, and everything else as the model.
        split_df = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_df.columns:
            year_digits = split_df[0].astype(str).str.replace(r'\D', '', regex=True)
            # to_numeric(errors='coerce') maps '' to NaN; casting an object
            # Series that contains pd.NA with astype(float) raises TypeError.
            out['year'] = pd.to_numeric(year_digits, errors='coerce').astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_df[1] if 1 in split_df.columns else pd.NA
        out['model'] = split_df[2] if 2 in split_df.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        out['href'] = out['href'].astype(str).str.split('?').str[0]

    for col in ('listed_price', 'odometer'):
        if col in out.columns:
            digits = (
                out[col].astype(str)
                # Drop a decimal part first so "$1,234.50" is 1234, not 123450,
                # and a float-typed "85000.0" is 85000, not 850000.
                .str.replace(r'\.\d+', '', regex=True)
                .str.replace(r'\D', '', regex=True)  # Keep only digits
            )
            out[col] = pd.to_numeric(digits, errors='coerce').astype('Int64')

    cols_to_check_for_na = [c for c in ('listed_price', 'odometer', 'year') if c in out.columns]
    if cols_to_check_for_na:
        out = out.dropna(subset=cols_to_check_for_na)

    if 'listed_price' in out.columns:
        price = out['listed_price']
        # Keep prices at or above the floor. Keeping only prices *below* 3000
        # would make the placeholder check (12345) impossible to trigger, so the
        # floor is treated as a minimum.
        out = out[(price >= min_price) & ~price.isin(list(placeholder_prices))]

    final_columns = ['href', 'year', 'make', 'model', 'listed_price', 'odometer', 'location']
    if save_raw:
        final_columns.insert(0, 'raw')
    return out[[c for c in final_columns if c in out.columns]]

In [24]:
# Load the two project CSVs from the shared drive; both paths require Google Drive
# to be mounted at /content/drive.
gen_lookup = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/gen_lookup.csv")
listings = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/listings.csv")

In [29]:
# Run the Facebook cleaner on a local export and display the result. The relative
# path resolves against the kernel's working directory, not the shared drive.
clean_fb(pd.read_csv("facebook.csv"))

Unnamed: 0,href,year,make,model
0,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2014,Honda,civic
1,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2012,Honda,civic
2,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2012,Honda,civic
3,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2012,Honda,civic
4,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2012,Honda,civic
5,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2014,Honda,civic
6,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2012,Honda,civic
7,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2014,Honda,civic
8,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2013,Honda,civic
9,https://scontent.fsyd14-1.fna.fbcdn.net/v/t39....,2014,Honda,civic
