In [2]:
import sys
import glob
import os
import re
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from matplotlib.ticker import FuncFormatter, MaxNLocator
import numpy as np
from datetime import datetime
from typing import Dict, Optional, List
from google.colab import drive
drive.mount('/content/drive')

sys.path.append('/content/drive/Shareddrives/market_analysis_v2/scripts')
# from clean_cs import *
# from clean_fb import *
# from constants_and_helpers import *
from enrich import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Script Functions

In [3]:
import re
import pandas as pd
from typing import Dict, Optional, List

# --- Carsales/General Scrapes (CS) Constants ---
YEAR_MIN, YEAR_MAX = 1980, 2035
ORDER: List[str] = ['href', 'year_make_model', 'trim', "listed_price", 'transmission', 'odometer', 'seller_type']

YEAR_RE  = r'\b(19[89]\d|20[0-3]\d)\b'
PRICE_RE = r'^\s*(?:AU\$|\$)?\s*[\d,]+(?:\.\d{2})?\b'
ODOM_RE  = r'^\s*\d+(?:,?\d{3})*(?:K)?\s*km?\s*$'
URL_RE   = r'^(?:https?://|www\.)'
TX, SELLER = {'automatic', 'manual'}, {'private', 'dealer used'}

THRESH: Dict[str, float] = {
    'year_make_model': 0.50,
    "listed_price":           0.60,
    'transmission':    0.80,
    'odometer':        0.60,
    'seller_type':     0.70,
}

# --- Facebook Marketplace (FB) Constants ---
FB_ORDER: List[str] = ['href', 'year_make_model', 'listed_price', 'odometer', 'location']
THRESH_FB: Dict[str, float] = {
    'href':            0.80,
    'year_make_model': 0.50,
    'listed_price':    0.60,
    'odometer':        0.60,
    'location':        0.40,
}

# --- Predicates (Validation Rules) ---
def _ratio(mask: pd.Series) -> float:
    return float(mask.mean()) if len(mask) else 0.0

def _yr_ok(s: pd.Series) -> pd.Series:
    years = pd.to_numeric(s.astype(str).str.extract(YEAR_RE, expand=False), errors='coerce')
    return years.between(YEAR_MIN, YEAR_MAX)

PRED = {
    'year_make_model': lambda s: s.astype(str).pipe(_yr_ok) & s.astype(str).str.contains(r'[A-Za-z]', na=False),
    "listed_price":           lambda s: s.astype(str).str.match(PRICE_RE, na=False),
    'transmission':    lambda s: s.astype(str).str.strip().str.lower().isin(TX),
    'odometer':        lambda s: s.astype(str).str.match(ODOM_RE, flags=re.I, na=False),
    'seller_type':     lambda s: s.astype(str).str.strip().str.lower().isin(SELLER),
}

PRED_FB = {
    'href':            lambda s: s.astype(str).str.contains(URL_RE, case=False, na=False),
    'year_make_model': lambda s: s.astype(str).pipe(_yr_ok) & s.astype(str).str.contains(r'[A-Za-z]', na=False),
    'listed_price':    lambda s: s.astype(str).str.match(PRICE_RE, na=False),
    'odometer':        lambda s: s.astype(str).str.match(ODOM_RE, flags=re.I, na=False),
}

# --- Core Identification Functions ---
def identify_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Identifies and maps raw DataFrame columns to canonical Carsales/General columns."""
    cols = list(df.columns)
    if not cols:
        return {k: None for k in ORDER}

    href_col = cols[0]

    # Exclude URL-like columns from other detection logic
    url_ratio = {c: _ratio(df[c].astype(str).str.contains(URL_RE, case=False, na=False)) for c in cols}
    urlish = {c for c, r in url_ratio.items() if r >= 0.50}
    blocked = {href_col} | urlish

    remaining = [c for c in cols if c not in blocked]
    picks = {t: None for t in PRED}

    for t in PRED:
        if not remaining:
            break
        scores = {c: _ratio(PRED[t](df[c])) for c in remaining}
        best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
        if best_score >= THRESH[t]:
            picks[t] = best_col
            remaining.remove(best_col)

    trim_col = None
    ymm = picks.get('year_make_model')
    if ymm in cols:
        i = cols.index(ymm)
        if i + 1 < len(cols):
            trim_col = cols[i + 1]

    return {'href': href_col, **picks, 'trim': trim_col}

def identify_fb_columns(df: pd.DataFrame) -> Dict[str, Optional[str]]:
    """Identifies and maps raw DataFrame columns to canonical Facebook Marketplace columns."""
    cols = list(df.columns)
    if not cols:
        return {k: None for k in FB_ORDER}

    picks = {t: None for t in FB_ORDER}
    remaining = set(cols)

    # Prioritize 'href' column identification
    href_scores = {c: _ratio(PRED_FB['href'](df[c])) for c in remaining}
    best_href_col, best_href_score = None, 0.0
    if href_scores:
        best_href_col, best_href_score = max(href_scores.items(), key=lambda kv: kv[1])

    if best_href_score >= THRESH_FB['href']:
        picks['href'] = best_href_col
        remaining.remove(best_href_col)
    elif 'x1i10hfl href' in remaining and _ratio(PRED_FB['href'](df['x1i10hfl href'])) >= THRESH_FB['href']:
        # Fallback to specific column name if it exists and matches pattern well
        picks['href'] = 'x1i10hfl href'
        remaining.remove('x1i10hfl href')

    # Identify 'year_make_model', 'listed_price', 'odometer'
    for t in ['year_make_model', 'listed_price', 'odometer']:
        if not remaining:
            break
        scores = {c: _ratio(PRED_FB[t](df[c])) for c in remaining}
        if scores:
            best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
            if best_score >= THRESH_FB[t]:
                picks[t] = best_col
                remaining.remove(best_col)

    # Assign 'location', often found in column 'c' or as the last remaining column
    if picks['location'] is None:
        if 'c' in remaining:
            picks['location'] = 'c'
            remaining.remove('c')
        elif len(remaining) == 1:
            picks['location'] = remaining.pop()

    return picks

In [4]:
import pandas as pd
import os
from datetime import datetime, timedelta
from typing import Dict, Optional, List

def clean_cs(df: pd.DataFrame, save_raw: bool = False) -> pd.DataFrame:
    """
    Business Logic for clean_cs function:

    This function processes raw DataFrame outputs from Carsales/General web scrapes to standardize
    and clean vehicle listing data into a consistent format for analysis.

    Key steps and business rules:
    1.  **Raw Data Preservation (Optional):** If `save_raw` is True, the original DataFrame
        is saved to a timestamped CSV, and a 'raw' column (filename) is added to the output.
    2.  **Column Identification:** Dynamically maps raw DataFrame columns to canonical names
        ('href', 'year_make_model', 'listed_price', 'odometer', etc.) using `identify_columns`.
    3.  **Data Extraction & Standardization:**
        *   Cleans 'href' by removing query parameters.
        *   Splits 'year_make_model' into 'year', 'make', and 'model'; converts 'year' to integer.
        *   Converts 'listed_price' and 'odometer' to integer, removing non-numeric characters.
        *   Transforms 'odometer' values from 'km' to '000 km' (e.g., 180,000 km -> 180).
    4.  **Output Structure:** Returns a DataFrame with a standardized set of columns for consistency.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        raw_filename = ''
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_carsales_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_columns' not in globals():
        raise NameError("Function 'identify_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    mapping = identify_columns(df)
    out = pd.DataFrame()

    if mapping['href'] is not None:
        out['href'] = df[mapping['href']]
    for col in ['year_make_model', 'trim', "listed_price", 'transmission', 'odometer', 'seller_type']:
        src = mapping.get(col)
        if src is not None:
            out[col] = df[src]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        split_cols = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_cols.columns:
            out['year'] = pd.to_numeric(
                split_cols[0].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_cols[1] if 1 in split_cols.columns else pd.NA
        out['model'] = split_cols[2] if 2 in split_cols.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        out['href'] = out['href'].astype(str).str.split('?').str[0]

    for col in ["listed_price", 'odometer']:
        if col in out.columns:
            out[col] = pd.to_numeric(
                out[col].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')

    if 'odometer' in out.columns:
        out['odometer'] = out['odometer'] // 1000

    final_cols = ['href', 'year', 'make', 'model', "listed_price", 'trim', 'odometer', 'seller_type']
    if save_raw:
        final_cols.insert(0, 'raw')
    return out[[c for c in final_cols if c in out.columns]]

In [5]:

def clean_fb(df: pd.DataFrame, save_raw: bool = False) -> pd.DataFrame:
    """
    Business Logic for clean_fb function:

    This function processes raw DataFrame outputs from Facebook Marketplace scrapes to standardize
    and clean vehicle listing data into a consistent format for analysis.

    Key steps and business rules:
    1.  **Raw Data Preservation (Optional):** If `save_raw` is True, the original DataFrame
        is saved to a timestamped CSV, and a 'raw' column (filename) is added to the output.
    2.  **Column Identification:** Dynamically maps raw DataFrame columns to canonical names
        ('href', 'year_make_model', 'listed_price', 'odometer', 'location') using `identify_fb_columns`.
    3.  **Data Extraction & Standardization:**
        *   Cleans 'href' by removing query parameters.
        *   Splits 'year_make_model' into 'year', 'make', and 'model'; converts 'year' to integer.
        *   Converts 'listed_price' and 'odometer' to integer, removing non-numeric characters.
        *   Filters out listings with 'listed_price' explicitly marked as "free".
    4.  **Data Quality Filtering:** Drops rows with missing (`pd.NA`) values in critical columns
        ('listed_price', 'odometer', 'year') to ensure data integrity. Also removes listings
        with a placeholder 'listed_price' of 12345.
    5.  **Output Structure:** Returns a DataFrame with a standardized set of columns for consistency.
    """
    raw_col_value = None
    if save_raw:
        raw_data_dir = 'data/raws'
        os.makedirs(raw_data_dir, exist_ok=True)
        timestamp = datetime.now()
        raw_filename = ''
        while True:
            raw_filename = os.path.join(raw_data_dir, f"raw_facebook_data_{timestamp.strftime('%Y%m%d_%H%M%S')}.csv")
            if not os.path.exists(raw_filename):
                break
            timestamp += timedelta(seconds=1)
        df.to_csv(raw_filename, index=False)
        raw_col_value = os.path.basename(raw_filename)

    if 'identify_fb_columns' not in globals():
        raise NameError("Function 'identify_fb_columns' not found. Please ensure 'constants_and_helpers.py' or cell 'gECV1vdedUm0' has been executed.")

    mapping = identify_fb_columns(df)
    out = pd.DataFrame()
    for canonical_col, src_col in mapping.items():
        if src_col is not None and src_col in df.columns:
            out[canonical_col] = df[src_col]

    if save_raw and raw_col_value:
        out['raw'] = raw_col_value

    if 'year_make_model' in out.columns:
        split_df = out['year_make_model'].astype(str).str.split(expand=True, n=2)
        if 0 in split_df.columns:
            out['year'] = split_df[0].astype(str).str.replace(r'[^\d]', '', regex=True).replace('', pd.NA).astype(float).astype('Int64')
        else:
            out['year'] = pd.NA
        out['make'] = split_df[1] if 1 in split_df.columns else pd.NA
        out['model'] = split_df[2] if 2 in split_df.columns else pd.NA
    else:
        out[['year', 'make', 'model']] = pd.NA

    if 'href' in out.columns:
        out['href'] = out['href'].astype(str).str.split('?').str[0]

    for col in ["listed_price", 'odometer']:
        if col in out.columns:
            if col == 'listed_price':
                out = out[out[col].astype(str).str.lower() != "free"]
            out[col] = pd.to_numeric(
                out[col].astype(str).str.replace(r'[^\d]', '', regex=True),
                errors='coerce'
            ).astype('Int64')

    cols_to_check_for_na = []
    if 'listed_price' in out.columns: cols_to_check_for_na.append('listed_price')
    if 'odometer' in out.columns: cols_to_check_for_na.append('odometer')
    if 'year' in out.columns: cols_to_check_for_na.append('year')

    if cols_to_check_for_na:
        out = out.dropna(subset=cols_to_check_for_na)

    if 'listed_price' in out.columns:
        out = out[out["listed_price"] != 12345]
        out = out[out["listed_price"] < 3000]

    final_columns = ['href', 'year', 'make', 'model', "listed_price", 'odometer', 'location']
    if save_raw:
        final_columns.insert(0, 'raw')
    return out[[c for c in final_columns if c in out.columns]]

In [6]:
import pandas as pd
from typing import Dict, Optional, List

def enrich_df(df: pd.DataFrame, gen_lookup: pd.DataFrame) -> pd.DataFrame:
    """Final clean after clean_cs or clean_fb, including generation assignment.

    Args:
        df (pd.DataFrame): The DataFrame to enrich.
        gen_lookup (pd.DataFrame): A lookup table for car generations.

    Returns:
        pd.DataFrame: The enriched DataFrame.
    """

    # --- 1. Add date_scraped ---
    df["date_scraped"] = pd.Timestamp.now().normalize()

    # --- 2. Normalise make & model ---
    for col in ["make", "model"]:
        if col in df.columns:
            df[col] = (
                df[col]
                .astype(str)
                .str.lower()
                .str.replace(r"[^a-z0-9]+", "", regex=True)
            )

    # --- 3. Ensure year is numeric ---
    if "year" in df.columns:
        df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")

    # --- 4. Assign generation manually (no merge, no year_start/year_end contamination) ---
    df["gen"] = pd.NA

    for idx, row in gen_lookup.iterrows():
        mask = (
            (df["make"] == row["make"]) &
            (df["model"] == row["model"]) &
            (df["year"].between(row["year_start"], row["year_end"], inclusive="both"))
        )
        df.loc[mask, "gen"] = row["gen"]

    df["gen"] = df["gen"].astype("Int64")

    # --- 5. Create model_gen ---
    df["model_gen"] = df.apply(
        lambda r: f"{r['model']}_{r['gen']}" if pd.notna(r["gen"]) else None,
        axis=1
    )

    return df

# Linear Regression

In [None]:
# Building Regression Model
# 0) Work on a real copy (kills SettingWithCopyWarning)
df1 = df1.copy()

# 1) Coerce to numeric (allow bad cells to become NaN)
df1['year']     = pd.to_numeric(df1['year'], errors='coerce')
df1['odometer'] = pd.to_numeric(df1['odometer'], errors='coerce')
df1["listed_price"]    = pd.to_numeric(df1["listed_price"], errors='coerce')
df1['age'] = 2026 - df1['year']

# 2) Build X, y as float and drop rows with NaNs
X_num = df1[['age','odometer']].astype(float)
y_num = df1["listed_price"].astype(float)
keep  = X_num.notna().all(axis=1) & y_num.notna()

X = sm.add_constant(X_num.loc[keep])
y = y_num.loc[keep]

# Optional sanity checks
assert np.isfinite(X.to_numpy()).all() and np.isfinite(y.to_numpy()).all()
assert X[['age','odometer']].std().gt(0).all()

# 3) Fit and predict
model = sm.OLS(y, X).fit()
print(model.summary())

df1.loc[keep, 'predicted_price'] = model.predict(X)
df1.loc[keep, 'value_diff'] = df1.loc[keep, 'predicted_price'] - df1.loc[keep, "listed_price"]

print(f"Used {keep.sum()} rows; dropped {len(df1) - keep.sum()} rows.")

# Working

In [7]:
clients=[
    {
        "client":"anita_c",
        "max_listing_price":13500,
        "max_odometer":160000,
        "model_gens":[
            "3_2",
            "civic_",
            "jazz_3",
            "i30_"
        ]
    },
    {
        "client":"magesh_t",
        "max_listing_price":13500,
        "max_odometer":160000,
        "model_gens":[
            "3_2",
            "civic_",
            "i30_"
        ]
    }
]

In [49]:
# How many listings are new, updated, or existing?

total_new, total_updated, total_existing = 0,0,0

cs_files = glob.glob('/content/carsales*.csv')
fb_files = glob.glob('/content/facebook*.csv')
for file_path in cs_files+fb_files:
    file_name = os.path.basename(file_path)

    # a. Load the CSV file into a pandas DataFrame
    df_raw = pd.read_csv(file_path)

    # b. Clean the loaded DataFrame using the clean_cs() function
    df_cleaned = clean_cs(df_raw)

    # c. Perform a left merge of df_cleaned with the listings DataFrame
    df_comparison = pd.merge(
        df_cleaned,
        listings,
        on='href',
        how='left',
        suffixes=('_new', '_existing')
    )

    # d. Identify new listings
    new_listings_df = df_comparison[df_comparison['listed_price_existing'].isnull()]
    n_new = len(new_listings_df)

    # e. Identify matched listings
    matched_listings_df = df_comparison[df_comparison['listed_price_existing'].notnull()]

    # f. From matched_listings, identify updated listings
    updated_listings_df = matched_listings_df[
        matched_listings_df['listed_price_new'] != matched_listings_df['listed_price_existing']
    ]
    n_updated = len(updated_listings_df)

    # g. From matched_listings, identify existing listings
    existing_listings_df = matched_listings_df[
        matched_listings_df['listed_price_new'] == matched_listings_df['listed_price_existing']
    ]
    n_existing = len(existing_listings_df)

    # h. Calculate total listings for the current file
    n_total_listings = len(df_cleaned)

    # i. Print the comparison result for the current file
    print(f"{file_name}    \t New {n_new}   \t Update {n_updated} \t Existing {n_existing} \t Tot {n_total_listings}")

    # j. Add counts to total counters
    total_new += n_new
    total_updated += n_updated
    total_existing += n_existing

print(f"Total New: {total_new} \t Total Updated: {total_updated} \t Total existing: {total_existing}")


carsales (3).csv    	 New 3   	 Update 5 	 Existing 0 	 Tot 8
carsales (4).csv    	 New 7   	 Update 4 	 Existing 0 	 Tot 11
carsales (1).csv    	 New 2   	 Update 12 	 Existing 0 	 Tot 14
carsales (2).csv    	 New 18   	 Update 3 	 Existing 0 	 Tot 21
carsales.csv    	 New 0   	 Update 8 	 Existing 0 	 Tot 8
carsales (6).csv    	 New 4   	 Update 11 	 Existing 0 	 Tot 15
carsales (5).csv    	 New 6   	 Update 16 	 Existing 0 	 Tot 22
facebook (1).csv    	 New 21   	 Update 0 	 Existing 4 	 Tot 25
facebook (3).csv    	 New 18   	 Update 0 	 Existing 11 	 Tot 29
facebook.csv    	 New 6   	 Update 2 	 Existing 25 	 Tot 33
facebook (2).csv    	 New 43   	 Update 1 	 Existing 9 	 Tot 53
Total New: 128 	 Total Updated: 62 	 Total existing: 49


In [27]:
gen_lookup = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/gen_lookup.csv")
listings = pd.read_csv("/content/drive/Shareddrives/market_analysis_v2/listings.csv")

In [29]:
def integrate_listings(listings_df: pd.DataFrame, new_file_paths: List[str], gen_lookup_df: pd.DataFrame) -> pd.DataFrame:
    """
    Integrates new car listings from a list of CSV files into an existing listings DataFrame.

    Args:
        listings_df (pd.DataFrame): The existing DataFrame of car listings.
        new_file_paths (List[str]): A list of file paths for new car listing CSVs.
        gen_lookup_df (pd.DataFrame): The lookup table for car generations.

    Returns:
        pd.DataFrame: A new DataFrame (`listings_1`) with integrated, cleaned, and enriched listings,
                      with existing listings handled by keeping the most recent entry.
    """
    processed_dfs = []

    for file_path in new_file_paths:
        df_raw = pd.read_csv(file_path)
        df_cleaned = None

        if 'carsales' in os.path.basename(file_path):
            df_cleaned = clean_cs(df_raw, save_raw=False)
        elif 'facebook' in os.path.basename(file_path):
            df_cleaned = clean_fb(df_raw, save_raw=False)

        if df_cleaned is not None:
            df_enriched = enrich_df(df_cleaned, gen_lookup_df)
            processed_dfs.append(df_enriched)

    if processed_dfs:
        new_listings_df = pd.concat(processed_dfs, ignore_index=True)

        # Define all possible columns that might exist in either DataFrame
        # Get columns from existing listings and new listings, handling potential differences
        all_cols = list(set(listings_df.columns) | set(new_listings_df.columns))

        # Reindex both DataFrames to ensure they have the same columns
        listings_aligned = listings_df.reindex(columns=all_cols, fill_value=pd.NA)
        new_listings_aligned = new_listings_df.reindex(columns=all_cols, fill_value=pd.NA)

        # Ensure 'date_scraped' is in datetime format for proper sorting
        listings_aligned['date_scraped'] = pd.to_datetime(listings_aligned['date_scraped'], errors='coerce')
        new_listings_aligned['date_scraped'] = pd.to_datetime(new_listings_aligned['date_scraped'], errors='coerce')

        # Explicitly cast dtypes of new_listings_aligned to match listings_aligned for common columns
        # This helps prevent FutureWarning and ensures consistent types across the concatenated DataFrame
        for col in all_cols:
            if col in listings_aligned.columns and col in new_listings_aligned.columns:
                if listings_aligned[col].dtype != new_listings_aligned[col].dtype:
                    try:
                        if pd.api.types.is_numeric_dtype(listings_aligned[col]):
                            if str(listings_aligned[col].dtype) == 'Int64':
                                new_listings_aligned[col] = new_listings_aligned[col].astype('Int64')
                            else:
                                new_listings_aligned[col] = pd.to_numeric(new_listings_aligned[col], errors='coerce').astype(listings_aligned[col].dtype)
                        else:
                            new_listings_aligned[col] = new_listings_aligned[col].astype(listings_aligned[col].dtype)
                    except (TypeError, ValueError):
                        pass # Keep original dtype if casting causes error

        # Concatenate the aligned DataFrames
        listings_1 = pd.concat([listings_aligned, new_listings_aligned], ignore_index=True)
    else:
        listings_1 = listings_df.copy()

    # Sort by href and date_scraped, then drop existings keeping the most recent
    listings_1 = listings_1.sort_values(by=['href', 'date_scraped'], ascending=[True, False])
    listings_1 = listings_1.drop_duplicates(subset=['href'], keep='first')

    print(f"Final listings_1 DataFrame has {len(listings_1)} unique listings after merging and de-duplication.")
    return listings_1


In [43]:
# Get all new CSV files
cs_files = glob.glob('/content/carsales*.csv')
fb_files = glob.glob('/content/facebook*.csv')
all_new_files = cs_files + fb_files

# Call the function to integrate the listings
listings1 = integrate_listings(listings, all_new_files, gen_lookup)

print("\nFirst 5 rows of the newly created listings_1_from_function:")
display(listings1.head())

Final listings_1 DataFrame has 1046 unique listings after merging and de-duplication.

First 5 rows of the newly created listings_1_from_function:


Unnamed: 0,href,odometer,location,make,gen,listed_price,model,model_gen,seller_type,trim,year,date_scraped
950,https://www.carsales.com.au/cars/details/2004-...,183,,honda,1.0,4999,jazz,jazz_1,Dealer used,VTi Auto F,2004,2025-12-04
959,https://www.carsales.com.au/cars/details/2004-...,173,,honda,1.0,6995,jazz,jazz_1,Dealer used,VTi-S Auto F MY05,2004,2025-12-04
945,https://www.carsales.com.au/cars/details/2005-...,270,,honda,1.0,5900,jazz,jazz_1,Private,GLi Auto F MY05,2005,2025-12-04
967,https://www.carsales.com.au/cars/details/2005-...,121,,honda,1.0,7000,jazz,jazz_1,Private,VTi Auto F MY05,2005,2025-12-04
369,https://www.carsales.com.au/cars/details/2006-...,135,,honda,1.0,8990,accordeuro,accordeuro_1,Dealer used,Luxury Auto F MY06,2006,2025-12-04


In [44]:
# listings["date_scraped"]=datetime(2025,12,4)
# listings.to_csv("/content/drive/Shareddrives/market_analysis_v2/listings.csv",index=False)

In [45]:
listings1[listings1["date_scraped"]==datetime(2025,12,5)]

Unnamed: 0,href,odometer,location,make,gen,listed_price,model,model_gen,seller_type,trim,year,date_scraped
1062,https://www.carsales.com.au/cars/details/2012-...,77,,honda,9.0,13,civic,civic_9,,VTi Auto F,2012,2025-12-05
1027,https://www.carsales.com.au/cars/details/2012-...,145,,honda,9.0,14,civic,civic_9,Private,VTi Auto F,2012,2025-12-05
1063,https://www.carsales.com.au/cars/details/2012-...,203,,honda,9.0,18,civic,civic_9,,VTi-L Auto F,2012,2025-12-05
1035,https://www.carsales.com.au/cars/details/2012-...,140,,honda,9.0,15,civic,civic_9,Private,VTi-S Auto F,2012,2025-12-05
1030,https://www.carsales.com.au/cars/details/2012-...,107,,honda,9.0,15,civic,civic_9,Private,VTi-S Auto F,2012,2025-12-05
...,...,...,...,...,...,...,...,...,...,...,...,...
1044,https://www.carsales.com.au/cars/details/2018-...,71,,honda,3.0,17,jazz,jazz_3,Private,VTi Auto F MY19,2018,2025-12-05
1046,https://www.carsales.com.au/cars/details/2019-...,38,,honda,3.0,20,jazz,jazz_3,Private,VTi Auto F MY20,2019,2025-12-05
1104,https://www.facebook.com/marketplace/item/2720...,125,"Sydney, NSW",hyundai,2.0,1234,i30,i30_2,,,2016,2025-12-05
1102,https://www.facebook.com/marketplace/item/7700...,120,"Sydney, NSW",honda,3.0,120,jazz,jazz_3,,,2014,2025-12-05
