In [30]:
import polars as pl
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer, root_mean_squared_error
import pandas as pd
import numpy as np
from datetime import datetime

# Steps:
- Import data as strings
- Order the data by date
- EDA
- Drop columns
- Clean the data
- Remove features
- Add features such as seasonality
- Train test split
- OHE
- Train model
- Prediction based on features: region, vehicle type, etc (csv file)
- Dashboard to filter based on various attributes

# Assumptions
- `lat`, `long` is indicative of `region`
- `make`, `model` is indicative of `year`

In [2]:
pd.options.display.max_columns = None

# Data importation

In [3]:
N_ROWS = None
craigslist_vehicles = pl.scan_csv("./data/craigslist_vehicles.csv", n_rows=N_ROWS, infer_schema_length=0)
craigslist_vehicles.head().collect()

Unnamed: 0_level_0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,image_url,description,county,state,lat,long,posting_date,removal_date
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""362773""","""7307679724""","""https://abilen…","""abilene""","""https://abilen…","""4500""","""2002.0""","""bmw""","""x5""",,,"""gas""","""184000.0""","""clean""","""automatic""",,,,,,"""https://images…","""$4,500 Cash 2…",,"""tx""","""32.401556""","""-99.884713""","""2021-04-16 00:…","""2021-05-02 00:…"
"""362712""","""7311833696""","""https://abilen…","""abilene""","""https://abilen…","""4500""","""2002.0""","""bmw""","""x5""",,,"""gas""","""184000.0""","""clean""","""automatic""",,,,,,"""https://images…","""$4,500 Cash 2…",,"""tx""","""32.401556""","""-99.884713""","""2021-04-24 00:…","""2021-04-28 00:…"
"""362722""","""7311441996""","""https://abilen…","""abilene""","""https://abilen…","""4900""","""2006.0""","""toyota""","""camry""","""excellent""","""4 cylinders""","""gas""","""184930.0""","""clean""","""automatic""",,"""fwd""",,"""sedan""","""silver""","""https://images…","""2006 TOYOTA CA…",,"""tx""","""32.453848""","""-99.7879""","""2021-04-23 00:…","""2021-05-25 00:…"
"""362771""","""7307680715""","""https://abilen…","""abilene""","""https://abilen…","""6500""","""2008.0""","""ford""","""expedition""",,,"""gas""","""206000.0""","""clean""","""automatic""",,,,,,"""https://images…","""$6500.00 2008 …",,"""tx""","""32.401556""","""-99.884713""","""2021-04-16 00:…","""2021-04-26 00:…"
"""362710""","""7311834578""","""https://abilen…","""abilene""","""https://abilen…","""6500""","""2008.0""","""ford""","""expedition""",,,"""gas""","""206000.0""","""clean""","""automatic""",,,,,,"""https://images…","""$6500.00 2008 …",,"""tx""","""32.401556""","""-99.884713""","""2021-04-24 00:…","""2021-05-12 00:…"


In [4]:
craigslist_vehicles.collect().sample(20).select(pl.col("removal_date")).to_series().to_list()

['2021-04-28 00:00:00+00:00',
 '2021-04-29 00:00:00+00:00',
 '2021-04-13 00:00:00+00:00',
 '2021-05-18 00:00:00+00:00',
 '2021-05-06 00:00:00+00:00',
 '2021-05-07 00:00:00+00:00',
 '2021-05-25 00:00:00+00:00',
 '2021-05-03 00:00:00+00:00',
 '2021-05-24 00:00:00+00:00',
 '2021-05-08 00:00:00+00:00',
 '2021-05-17 00:00:00+00:00',
 '2021-04-23 00:00:00+00:00',
 '2021-05-30 00:00:00+00:00',
 '2021-05-27 00:00:00+00:00',
 '2021-05-08 00:00:00+00:00',
 '2021-05-20 00:00:00+00:00',
 '2021-04-27 00:00:00+00:00',
 '2021-05-05 00:00:00+00:00',
 '2021-06-01 00:00:00+00:00',
 '2021-05-13 00:00:00+00:00']

# Data conversion

In [5]:
numeric_cols = ["price", "odometer"]
date_cols = ["posting_date", "removal_date"]

In [6]:
def convert_data(data: pl.LazyFrame, date_columns: list = date_cols, numeric_columns: list = numeric_cols) -> pl.LazyFrame:
    
    for d in date_columns:
        data = data.with_columns(pl.col(d).str.to_datetime(format="%Y-%m-%d %H:%M:%S%z"))
        
    for n in numeric_columns:
        data = data.with_columns(pl.col(n).cast(pl.Float32()))
    
    return data.sort(by="posting_date", descending=False)

craigslist_vehicles = convert_data(craigslist_vehicles)
craigslist_vehicles.head().collect()

Unnamed: 0_level_0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,VIN,drive,size,type,paint_color,image_url,description,county,state,lat,long,posting_date,removal_date
str,str,str,str,str,f32,str,str,str,str,str,str,f32,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]"
"""11""","""7218325704""","""https://elpaso…","""el paso""","""https://elpaso…",0.0,,,,,,,,,,,,,,,,,,"""tx""",,,,
"""22""","""7213839225""","""https://bellin…","""bellingham""","""https://bellin…",26850.0,,,,,,,,,,,,,,,,,,"""wa""",,,,
"""214184""","""7316697754""","""https://minnea…","""minneapolis / …","""https://minnea…",6950.0,,,,,,,,,,,,,,,,,,"""mn""",,,,
"""144572""","""7314963546""","""https://rockfo…","""rockford""","""https://rockfo…",5450.0,,,,,,,,,,,,,,,,,,"""il""",,,,
"""80767""","""7217189206""","""https://fortco…","""fort collins /…","""https://fortco…",21928.0,,,,,,,,,,,,,,,,,,"""co""",,,,


In [28]:
min_sale_date = craigslist_vehicles.select("removal_date").min().collect().to_series()[0]
min_sale_date

datetime.datetime(2021, 4, 4, 0, 0, tzinfo=zoneinfo.ZoneInfo(key='UTC'))

# EDA

In [7]:
(craigslist_vehicles
 .collect()
 .describe()
 .transpose(column_names="describe", include_header=True)
)

column,count,null_count,mean,std,min,25%,50%,75%,max
str,str,str,str,str,str,str,str,str,str
"""""","""426880""","""0""",,,"""0""",,,,"""99999"""
"""id""","""426880""","""0""",,,"""7207408119""",,,,"""7317101084"""
"""url""","""426880""","""0""",,,"""https://abilen…",,,,"""https://zanesv…"
"""region""","""426880""","""0""",,,"""SF bay area""",,,,"""zanesville / c…"
"""region_url""","""426880""","""0""",,,"""https://abilen…",,,,"""https://zanesv…"
"""price""","""426880.0""","""0.0""","""75199.03125""","""12182282.0""","""0.0""","""5900.0""","""13950.0""","""26485.0""","""3736928768.0"""
"""year""","""425675""","""1205""",,,"""1900.0""",,,,"""2022.0"""
"""manufacturer""","""409234""","""17646""",,,"""acura""",,,,"""volvo"""
"""model""","""421603""","""5277""",,,"""""t""""",,,,"""🔥GMC Sierra 15…"
"""condition""","""252776""","""174104""",,,"""excellent""",,,,"""salvage"""


# Drop columns

In [8]:
id_cols = ['', "id", "url", "region_url", "VIN", "image_url", "description", "lat", "long", "year"]
craigslist_vehicles = craigslist_vehicles.drop(id_cols)

# Remove features
## Features with too many nulls

In [9]:
def find_excess_nulls(data: pl.LazyFrame, thr: float = 0.2) -> list:
    df = (data
        .null_count()
        .collect()
        .transpose(include_header=True, column_names=["null_count"])
        .with_columns(pl.lit(value=len(craigslist_vehicles.collect())).alias("obs"))
        .with_columns((pl.col("null_count") / pl.col("obs")).alias("prop"))
        .with_columns((pl.col("prop") > thr).alias("is_excess_nulls"))
        .filter(pl.col("is_excess_nulls") == True)
    )
    
    print(df)
        
    excess_nulls = (df
        .select("column")
        .to_series().to_list()
    )
    
    return excess_nulls

excess_null_cols = find_excess_nulls(craigslist_vehicles)
excess_null_cols

shape: (7, 5)
┌─────────────┬────────────┬────────┬──────────┬─────────────────┐
│ column      ┆ null_count ┆ obs    ┆ prop     ┆ is_excess_nulls │
│ ---         ┆ ---        ┆ ---    ┆ ---      ┆ ---             │
│ str         ┆ u32        ┆ i32    ┆ f64      ┆ bool            │
╞═════════════╪════════════╪════════╪══════════╪═════════════════╡
│ condition   ┆ 174104     ┆ 426880 ┆ 0.407852 ┆ true            │
│ cylinders   ┆ 177678     ┆ 426880 ┆ 0.416225 ┆ true            │
│ drive       ┆ 130567     ┆ 426880 ┆ 0.305863 ┆ true            │
│ size        ┆ 306361     ┆ 426880 ┆ 0.717675 ┆ true            │
│ type        ┆ 92858      ┆ 426880 ┆ 0.217527 ┆ true            │
│ paint_color ┆ 130203     ┆ 426880 ┆ 0.305011 ┆ true            │
│ county      ┆ 426880     ┆ 426880 ┆ 1.0      ┆ true            │
└─────────────┴────────────┴────────┴──────────┴─────────────────┘


['condition', 'cylinders', 'drive', 'size', 'type', 'paint_color', 'county']

## Features with near zero variance

In [10]:
def find_nzv_categorical(data: pl.LazyFrame, thr: float = 0.8) -> list:
    cols = data.select(pl.col(pl.Utf8)).columns
    
    categorical_cols = [c 
        for c in cols 
        if c not in 
        numeric_cols + excess_null_cols + id_cols
        ]
    
    df = (data
    .select(categorical_cols)
    .melt(variable_name="column")
    .group_by(pl.all())
    .len()
    .rename({"len": "null_count"})
    .with_columns(pl.col("null_count").sum().over("column").alias("total"))
    .with_columns((pl.col("null_count") / pl.col("total")).alias("prop"))
    .with_columns((pl.col("prop") > thr).alias("is_nzv"))
    .sort(by="column")
    .filter(pl.col("is_nzv") == True)
    .collect()
    )
    
    print(df)
    
    is_nzv = (df
    .select("column")
    .to_series().to_list()
    )
    
    return is_nzv

is_nzv_categorical = find_nzv_categorical(craigslist_vehicles)
is_nzv_categorical

shape: (2, 6)
┌──────────────┬───────┬────────────┬────────┬──────────┬────────┐
│ column       ┆ value ┆ null_count ┆ total  ┆ prop     ┆ is_nzv │
│ ---          ┆ ---   ┆ ---        ┆ ---    ┆ ---      ┆ ---    │
│ str          ┆ str   ┆ u32        ┆ u32    ┆ f64      ┆ bool   │
╞══════════════╪═══════╪════════════╪════════╪══════════╪════════╡
│ fuel         ┆ gas   ┆ 356209     ┆ 426880 ┆ 0.834448 ┆ true   │
│ title_status ┆ clean ┆ 405117     ┆ 426880 ┆ 0.949018 ┆ true   │
└──────────────┴───────┴────────────┴────────┴──────────┴────────┘


['fuel', 'title_status']

In [11]:
def find_nzv_numeric(data: pl.LazyFrame, num_cols: list = numeric_cols, thr: float = 0.8) -> list:
    numeric_data = data.select(num_cols).with_columns(pl.all().cast(pl.Float32()))
    nzv = VarianceThreshold(thr * (1 - thr))
    nzv.fit_transform(numeric_data.collect())
    idx = nzv.get_support(indices=False)
    retained_feats = nzv.get_feature_names_out()[idx]
    return [f for f in numeric_data.columns if f not in retained_feats]

is_nzv_numeric = find_nzv_numeric(craigslist_vehicles)
is_nzv_numeric

[]

In [12]:
cols_to_drop = set(id_cols + excess_null_cols + is_nzv_categorical + is_nzv_numeric)
craigslist_vehicles = craigslist_vehicles.drop(cols_to_drop)
craigslist_vehicles.head().collect()

region,price,manufacturer,model,odometer,transmission,state,posting_date,removal_date
str,f32,str,str,f32,str,str,"datetime[μs, UTC]","datetime[μs, UTC]"
"""el paso""",0.0,,,,,"""tx""",,
"""bellingham""",26850.0,,,,,"""wa""",,
"""minneapolis / …",6950.0,,,,,"""mn""",,
"""rockford""",5450.0,,,,,"""il""",,
"""fort collins /…",21928.0,,,,,"""co""",,


# Create features
## Add bands

In [13]:
band_cols = ["price", "odometer"]

def create_bucketed_features(data: pl.LazyFrame, cols: list = band_cols, buckets: int = 10) -> pl.LazyFrame:
    
    labels = [str(l) for l in range(buckets)]
    
    for c in cols:
        data = data.with_columns(
            pl.col(c).qcut(quantiles=buckets, labels=labels).cast(pl.Int8()).alias(f"{c}_band"),
            pl.col(c).qcut(quantiles=buckets).alias(f"{c}_band_values")
            )
    
    b_cols = [c for c in data.columns if c.__contains__("_band")]
    v_cols = [c for c in data.columns if c.__contains__("_band_values")]
    
    bands = data.drop(cols + v_cols)
    band_values = data.select(b_cols)
    
    return bands, band_values

craigslist_vehicles_banded, band_values = create_bucketed_features(craigslist_vehicles)
craigslist_vehicles_banded.head().collect()

region,manufacturer,model,transmission,state,posting_date,removal_date,price_band,odometer_band
str,str,str,str,str,"datetime[μs, UTC]","datetime[μs, UTC]",i8,i8
"""atlanta""",,,,"""ga""",,,0,
"""bellingham""",,,,"""wa""",,,1,
"""bellingham""",,,,"""wa""",,,2,
"""bellingham""",,,,"""wa""",,,2,
"""bellingham""",,,,"""wa""",,,3,


In [14]:
group_by_cols = craigslist_vehicles_banded.columns + ["year_sold", "month_sold", "day_sold"]
group_by_cols = [c for c in group_by_cols if not c.endswith("date")]

def create_time_features(data: pl.LazyFrame, group_by: list = group_by_cols):
    res = (data
        .with_columns(
            pl.col("removal_date").dt.year().alias("cal_year"),
            pl.col("removal_date").dt.month().alias("month_sold"),
            pl.col("removal_date").dt.day().alias("day_sold")
        )
        .with_columns((pl.col("cal_year").max() - pl.col("cal_year")).alias("year_sold"))
        .group_by(pl.col(group_by))
        .len()
        .rename({"len": "count"})
    )
    
    return res

model_data = create_time_features(craigslist_vehicles_banded)
model_data.head().collect()

region,manufacturer,model,transmission,state,price_band,odometer_band,year_sold,month_sold,day_sold,count
str,str,str,str,str,i8,i8,i32,i8,i8,u32
"""bellingham""",,,,"""wa""",1,,,,,1
"""detroit metro""",,,,"""mi""",6,,,,,2
"""new york city""",,,,"""ny""",6,,,,,1
"""akron / canton…","""lexus""","""ls 460 sedan 4…","""automatic""","""oh""",3,0.0,0.0,4.0,21.0,1
"""akron / canton…","""buick""","""enclave premiu…","""other""","""oh""",9,5.0,0.0,5.0,5.0,1


## Add seasons

In [15]:
def map_month_to_season(month):
    seasons = {
        "spring": [3, 4, 5],
        "summer": [6, 7, 8],
        "autumn": [9, 10, 11],
        "winter": [12, 1, 2]
    }

    for season, months in seasons.items():
        if month in months:
            return season
        else:
            return "unknown"

def create_seaons(data: pl.LazyFrame) -> pl.LazyFrame:
    res = data.with_columns(
        pl.col("month_sold").map_elements(function=map_month_to_season, skip_nulls=False).alias("season")
    )
    return res

model_data = create_seaons(model_data)
model_data.head().collect()

region,manufacturer,model,transmission,state,price_band,odometer_band,year_sold,month_sold,day_sold,count,season
str,str,str,str,str,i8,i8,i32,i8,i8,u32,str
"""atlanta""",,,,"""ga""",0,,,,,1,"""unknown"""
"""bellingham""",,,,"""wa""",1,,,,,1,"""unknown"""
"""bellingham""",,,,"""wa""",2,,,,,2,"""unknown"""
"""buffalo""",,,,"""ny""",3,,,,,2,"""unknown"""
"""charlotte""",,,,"""nc""",2,,,,,1,"""unknown"""


# Train test split (sk.TimeSeriesSplit)

In [16]:
y_data = model_data.select("count").collect().to_series()
y = pd.Series(y_data)

X = model_data.drop("count").collect().to_pandas()
print(X.shape)

(422291, 11)


In [17]:
ts_cv = TimeSeriesSplit(gap=7)

splits = list(ts_cv.split(X, y))
train_idx, test_idx = splits[0]

X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# OHE

In [18]:
ohe = OneHotEncoder(drop="if_binary", max_categories=5, dtype=np.int8)
ohe.fit(X)

In [19]:
def ohe_dataframe(data: pd.DataFrame, encoder: OneHotEncoder = ohe) -> pd.DataFrame:
    X = encoder.transform(data)
    X_df = pd.DataFrame(X.toarray(), columns=ohe.get_feature_names_out())
    print("The infrequent categories are:", len(encoder.infrequent_categories_))
    return X_df

X_train_encoded = ohe_dataframe(X_train)
display(X_train_encoded)

The infrequent categories are: 11


Unnamed: 0,region_columbus,region_jacksonville,region_seattle-tacoma,region_spokane / coeur d'alene,region_infrequent_sklearn,manufacturer_chevrolet,manufacturer_ford,manufacturer_honda,manufacturer_toyota,manufacturer_infrequent_sklearn,model_1500,model_f-150,model_silverado 1500,model_None,model_infrequent_sklearn,transmission_automatic,transmission_manual,transmission_other,transmission_None,state_ca,state_fl,state_ny,state_tx,state_infrequent_sklearn,price_band_0,price_band_1,price_band_6,price_band_8,price_band_infrequent_sklearn,odometer_band_4.0,odometer_band_5.0,odometer_band_8.0,odometer_band_9.0,odometer_band_infrequent_sklearn,year_sold_nan,month_sold_4.0,month_sold_5.0,month_sold_6.0,month_sold_nan,day_sold_9.0,day_sold_10.0,day_sold_13.0,day_sold_14.0,day_sold_infrequent_sklearn,season_unknown
0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1
1,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1
2,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1
3,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1
4,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70374,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0
70375,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
70376,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0
70377,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0


# Train & evaluate model

In [20]:
def train_model(X_train, X_test, y_train, y_test, cv = ts_cv):
    model = HistGradientBoostingRegressor().fit(X_train, y_train)
    
    X_test_encoded = ohe_dataframe(data=X_test)
    y_pred = model.predict(X_test_encoded)
    rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
    
    print("TESTING ERROR:")
    print("rmse:", rmse)
    print("\n")
    
    X_encoded = ohe_dataframe(X)
    scorer = make_scorer(score_func=root_mean_squared_error)
    cv_scores = cross_val_score(estimator=model, X=X_encoded, y=y, scoring=scorer, cv=cv)
    
    print("TESTING ERRORS:")
    print("cv_scores:", cv_scores)
    print("rmse:", cv_scores.mean())
    print("std:", cv_scores.std())

    return model

model = train_model(X_train=X_train_encoded, X_test=X_test, y_train=y_train, y_test=y_test)
model

The infrequent categories are: 11
TESTING ERROR:
rmse: 0.11038793885128614


The infrequent categories are: 11
TESTING ERRORS:
cv_scores: [0.11038888 0.10167717 0.11691352 0.11428559 0.09937762]
rmse: 0.1085285576806709
std: 0.006893373082560599


## Qualitative evaluation

In [21]:
predicted_sales = model.predict(ohe_dataframe(X_test))
print("obs:", len(y_test))
print("total sales (predicted):", sum(predicted_sales))
print("total sales (actual):", sum(y_test))
print("difference:", sum(predicted_sales) - sum(y_test))

The infrequent categories are: 11
obs: 70381
total sales (predicted): 71181.33499376582
total sales (actual): 71126
difference: 55.3349937658204


# Make predictions

In [205]:
predictions = pd.concat(objs=[
    X_test.reset_index(drop=True), 
    pd.Series(y_test, name="actual_sales").reset_index(drop=True),
    pd.Series(predicted_sales, name="predicted_sales"),
    ], axis=1)

predictions

Unnamed: 0,region,manufacturer,model,transmission,state,price_band,odometer_band,year_sold,month_sold,day_sold,season,actual_sales,predicted_sales
0,modesto,bmw,320i,automatic,ca,2,2.0,0.0,5.0,4.0,spring,1,1.011544
1,modesto,ford,expedition,automatic,ca,0,6.0,0.0,5.0,16.0,spring,1,1.014442
2,mohave county,ram,,automatic,az,1,9.0,0.0,5.0,3.0,spring,1,1.010559
3,mohave county,toyota,tundra,automatic,az,9,2.0,0.0,5.0,13.0,spring,2,1.012491
4,monroe,ford,f-150,automatic,mi,1,8.0,0.0,4.0,29.0,spring,1,1.014427
...,...,...,...,...,...,...,...,...,...,...,...,...,...
70376,"st louis, MO",audi,a3 e-tron,automatic,il,4,3.0,0.0,5.0,16.0,spring,1,1.011544
70377,"st louis, MO",chevrolet,malibu,automatic,il,0,7.0,0.0,5.0,29.0,spring,1,1.008632
70378,"st louis, MO",ford,escape,automatic,il,0,8.0,0.0,5.0,24.0,spring,2,1.009437
70379,"st louis, MO",chevrolet,corvette coupe,manual,il,8,2.0,0.0,5.0,11.0,spring,1,1.010442


# Export data

In [210]:
def extract_band_values(data: pl.LazyFrame, cols: list = band_cols):
    res = {}
    for c in cols:
        r = (data
            .unique(subset=f"{c}_band")
            .select(pl.col([f"{c}_band", f"{c}_band_values"]))
            .filter(pl.col(f"{c}_band").is_not_null())
            .collect()
            .sort(by=f"{c}_band")
        )
        
        res[c]=r
        
    return res

band_values_dict = extract_band_values(band_values)
band_values_dict

{'price': shape: (10, 2)
 ┌────────────┬───────────────────┐
 │ price_band ┆ price_band_values │
 │ ---        ┆ ---               │
 │ i8         ┆ cat               │
 ╞════════════╪═══════════════════╡
 │ 0          ┆ (500, 4500]       │
 │ 1          ┆ (13950, 17990]    │
 │ 2          ┆ (9995, 13950]     │
 │ 3          ┆ (23300, 29777]    │
 │ 4          ┆ (17990, 23300]    │
 │ 5          ┆ (4500, 6995]      │
 │ 6          ┆ (6995, 9995]      │
 │ 7          ┆ (-inf, 500]       │
 │ 8          ┆ (29777, 37590]    │
 │ 9          ┆ (37590, inf]      │
 └────────────┴───────────────────┘,
 'odometer': shape: (10, 2)
 ┌───────────────┬──────────────────────────────┐
 │ odometer_band ┆ odometer_band_values         │
 │ ---           ┆ ---                          │
 │ i8            ┆ cat                          │
 ╞═══════════════╪══════════════════════════════╡
 │ 0             ┆ (85548, 102979.4]            │
 │ 1             ┆ (66859.6, 85548]             │
 │ 2             ┆ (

In [211]:
with pd.ExcelWriter(path="./data/predictions.xlsx", mode="w") as writer:
    predictions.to_excel(excel_writer=writer, index=False, sheet_name="predictions")
    for k,v in band_values_dict.items():
        v.to_pandas().to_excel(excel_writer=writer, index=False, sheet_name=k)

# Charts & Graphs

* You can be able to explore trends, insights, etc of the model over different 
time spans using [this interactive dashboard](https://lookerstudio.google.com/reporting/2803f46f-1fdf-48d0-8bf7-5c6d6a665bd1/page/xEeoD) that has been published.

* The predictions can be found on [this Google Sheets](https://docs.google.com/spreadsheets/d/1gfdVHUMXRjXx1QRTUdIMxKd-FGWmEJnt9DMn1xrgat8/edit#gid=1316071412).


![alt text](./dashboard_ss.png "Screenshot of Craiglist Vehicle Sales Dashboard")