In [1]:
import polars as pl

train.csv

* row_id - An ID code for the row.

* cfips - A unique identifier for each county using the Federal Information Processing System. The first two digits correspond to the state FIPS code, while the following 3 represent the county.

* county_name - The written name of the county.

* state_name - The name of the state.

* first_day_of_month - The date of the first day of the month.

* microbusiness_density - Microbusinesses per 100 people over the age of 18 in the given county. This is the target variable. The population figures used to calculate the density are on a two-year lag due to the pace of update provided by the U.S. Census Bureau, which provides the underlying population data annually. 2021 density figures are calculated using 2019 population figures, etc.

* active - The raw count of microbusinesses in the county. Not provided for the test set.


# Steps to Doing Prediction

## clean Data

In [2]:
train = pl.read_csv('train.csv')
train = train.with_columns(pl.col("first_day_of_month").str.strptime(pl.Date, fmt="%Y-%m-%d"))
train = train.drop(['active','county','state','row_id']).sort(['first_day_of_month','cfips'])

In [3]:
from datetime import datetime
validation_df = train.filter((pl.col("first_day_of_month") >= datetime(2022, 9, 1)))
train_df = train.filter((pl.col("first_day_of_month") < datetime(2022, 9, 1)))

In [4]:
def add_nxt_month_col(df):
    shift_down_value = -df.select('cfips').unique().shape[0]

    df = df.with_columns(
        pl.col('microbusiness_density').shift(shift_down_value).alias('next_month microbusiness_density')
        )

    return df

In [5]:
train_df = add_nxt_month_col(train_df)
validation_df = add_nxt_month_col(validation_df)

In [6]:
train_df = train_df.drop_nulls()

## Add Features

### Lag

In [7]:
def add_lag_month_col(df,period = 1):
    shift_down_value = df.select('cfips').unique().shape[0] * period

    df = df.with_columns(
        pl.col('microbusiness_density').shift(shift_down_value).alias(f'last_{period}month microbusiness_density')
        )

    return df

In [8]:
train_df = add_lag_month_col(train_df)
train_df = add_lag_month_col(train_df,2)

In [9]:
validation_df = add_lag_month_col(validation_df)
validation_df = add_lag_month_col(validation_df,2)

### Rolling Statistics-INCOMPLETE
split df into smaller df by cfips and calc rolling stats
then concat all df into one df

* diff
* mean
* max
* min
* std

### Timeseries stats-INCOMPLETE
Capture anomaly tied to a date

* Seasonality
* Trend

## TEST

In [10]:
#Competition measures by SMAPE
import numpy as np

def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f))*100)

### Impute Data



In [11]:
from sklearn.impute import SimpleImputer

features = ['microbusiness_density','last_1month microbusiness_density','last_2month microbusiness_density']

In [12]:
imp_mean = SimpleImputer()
X = imp_mean.fit_transform(train_df.select(features)).transpose()
y = np.array(train_df.select(['next_month microbusiness_density'])).transpose()

### Build Model

In [13]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 100, random_state = 0, n_jobs=-1)
model.fit(X,y)

  model.fit(X,y)


In [14]:
model.score(X, y)

0.9864288470159518

In [29]:
smape(y.transpose()[0], model.predict(X))

0.9483988266208876

### Eval Model

In [15]:
eval_df = validation_df.filter(~pl.all(pl.col('next_month microbusiness_density').is_null()))

imp_mean = SimpleImputer()
X_eval = imp_mean.fit_transform(eval_df.select(features)).transpose()
y_eval = np.array(eval_df.select(['next_month microbusiness_density'])).transpose()

In [16]:
model.score(X_eval, y_eval)

0.9095170934283543

In [30]:
smape(y.transpose()[0], model.predict(X))

0.9483988266208875