In [1]:
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm

# Evaluation metric

In [2]:
def smape(y_true, y_pred):
    '''
    Symmetric mean absolute percentage error, expressed in percent.

    https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error

    Parameters
    ----------
    y_true, y_pred : array-like of numbers with the same shape
        Lists, tuples, numpy arrays and pandas Series are accepted. Values
        are compared positionally (pandas index labels are ignored).

    Returns
    -------
    float
        100 * mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)).
        Positions where both values are exactly 0 contribute 0 instead of
        the undefined 0/0.
    '''
    # Coerce up front so plain Python sequences work and the arithmetic below
    # is always element-wise numpy arithmetic.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Start from zeros so the "both are zero" positions keep a 0 contribution.
    smap = np.zeros(num.shape)
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

# Prepare train and test sets

In [3]:
# based on https://www.kaggle.com/code/vadimkamaev/score-1-382/notebook

# Directory that the loader functions in this notebook read their CSV files from.
data_dir = '../data/'

def get_train_test():
    """Load the train, test and sample-submission frames from ``data_dir``.

    Returns
    -------
    train : DataFrame
        ``train.csv`` stacked with ``revealed_test.csv``, sorted by
        ``cfips`` and ``first_day_of_month``, re-indexed from 0 and tagged
        with ``is_test = 0``.
    test : DataFrame
        ``test.csv`` without the 2022-11-01 and 2022-12-01 rows (those months
        are already part of ``train`` via the revealed data), tagged with
        ``is_test = 1``. The original row index of the file is kept.
    sub : DataFrame
        ``sample_submission.csv`` as read from disk.
    """
    # combine the original training set with the revealed data
    train = (
        pd.concat([
            pd.read_csv(f'{data_dir}/train.csv'),
            pd.read_csv(f'{data_dir}/revealed_test.csv'),
        ])
        .sort_values(by=['cfips', 'first_day_of_month'])
        .reset_index(drop=True)
    )
    train['is_test'] = 0

    # drop the test data that has already been revealed
    test = pd.read_csv(f'{data_dir}/test.csv')
    drop_index = (test.first_day_of_month == '2022-11-01') | (test.first_day_of_month == '2022-12-01')
    # Take an explicit copy: adding a column to a filtered selection otherwise
    # triggers SettingWithCopyWarning and is ambiguous about what gets modified.
    test = test.loc[~drop_index, :].copy()
    test['is_test'] = 1

    sub = pd.read_csv(f'{data_dir}/sample_submission.csv')
    return train, test, sub

# Load the frames and print their row counts as a quick sanity check.
train, test, sub = get_train_test()
print(len(train), len(test), len(sub))

128535 18810 25080


# Combine raw data

In [4]:
def get_raw(train, test):
    """Stack ``train`` and ``test`` into one frame and derive the base columns.

    The combined frame is sorted by ``cfips`` and ``row_id`` and re-indexed
    from 0. On top of the input columns it gets:

    * ``first_day_of_month`` parsed to datetimes,
    * ``county`` / ``state`` forward-filled within each ``cfips``,
    * ``county_i`` / ``state_i``: integer codes for county+state and state,
    * ``month_number``: 0-based position of the row within its ``cfips``,
    * ``y``: the former ``microbusiness_density`` column (which is dropped).

    Returns
    -------
    (DataFrame, list)
        The combined frame and the initial feature list ``['state_i']``.
    """
    stacked = pd.concat([train, test])
    stacked = stacked.sort_values(['cfips', 'row_id'])
    stacked = stacked.reset_index(level=0, drop=True)

    stacked['first_day_of_month'] = pd.to_datetime(stacked['first_day_of_month'])

    # Rows with missing names inherit the last seen value of the same cfips.
    for name_col in ('county', 'state'):
        stacked[name_col] = stacked.groupby('cfips')[name_col].ffill()

    county_codes, _ = pd.factorize(stacked['county'] + stacked['state'])
    state_codes, _ = pd.factorize(stacked['state'])
    stacked['county_i'] = county_codes
    stacked['state_i'] = state_codes

    stacked['month_number'] = stacked.groupby('cfips')['row_id'].cumcount()

    # Move the target under the shorter name `y`.
    stacked['y'] = stacked['microbusiness_density']
    stacked = stacked.drop(columns='microbusiness_density')

    return stacked, ['state_i']

# Build the combined frame and the initial feature list used by the cells below.
raw, features = get_raw(train, test)

In [5]:
# os.environ['CUDA_VISIBLE_DEVICES']='0'
# raw['scale'] = (raw['first_day_of_month'] - raw['first_day_of_month'].min()).dt.days
# raw['scale'] = raw['scale'].factorize()[0]
# raw
# raw.groupby('cfips')['microbusiness_density'].shift(-1)/raw['microbusiness_density'] - 1
# raw['lastactive'] = raw.groupby('cfips')['active'].transform('last')

# Feature engineering: lag and rolling

In [6]:
def add_lag(raw, features, target='y', max_lag=8):
    """Add per-county lagged copies of ``target`` as feature columns.

    For every ``lag`` in ``1 .. max_lag - 1`` a column ``{target}_lag_{lag}``
    is written to ``raw`` holding the value of ``target`` ``lag`` rows earlier
    within the same ``cfips`` (NaN where no earlier row exists).

    Parameters
    ----------
    raw : DataFrame
        Must contain ``cfips`` and ``target``; modified in place.
    features : list of str
        Feature-name list; extended in place with the new column names.
    target : str
        Column to lag.
    max_lag : int
        Exclusive upper bound of the lags (the default 8 yields lags 1-7).

    Returns
    -------
    (DataFrame, list)
        The same ``raw`` and ``features`` objects that were passed in.
    """
    for lag in range(1, max_lag):
        col = f'{target}_lag_{lag}'
        raw[col] = raw.groupby('cfips')[target].shift(lag)
        # Re-running the notebook cell must not register the same feature
        # twice, otherwise selecting `raw[features]` yields duplicated columns.
        if col not in features:
            features.append(col)
    return raw, features

def add_rolling(raw, features, target='y', lags=(1,), windows=(2, 4, 6, 8, 10)):
    """Add per-county rolling sums of existing lag columns as features.

    For every ``lag`` in ``lags`` and every ``window`` in ``windows`` a column
    ``{target}_roll_{window}_{lag}`` is written to ``raw``. It holds the
    rolling sum (``min_periods=1``) of ``{target}_lag_{lag}`` computed
    separately for each ``cfips``.

    Parameters
    ----------
    raw : DataFrame
        Must contain ``cfips`` and the ``{target}_lag_{lag}`` columns;
        modified in place.
    features : list of str
        Feature-name list; extended in place with the new column names.
    target : str
        Prefix of the lag columns to aggregate.
    lags : iterable of int
        Lags whose columns are aggregated.
    windows : iterable of int
        Rolling window lengths, in rows.

    Returns
    -------
    (DataFrame, list)
        The same ``raw`` and ``features`` objects that were passed in.
    """

    def get_rolling(s, window):
        return s.rolling(window, min_periods=1).sum()

    for lag in lags:
        for window in windows:
            col = f'{target}_roll_{window}_{lag}'
            raw[col] = raw.groupby('cfips')[f'{target}_lag_{lag}'].transform(get_rolling, window=window)
            # Keep the feature list free of duplicates when the cell is re-run.
            if col not in features:
                features.append(col)
    return raw, features

# Feature engineering: internal and external census

In [7]:
def add_internal_census(raw, features):
    """Left-join ``census_starter.csv`` onto ``raw`` by ``cfips``.

    Every column of the census file except ``cfips`` is appended to
    ``features`` (the list is extended in place).

    Returns
    -------
    (DataFrame, list)
        The merged frame (a new object) and the extended feature list.
    """
    census = pd.read_csv(f'{data_dir}/census_starter.csv')
    census_cols = [col for col in census.columns if col != 'cfips']
    raw = raw.merge(census, on='cfips', how='left')
    # The original referenced an undefined name `census_columns` here.
    features += census_cols
    return raw, features

def add_external_census(raw, features):
    '''
    Left-join the county population-estimates file onto ``raw`` by ``cfips``.

    data: https://www2.census.gov/programs-surveys/popest/datasets/2020-2021/counties/totals/co-est2021-alldata.csv
    schema: https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2020-2021/CO-EST2021-ALLDATA.pdf

    The join key is built as ``STATE * 1000 + COUNTY``. All columns of the
    file are merged into ``raw``; the columns listed below are appended to
    ``features`` (the list is extended in place).

    Returns the merged frame (a new object) and the extended feature list.
    '''
    census = pd.read_csv(f'{data_dir}/co-est2021-alldata.csv', encoding='latin-1')
    # The original read the file into `census` but then used an undefined
    # name `survey`; use the loaded frame consistently.
    census['cfips'] = census.STATE*1000 + census.COUNTY
    features += [
        'SUMLEV',
        'REGION',
        'DIVISION',
        'ESTIMATESBASE2020',
        'POPESTIMATE2020',
        'POPESTIMATE2021',
        'NPOPCHG2020',
        'NPOPCHG2021',
        'BIRTHS2020',
        'BIRTHS2021',
        'DEATHS2020',
        'DEATHS2021',
        'NATURALCHG2020',
        'NATURALCHG2021',
        'INTERNATIONALMIG2020',
        'INTERNATIONALMIG2021',
        'DOMESTICMIG2020',
        'DOMESTICMIG2021',
        'NETMIG2020',
        'NETMIG2021',
        'RESIDUAL2020',
        'RESIDUAL2021',
        'GQESTIMATESBASE2020',
        'GQESTIMATES2020',
        'GQESTIMATES2021',
        'RBIRTH2021',
        'RDEATH2021',
        'RNATURALCHG2021',
        'RINTERNATIONALMIG2021',
        'RDOMESTICMIG2021',
        'RNETMIG2021'
    ]
    raw = raw.merge(census, on='cfips', how='left')
    return raw, features

# Feature engineering: coordinates

In [8]:
def add_coords(raw, features):
    '''
    Left-join county coordinates onto ``raw`` by ``cfips``.

    https://www.kaggle.com/datasets/alejopaullier/usa-counties-coordinates

    The ``name`` column of ``cfips_location.csv`` is discarded; ``lng`` and
    ``lat`` are appended to ``features`` (the list is extended in place).

    Returns the merged frame (a new object) and the extended feature list.
    '''
    coords = pd.read_csv(f'{data_dir}/cfips_location.csv').drop('name', axis=1)
    # Use a left join like the census helpers: the default inner join would
    # silently drop every row of `raw` whose cfips has no coordinates.
    raw = raw.merge(coords, on='cfips', how='left')
    features += ['lng', 'lat']
    return raw, features

# Feature engineering: all

In [9]:
# Only the lag features are enabled here; the cell output lists the resulting feature names.
raw, features = add_lag(raw, features)
features

['state_i',
 'y_lag_1',
 'y_lag_2',
 'y_lag_3',
 'y_lag_4',
 'y_lag_5',
 'y_lag_6',
 'y_lag_7']

# Model

In [10]:
def get_model():
    """Return a fresh, unfitted CatBoost regressor with the notebook's settings.

    Only ``catboost`` is imported; the unused lightgbm / xgboost / sklearn
    imports were removed so those packages are no longer needed just to build
    the model.
    """
    import catboost as cat

    # we should decrease the num_iterations of catboost
    cat_model = cat.CatBoostRegressor(
        iterations=800,
        loss_function="MAPE",
        verbose=0,
        grow_policy='SymmetricTree',
        learning_rate=0.035,
        colsample_bylevel=0.8,
        max_depth=5,
        l2_leaf_reg=0.2,
        # max_leaves = 17,
        subsample=0.70,
        max_bin=4096,
    )
    return cat_model

# Smoke check: build one model so its repr shows up as the cell output.
get_model()

<catboost.core.CatBoostRegressor at 0x21b09885fc0>

# Training

In [22]:
# Validation loop.
# For each month TS in [MONTH_1, MONTH_last) a fresh model is fitted on the
# non-test rows with 1 <= month_number < TS and used to predict the non-test
# rows at month_number == TS. Those predictions (and the actual value at TS,
# as a "last value" baseline) are then mapped by cfips onto the rows of month
# TS + 1 and scored there with SMAPE.
# NOTE(review): the model is fitted on `y` of the same row, yet its output for
# month TS is scored against `y` of month TS + 1 - confirm this is intended.

ACT_THR = 140  # only referenced by the commented-out `lastactive` filter below
MONTH_1 = 39
MONTH_last = 40

raw['ypred_last'] = np.nan
raw['ypred'] = np.nan
raw['k'] = 1.
# Assign the result back: `raw['y'].fillna(0, inplace=True)` operates on a
# column selection and is not guaranteed to write through to `raw`.
raw['y'] = raw['y'].fillna(0)

for TS in range(MONTH_1, MONTH_last): #40):
    print(TS)
    model = get_model()
    train_idxs = (raw.is_test==0) & (raw.month_number  < TS) & (raw.month_number >= 1) # & (raw.lastactive>ACT_THR)
    valid_idxs = (raw.is_test==0) & (raw.month_number == TS)
    model.fit(
        raw.loc[train_idxs, features],
        # raw.loc[train_idxs, 'target'].clip(-0.0043, 0.0045),
        raw.loc[train_idxs, 'y'],
    )

    # The original referenced an undefined name `valid_indices` here, which
    # raises NameError on a fresh kernel.
    ypred = model.predict(raw.loc[valid_idxs, features])
    # NOTE(review): `k` is not read anywhere else in this cell - confirm it is still needed.
    raw.loc[valid_idxs, 'k'] = (ypred + 1)*raw.loc[valid_idxs,'y']
    raw.loc[valid_idxs, 'yhat'] = ypred


    # Validate
    ylast_dict = raw.loc[raw.month_number==TS, ['cfips', 'y']].set_index('cfips').to_dict()['y']
    yhat_dict = raw.loc[raw.month_number==TS, ['cfips', 'yhat']].set_index('cfips').to_dict()['yhat']
    df = raw.loc[raw.month_number == (TS+1),
                 ['cfips', 'y', 'state',
#                   'lastactive',
                  'y_lag_1']].reset_index(drop=True)
    df['yhat'] = df['cfips'].map(yhat_dict)
    df['ylast'] = df['cfips'].map(ylast_dict)
#     df.loc[df['lastval'].isnull(), 'lastval'] = df.loc[df['lastval'].isnull(), 'microbusiness_density']

    # df.loc[df['lastactive']<=ACT_THR, 'pred'] = df.loc[df['lastactive']<=ACT_THR, 'lastval']

    # `df` holds exactly the rows selected by this mask, in the same order,
    # so the values can be written back positionally.
    raw.loc[raw.month_number==(TS+1), 'ypred'] = df['yhat'].values
    raw.loc[raw.month_number==(TS+1), 'ypred_last'] = df['ylast'].values

    print(f'TS: {TS}')
    print('Last Value SMAPE:', smape(df['y'], df['ylast']) )
    print('SMAPE:', smape(df['y'], df['yhat']))

# Overall score across all scored months (MONTH_1 + 1 .. MONTH_last).
ind = (raw.month_number > MONTH_1)&(raw.month_number <= MONTH_last)

print( 'SMAPE:', smape( raw.loc[ind, 'y'],  raw.loc[ind, 'ypred'] ) )
print( 'Last Value SMAPE:', smape( raw.loc[ind, 'y'],  raw.loc[ind, 'ypred_last'] ) )

39
TS: 39
Last Value SMAPE: 1.889206717018118
SMAPE: 3.387869772778022
SMAPE: 3.387869772778022
Last Value SMAPE: 1.889206717018118
