In [1]:
import os
import pathlib
from datetime import datetime
from dateutil import relativedelta

import numpy as np
import pandas as pd

In [2]:
# Directory containing the input CSV files.
input_dir = pathlib.Path("../input/godaddy-microbusiness-density-forecasting/")

# Load every CSV into its own DataFrame (one read per file, in this order).
train = pd.read_csv(input_dir.joinpath("train.csv"))
census = pd.read_csv(input_dir.joinpath("census_starter.csv"))
revealed_test = pd.read_csv(input_dir.joinpath("revealed_test.csv"))
test = pd.read_csv(input_dir.joinpath("test.csv"))
sample_submission = pd.read_csv(input_dir.joinpath("sample_submission.csv"))

In [3]:
def diff_month(start_date, end_date):
    """Return the number of whole months from ``start_date`` to ``end_date``.

    Args:
        start_date: Start date as a ``"%Y-%m-%d"`` string.
        end_date: End date as a ``"%Y-%m-%d"`` string.

    Returns:
        int: Whole months between the two dates, including any full years
        (e.g. 14 for a gap of one year and two months). Negative when
        ``end_date`` is earlier than ``start_date``.

    Raises:
        ValueError: If either string does not match ``"%Y-%m-%d"``.
    """
    s = datetime.strptime(start_date, "%Y-%m-%d")
    e = datetime.strptime(end_date, "%Y-%m-%d")

    diff = relativedelta.relativedelta(e, s)
    # relativedelta splits the gap into years + months (+ days ...), so
    # `.months` on its own is only the remainder after whole years are taken
    # out. Fold the years back in so spans of a year or more are not truncated.
    return diff.years * 12 + diff.months
    
    

# NOTE: the original note here said the test data covers 11 months; that does
# not match the range printed by this cell — TODO confirm which is intended.
# Each row_id is split on "_" and the second piece is kept as the date string.
sample_date = sample_submission["row_id"].str.split("_").str.get(1)

# Show the first and last date, then the month difference between them.
print(sample_date.iat[0], "~", sample_date.iat[-1])
print(f"{diff_month(sample_date.iat[0], sample_date.iat[-1])}ヶ月")

2022-11-01 ~ 2023-06-01
7ヶ月


In [4]:
# Stack train and revealed_test, order the rows by cfips and then by
# first_day_of_month, renumber the index, and parse the month column
# (still a "%Y-%m-%d" string at sort time) into datetimes.
raw = (
    pd.concat([train, revealed_test])
    .sort_values(by=["cfips", "first_day_of_month"])
    .reset_index(drop=True)
    .assign(
        first_day_of_month=lambda frame: pd.to_datetime(
            frame["first_day_of_month"], format="%Y-%m-%d"
        )
    )
)

raw.tail()

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
128530,56045_2022-08-01,56045,Weston County,Wyoming,2022-08-01,1.785395,100
128531,56045_2022-09-01,56045,Weston County,Wyoming,2022-09-01,1.785395,100
128532,56045_2022-10-01,56045,Weston County,Wyoming,2022-10-01,1.785395,100
128533,56045_2022-11-01,56045,Weston County,Wyoming,2022-11-01,1.785395,100
128534,56045_2022-12-01,56045,Weston County,Wyoming,2022-12-01,1.803249,101


## Split Data

In [5]:
# Rows dated strictly before base_date form the training set; rows dated on
# or after it form the validation set.
# NOTE(review): this rebinds `train`, which was loaded from train.csv and is
# an input to the `raw` concat, so re-running that cell after this one would
# start from the truncated frame.
base_date = "2022-06-01"

train = raw[raw["first_day_of_month"] < base_date]
valid = raw[raw["first_day_of_month"] >= base_date]

## Feature

## Train

In [6]:
# Lookup from cfips to the final microbusiness_density seen in the training
# rows. `raw` was sorted by cfips and month, so "last" is the latest month.
cfips_lastTarget = (
    train
    .groupby("cfips")["microbusiness_density"]
    .last()
    .to_dict()
)

In [7]:
# Baseline forecast: every validation row gets its cfips' last training value.
valid = valid.assign(
    predict=lambda frame: frame["cfips"].map(cfips_lastTarget)
)

## Evaluation

In [8]:
def _smape_ratio(y_true, y_pred):
    """Return the unscaled per-element symmetric absolute percentage error."""
    # Coerce to arrays so plain lists (and other array-likes) are accepted;
    # values are then compared positionally.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    num = np.abs(y_true - y_pred)
    dem = (np.abs(y_true) + np.abs(y_pred)) / 2

    smap = np.zeros(num.shape)
    # Divide only where at least one side is non-zero; pairs where both are
    # zero keep an error of 0 instead of producing 0/0.
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return smap


def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``,
    counting elements where both values are zero as an error of 0.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, broadcastable with ``y_true``.

    Returns:
        float: The mean error in percent.
    """
    return 100 * np.mean(_smape_ratio(y_true, y_pred))


def vsmape(y_true, y_pred):
    """Per-element symmetric absolute percentage error, in percent.

    Same formula as :func:`smape` but without the final averaging.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, broadcastable with ``y_true``.

    Returns:
        numpy.ndarray: One error value (in percent) per element.
    """
    return 100 * _smape_ratio(y_true, y_pred)

In [9]:
# Score the last-value baseline on the validation rows.
smape(
    y_true=valid["microbusiness_density"].to_numpy(),
    y_pred=valid["predict"].to_numpy(),
)

3.8283454743276013

## Submission

In [10]:
# Rebuild the cfips -> last density lookup from all of `raw` (train plus
# revealed test), replacing the train-only lookup used for validation.
cfips_lastTarget = (
    raw
    .groupby("cfips")["microbusiness_density"]
    .last()
    .to_dict()
)

# Use that last known value as the forecast for every test row of the cfips.
test["microbusiness_density"] = test["cfips"].map(cfips_lastTarget)

In [11]:
# Preview the first rows of `test`, now including the forecast column.
test.head()

Unnamed: 0,row_id,cfips,first_day_of_month,microbusiness_density
0,1001_2022-11-01,1001,2022-11-01,3.470915
1,1003_2022-11-01,1003,2022-11-01,8.25063
2,1005_2022-11-01,1005,2022-11-01,1.252272
3,1007_2022-11-01,1007,2022-11-01,1.28724
4,1009_2022-11-01,1009,2022-11-01,1.85206


In [12]:
# Write only row_id and the forecast to the submission file, without the index.
test.to_csv(
    "submission.csv",
    columns=["row_id", "microbusiness_density"],
    index=False,
)

In [13]:
# Shell out to show the first lines of the written submission file.
!head submission.csv

row_id,microbusiness_density
1001_2022-11-01,3.4709148
1003_2022-11-01,8.2506304
1005_2022-11-01,1.2522722
1007_2022-11-01,1.28724
1009_2022-11-01,1.8520604
1011_2022-11-01,1.0260957
1013_2022-11-01,2.1990211
1015_2022-11-01,2.7915807
1017_2022-11-01,1.472754
