<a href="https://www.kaggle.com/code/jiayii1/godaddy-density-forecasting?scriptVersionId=118086749" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

referred to: https://www.kaggle.com/code/egorphysics/naive-forecasting-baseline-model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
train = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/train.csv")
test = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/test.csv")
submission = pd.read_csv("/kaggle/input/godaddy-microbusiness-density-forecasting/sample_submission.csv")
train.head(3)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269


#### data pre-processing

In [3]:
# converting first_day_of_month column to date
train["first_day_of_month"] = train["first_day_of_month"].apply(lambda x: pd.to_datetime(x))

# sorting values 
train = train.sort_values(['cfips', 'first_day_of_month'])

In [4]:
train.groupby('cfips').count()

Unnamed: 0_level_0,row_id,county,state,first_day_of_month,microbusiness_density,active
cfips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001,39,39,39,39,39,39
1003,39,39,39,39,39,39
1005,39,39,39,39,39,39
1007,39,39,39,39,39,39
1009,39,39,39,39,39,39
...,...,...,...,...,...,...
56037,39,39,39,39,39,39
56039,39,39,39,39,39,39
56041,39,39,39,39,39,39
56043,39,39,39,39,39,39


#### plotting microbusiness density across time per cfips (train)

In [5]:
num_plots = 3
fig = make_subplots(rows=num_plots, cols=1,
                   subplot_titles=(train.groupby('cfips').head(1)['cfips'].iloc[:num_plots].to_list()))

for idx, cfip in enumerate(train['cfips'].unique()[:num_plots]):
    
    fig.append_trace(go.Scatter(
    x=train['first_day_of_month'].loc[train['cfips'] == cfip],
    y=train['microbusiness_density'].loc[train['cfips'] == cfip],
    name=str(train['county'].loc[train['cfips'] == cfip].tail(1).values[0]) +\
        ', ' + str(train['state'].loc[train['cfips'] == cfip].tail(1).values[0])    
    ), row=idx+1, col=1)


fig.update_layout(font=dict(size=18), width=1000, height=1400)
fig.show()

In [6]:
train.head(3)

Unnamed: 0,row_id,cfips,county,state,first_day_of_month,microbusiness_density,active
0,1001_2019-08-01,1001,Autauga County,Alabama,2019-08-01,3.007682,1249
1,1001_2019-09-01,1001,Autauga County,Alabama,2019-09-01,2.88487,1198
2,1001_2019-10-01,1001,Autauga County,Alabama,2019-10-01,3.055843,1269


In [7]:
test['microbusiness_density'] = 0

In [8]:
for cfip in tqdm(train['cfips'].unique()):
    
    # Get the naive forecast
    naive_model = train['microbusiness_density'].loc[train['cfips'] == cfip].tail(1)
    
    # Insert the forecast into the test set
    test['microbusiness_density'].loc[test['cfips'] == cfip] = naive_model.values[0]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

100%|██████████| 3135/3135 [00:01<00:00, 1687.41it/s]


#### plotting microbusiness density across time per cfips (test)

In [9]:
num_plots = 3
fig = make_subplots(rows=num_plots, cols=1,
                   subplot_titles=(train.groupby('cfips').head(1)['cfips'].iloc[:num_plots].to_list()))

for idx, cfip in enumerate(train['cfips'].unique()[:num_plots]):
    
    fig.append_trace(go.Scatter(
    x=train['first_day_of_month'].loc[train['cfips'] == cfip],
    y=train['microbusiness_density'].loc[train['cfips'] == cfip],    
    name='Train',
    line=dict(color="blue", width=2)), row=idx+1, col=1)
    
    fig.append_trace(go.Scatter(
    x=test['first_day_of_month'].loc[test['cfips'] == cfip],
    y=test['microbusiness_density'].loc[test['cfips'] == cfip],    
    name='Forecast',
    line=dict(color="red", width=2)), row=idx+1, col=1)
    
# Removing repeating of names in the legend    
names = set()
fig.for_each_trace(
    lambda trace:
    trace.update(showlegend=False)
    if (trace.name in names) else names.add(trace.name))

fig.update_layout(template="simple_white", font=dict(size=18), width=1000, height=1500)
fig.show()

In [10]:
submission["microbusiness_density"] = test['microbusiness_density'].values
submission.to_csv("submission.csv", index=False)