## Example of Predictive Model

In [1]:
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor

In [2]:
# ROOT_PATH.txt is parsed as a CSV and the name of its first column is taken
# as the project root directory.
ROOT_PATH = pd.read_csv("ROOT_PATH.txt").columns[0]
PROCESSED_PATH = os.path.join(ROOT_PATH, "datasets", "processed")
del ROOT_PATH  # the root itself is not needed once PROCESSED_PATH exists

# All input and output CSV files live in the processed-data directory.
(
    file_path_train,
    file_path_test,
    file_path_sample_submission,
    file_path_submission,
) = [
    os.path.join(PROCESSED_PATH, csv_name)
    for csv_name in (
        "train.csv",
        "test.csv",
        "sample_submission.csv",
        "submission.csv",
    )
]

In [3]:
def read_supplementary(filename, parammeter):
    """Load one supplementary parquet file as yearly per-state medians.

    The file is read from ``PROCESSED_PATH``, restricted to the rows whose
    'Parameter.Name' equals ``parammeter``, and the 'Arithmetic.Mean' values
    are aggregated with a median for every ('State.Name', year) pair, where
    the year is taken from the ``.year`` attribute of 'Date.Local'.

    Args:
        filename: Name of the parquet file inside ``PROCESSED_PATH``.
        parammeter: Value of 'Parameter.Name' to keep; it is also used as
            the name of the single output column.

    Returns:
        A DataFrame indexed by ('State.Name', 'Year') with one column named
        after ``parammeter``.
    """
    file_path = os.path.join(PROCESSED_PATH, filename)
    raw = pd.read_parquet(file_path)

    # Work on an explicit copy: modifying a boolean-filtered slice in place
    # is the chained-assignment pattern that triggers SettingWithCopyWarning.
    selected = raw[raw['Parameter.Name'] == parammeter].copy()
    selected = selected.drop(columns=["Parameter.Name", "X1st.Max.Value"])

    # Collapse each date to its calendar year so rows can be grouped yearly.
    selected['Date.Local'] = selected['Date.Local'].apply(lambda x: x.year)
    selected = selected.rename(
        columns={'Date.Local': 'Year', 'Arithmetic.Mean': parammeter}
    )

    yearly_median = selected.groupby(["State.Name", "Year"])[parammeter].median()
    return yearly_median.to_frame()

In [4]:
def input_supplementary(input_data):
    """Combine several supplementary sources into one wide table.

    Args:
        input_data: DataFrame with a 'file' and a 'parameter' column, one
            row per source to load. It must contain at least one row.

    Returns:
        The outer merge, on ('State.Name', 'Year'), of the tables returned
        by ``read_supplementary`` for every row of ``input_data``.
    """
    sources = list(zip(input_data["file"], input_data["parameter"]))

    # The first source seeds the result; the others are merged in order.
    first_file, first_parameter = sources[0]
    merged = read_supplementary(first_file, first_parameter)
    for file_name, parameter in sources[1:]:
        merged = pd.merge(
            merged,
            read_supplementary(file_name, parameter),
            on=["State.Name", "Year"],
            how='outer',
        )
    return merged

In [5]:
def supp_feature(supplementary, state, year, lag):
    """Return the windowed column medians for one state and year.

    Args:
        supplementary: DataFrame whose index has the levels 'State.Name'
            and 'Year'.
        state: Value of the 'State.Name' level to select.
        year: Last year of the window (inclusive).
        lag: Number of years before ``year`` that the window reaches back;
            the window is ``year - lag`` through ``year``, both inclusive.

    Returns:
        A Series holding 'State.Name' and 'Year' followed by the median of
        every column of ``supplementary`` over the selected rows.
    """
    state_level = supplementary.index.get_level_values('State.Name')
    year_level = supplementary.index.get_level_values('Year')

    in_window = (
        (state_level == state)
        & (year_level >= year - lag)
        & (year_level <= year)
    )
    window_median = supplementary[in_window].median()

    key = pd.Series(index=['State.Name', 'Year'], data=[state, year])
    return pd.concat([key, window_median])

In [6]:
def age_cleaning(x):
    """Convert an age label such as "65-69" into a numeric code.

    The integer before the first "-" is divided by 5 and shifted by 13, so
    "65-69" maps to 0.0 and "70-74" to 1.0.

    Args:
        x: Age label string; the text before the first "-" must be an
            integer literal.

    Returns:
        The numeric code as a float.

    Raises:
        ValueError: If the text before the first "-" is not an integer.
    """
    lower_bound = x.partition("-")[0]
    return int(lower_bound) / 5 - 13

In [7]:
# One row per supplementary source: the parquet file to read and the
# 'Parameter.Name' value to extract from it.
input_data = pd.DataFrame(
    [
        ("supplementary_NONOxNOy.parquet", "Nitric oxide (NO)"),
        ("supplementary_LEAD.parquet", "Lead (TSP) STP"),
        ("supplementary_VOCS.parquet", "Toluene"),
        ("supplementary_VOCS.parquet", "Total NMOC (non-methane organic compound)"),
        ("supplementary_HAPS.parquet", "Benzene"),
        ("supplementary_HAPS.parquet", "Ethylene dichloride"),
    ],
    columns=["file", "parameter"],
)

In [8]:
# Model input columns. Kept as an ordered list literal: building the list
# from a set of strings gives an order that changes between interpreter runs
# (string hashing is randomised), which would change the column order of X
# and make the seeded model non-reproducible.
features = [
    'Age',
    'Nitric oxide (NO)',
    'Lead (TSP) STP',
    'Toluene',
    'Total NMOC (non-methane organic compound)',
    'Benzene',
    'Ethylene dichloride',
]

In [9]:
# Load every source listed in input_data and merge them into one table
# indexed by ('State.Name', 'Year').
supplementary = input_supplementary(input_data)

In [10]:
# Load the training set and replace each 'Age' label with its numeric code.
train = pd.read_csv(file_path_train).assign(
    Age=lambda frame: frame['Age'].map(age_cleaning)
)

In [11]:
# Sorted distinct values of the first index level of the supplementary table.
locations = sorted(supplementary.index.get_level_values(0).unique())

In [12]:
# Replace the yearly table with one row per (location, year) for 1990-2018;
# each row holds the column medians over that year and the 10 years before it.
supplementary = pd.DataFrame(
    [
        supp_feature(supplementary, state_name, target_year, 10)
        for state_name in locations
        for target_year in range(1990, 2019)
    ]
)

In [13]:
# Left join: every training row is kept, and rows without a matching
# ('State.Name', 'Year') entry get NaN in the supplementary columns.
train_supp = train.merge(supplementary, on=["State.Name", "Year"], how='left')
del train, locations

In [14]:
# Regression target.
y = train_supp['Incidence']

In [15]:
# Training feature matrix, restricted to the columns listed in `features`.
X = train_supp[features]

In [16]:
# The merged training frame is no longer needed once X and y are extracted.
del(train_supp)

In [17]:
# Fill gaps in the training features with a cubic spline along the rows,
# then set any value the spline could not fill to 0. The test features get
# the same interpolate + fillna(0) treatment before prediction, so both sets
# are preprocessed identically and no NaN reaches model.fit.
X = X.interpolate(method="spline", order=3).fillna(0)

In [18]:
# Random forest with 100 trees; the fixed random_state seeds the estimator.
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [19]:
# Fit the model on the training features and the 'Incidence' target.
model.fit(X, y)

In [20]:
# Load the test set and replace each 'Age' label with its numeric code,
# exactly as is done for the training set.
test = pd.read_csv(file_path_test).assign(
    Age=lambda frame: frame['Age'].map(age_cleaning)
)

In [21]:
# Left join: every test row is kept, and rows without a matching
# ('State.Name', 'Year') entry get NaN in the supplementary columns.
test_supp = test.merge(supplementary, on=["State.Name", "Year"], how='left')
del test

In [22]:
# Rebind X to the test feature matrix, using the same columns as in training.
X = test_supp.loc[:, features]
del test_supp

In [23]:
# Fill gaps in the test features with a cubic spline along the rows.
X = X.interpolate(method="spline", order=3)

In [24]:
# Any value still missing after interpolation is set to 0.
X = X.fillna(0)

In [25]:
# The sample submission file provides the frame the predictions are written into.
submission = pd.read_csv(file_path_sample_submission)

In [26]:
# Predictions are assigned by position.
# NOTE(review): this assumes the sample submission rows are in the same order
# as the rows of test.csv - confirm.
submission['Incidence'] = model.predict(X)

In [27]:
# Write the submission file without the DataFrame index column.
submission.to_csv(file_path_submission, index=False)