In [1]:
import pandas as pd
import numpy as np
import holidays

train = pd.read_csv('train.csv', index_col='id')
test = pd.read_csv('test.csv', index_col='id')
label = 'num_sold'
categorical_features = ['country', 'store', 'product']
numerical_features = []

In [2]:
def feature_engineer(train, test):
    new_train = train.copy()
    new_test = test.copy()
    
    new_train["date"] = pd.to_datetime(new_train["date"])
    new_test["date"] = pd.to_datetime(new_test["date"])

    first_date = new_train["date"].min()

    holiday_map = {
        'Canada':    holidays.country_holidays('CA', years=range(2010, 2020)),
        'Finland':   holidays.country_holidays('FI', years=range(2010, 2020)),
        'Italy':     holidays.country_holidays('IT', years=range(2010, 2020)),
        'Kenya':     holidays.country_holidays('KE', years=range(2010, 2020)),
        'Norway':    holidays.country_holidays('NO', years=range(2010, 2020)),
        'Singapore': holidays.country_holidays('SG', years=range(2010, 2020)),
    }

    for df in [new_train, new_test]:

        # For each country in our holiday_map:
        for country, h in holiday_map.items():
            # Create a column called "holiday_<country>" 
            # that is 1 if the row's date is a holiday in that country, else 0.
            df[f'holiday_{country}'] = df['date'].apply(lambda x: 1 if x in h else 0)

        df['week_of_year'] = df['date'].dt.isocalendar().week
        df["year"] = df["date"].dt.year
        df["month"] = df["date"].dt.month

        df["day_of_month"] = df["date"].dt.day
        df["month_sin"] = np.sin(df['day_of_month'] * (2 * np.pi / 31))
        df["month_cos"] = np.cos(df['day_of_month'] * (2 * np.pi / 31))

        df["day_of_week"] = df["date"].dt.dayofweek
        df["week_sin"] = np.sin(df['day_of_week'] * (2 * np.pi / 7))
        df["week_cos"] = np.cos(df['day_of_week'] * (2 * np.pi / 7))
        
        df["day_of_year"] = df['date'].dt.dayofyear
        df["year_sin"] = np.sin(df['day_of_year'] * (2 * np.pi / 365))
        df["year_cos"] = np.cos(df['day_of_year'] * (2 * np.pi / 365))
        df["2yr_sin"] = np.sin(df['day_of_year'] * (2 * np.pi / 730))
        df["2yr_cos"] = np.cos(df['day_of_year'] * (2 * np.pi / 730))

        df["days_since_first_observation"] = (df["date"] - first_date).dt.days
        
    return new_train, new_test

In [3]:
train, test = feature_engineer(train, test)

In [4]:
agg_train, agg_test = feature_engineer(
    train.groupby(['date'])['num_sold'].sum().reset_index(),
    pd.DataFrame(test.date.unique(), columns=['date'])
)

In [5]:
def make_autogluon_model(time_limit):
    from autogluon.tabular import TabularPredictor
    
    class AutoGluonWrapper:
        def __init__(self, time_limit):
            self.model = None
            self.time_limit = time_limit
        
        def fit(self, train_data, label):
            self.model = TabularPredictor(
                label=label,
                eval_metric='mean_absolute_percentage_error'
            )
            self.model.fit(
                train_data,
                presets='best_quality',
                time_limit=self.time_limit,
            )
            
        def predict(self, test_data):
            return self.model.predict(test_data).values
    
    return AutoGluonWrapper(time_limit=time_limit)

def make_mljar_model(time_limit):
    from supervised.automl import AutoML
    class MLJARWrapper:
        def __init__(self, time_limit):
            self.model = None
            self.time_limit = time_limit
        
        def fit(self, train_data, label):
            self.model = AutoML(
                eval_metric='mape',
                mode='Compete',
                total_time_limit=self.time_limit
            )
            self.model.fit(
                train_data.drop(columns=[label]),
                train_data[label]
            )
        
        def predict(self, test_data):
            return self.model.predict(test_data)
    
    return MLJARWrapper(time_limit=time_limit)

def make_flaml_model(time_limit):
    from flaml import AutoML
    
    class FLAMLWrapper:
        def __init__(self, time_limit):
            self.model = AutoML()
            self.time_limit = time_limit
        
        def fit(self, train_data, label):
            self.model.fit(
                dataframe=train_data,
                label=label,
                time_budget=self.time_limit,
                metric='mape',
                task='regression'
            )
        
        def predict(self, test_data):
            return self.model.predict(test_data)
    
    return FLAMLWrapper(time_limit=time_limit)

time_limit = 2*60*60
#model = make_autogluon_model(time_limit=time_limit)
#model = make_mljar_model(time_limit=time_limit)
model = make_flaml_model(time_limit=time_limit)

model.fit(agg_train, 'num_sold')

agg_test['num_sold_agg'] = model.predict(agg_test)

[flaml.automl.logger: 01-10 00:26:47] {1728} INFO - task = regression
[flaml.automl.logger: 01-10 00:26:47] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 01-10 00:26:47] {1838} INFO - Minimizing error metric: mape
[flaml.automl.logger: 01-10 00:26:47] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 01-10 00:26:47] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 01-10 00:26:47] {2393} INFO - Estimated sufficient time budget=735s. Estimated necessary time budget=6s.
[flaml.automl.logger: 01-10 00:26:47] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.0724,	best estimator lgbm's best error=0.0724
[flaml.automl.logger: 01-10 00:26:47] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 01-10 00:26:47] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.0724,	best estimator lgbm's best error=0.0724
[flaml.automl.logger: 01-10 00:26:4

In [6]:
pred_test = pd.merge(
    test,
    agg_test[['date', 'num_sold_agg']],
    on='date',
    how='left'
)[['date', 'country', 'store', 'product', 'num_sold_agg']]
pred_test

Unnamed: 0,date,country,store,product,num_sold_agg
0,2017-01-01,Canada,Discount Stickers,Holographic Goose,82770.929688
1,2017-01-01,Canada,Discount Stickers,Kaggle,82770.929688
2,2017-01-01,Canada,Discount Stickers,Kaggle Tiers,82770.929688
3,2017-01-01,Canada,Discount Stickers,Kerneler,82770.929688
4,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode,82770.929688
...,...,...,...,...,...
98545,2019-12-31,Singapore,Premium Sticker Mart,Holographic Goose,71327.570312
98546,2019-12-31,Singapore,Premium Sticker Mart,Kaggle,71327.570312
98547,2019-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,71327.570312
98548,2019-12-31,Singapore,Premium Sticker Mart,Kerneler,71327.570312


# **COUNTRY RATIO**

In [7]:
def get_country_ratio(df):
    import requests
    def get_gdp_per_capita(alpha3, year):
        url='https://api.worldbank.org/v2/country/{0}/indicator/NY.GDP.PCAP.CD?date={1}&format=json'
        response = requests.get(url.format(alpha3,year)).json()
        return response[1][0]['value']
    rel = df[['date', 'country']].copy()
    rel['date'] = pd.to_datetime(rel['date'])
    alpha3s = {
        'Canada':'CAN',
        'Finland':'FIN',
        'Italy':'ITA',
        'Kenya':'KEN',
        'Norway':'NOR',
        'Singapore':'SGP'
    }
    rel['alpha3'] = rel['country'].map(alpha3s)
    years = range(2010,2020)
    rel['year'] = rel['date'].dt.year
    gdp = pd.DataFrame([
        [get_gdp_per_capita(alpha3, year) for year in years]
        for alpha3 in alpha3s.values()
    ], columns=years, index=alpha3s.keys())
    gdp = gdp/gdp.sum(axis=0)
    rel['relative_GDP'] = rel.apply(lambda s: gdp.loc[s['country'], s['year']], axis=1).drop(columns='alpha3')
    return rel['relative_GDP']

In [8]:
pred_test['country_ratio'] = get_country_ratio(pred_test)

# **PRODUCT RATIO**

In [9]:
from sklearn.linear_model import LinearRegression

def prepare_features(df, start_date):
    df = df.copy()
    df['t'] = (df['date'] - start_date).dt.days
    
    # 1-year cycle
    period_1yr = 365
    df['sin_1yr'] = np.sin(2 * np.pi * df['t'] / period_1yr)
    df['cos_1yr'] = np.cos(2 * np.pi * df['t'] / period_1yr)
    
    # 2-year cycle
    period_2yr = 730
    df['sin_2yr'] = np.sin(2 * np.pi * df['t'] / period_2yr)
    df['cos_2yr'] = np.cos(2 * np.pi * df['t'] / period_2yr)
    
    return df

def train_models_by_product(df):
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])
    start_date = df['date'].min()
    product_list = df['product'].unique().tolist()
    
    # Prepare the features (sin/cos for 1yr, sin/cos for 2yr)
    df_all_features = prepare_features(df, start_date)
    
    product_models = {}
    
    for product in product_list:
        df_prod = df_all_features[df_all_features['product'] == product]
        
        X = df_prod[['sin_1yr', 'cos_1yr', 'sin_2yr', 'cos_2yr']]
        y = df_prod['product_ratio']
        model = LinearRegression()
        model.fit(X, y)
        
        product_models[product] = model
        
    return product_models, start_date


def predict_ratios(df_new, product_models, start_date):
    df_new = df_new.copy()
    df_new['date'] = pd.to_datetime(df_new['date'])
    df_new_features = prepare_features(df_new, start_date)
    
    df_new_features['product_ratio'] = np.nan
    
    for product, model in product_models.items():
        mask = (df_new_features['product'] == product)
        
        X_future = df_new_features.loc[mask, ['sin_1yr','cos_1yr','sin_2yr','cos_2yr']]
        df_new_features.loc[mask, 'product_ratio'] = model.predict(X_future)
    
    return df_new_features['product_ratio']

def get_product_ratio(train, test):
    agg_sales = train.groupby(['date', 'product'])['num_sold'].sum().reset_index()
    total_sales = train.groupby(['date'])['num_sold'].sum().reset_index()
    product_ratio = pd.merge(
        agg_sales,
        total_sales,
        how='left',
        on='date',
        suffixes=('_agg','_total')
    )
    product_ratio['product_ratio'] = product_ratio['num_sold_agg'] / product_ratio['num_sold_total']

    product_models, start_date = train_models_by_product(product_ratio)

    return predict_ratios(test, product_models, start_date)

In [10]:
pred_test['product_ratio'] = get_product_ratio(train, pred_test)

# **STORE RATIO**

In [11]:
def get_store_ratio(train, test):
    agg_sales = train.groupby(['date', 'store'])['num_sold'].sum().reset_index()
    total_sales = train.groupby(['date'])['num_sold'].sum().reset_index()
    sales_ratio = pd.merge(
        agg_sales,
        total_sales,
        how='left',
        on='date',
        suffixes=('_agg','_total')
    )
    sales_ratio['store_ratio'] = sales_ratio['num_sold_agg'] / sales_ratio['num_sold_total']
    return test['store'].map(sales_ratio.groupby('store')['store_ratio'].mean().to_dict())

In [12]:
pred_test['store_ratio'] = get_store_ratio(train, pred_test)

# **FINAL PREDICTIONS**

In [13]:
pred_test['num_sold'] = pred_test['num_sold_agg'] * pred_test['country_ratio'] * pred_test['product_ratio'] * pred_test['store_ratio']

In [14]:
pred_test

Unnamed: 0,date,country,store,product,num_sold_agg,country_ratio,product_ratio,store_ratio,num_sold
0,2017-01-01,Canada,Discount Stickers,Holographic Goose,82770.929688,0.171772,0.051994,0.183811,135.879879
1,2017-01-01,Canada,Discount Stickers,Kaggle,82770.929688,0.171772,0.347890,0.183811,909.165999
2,2017-01-01,Canada,Discount Stickers,Kaggle Tiers,82770.929688,0.171772,0.263949,0.183811,689.796948
3,2017-01-01,Canada,Discount Stickers,Kerneler,82770.929688,0.171772,0.154391,0.183811,403.481065
4,2017-01-01,Canada,Discount Stickers,Kerneler Dark Mode,82770.929688,0.171772,0.181775,0.183811,475.046096
...,...,...,...,...,...,...,...,...,...
98545,2019-12-31,Singapore,Premium Sticker Mart,Holographic Goose,71327.570312,0.242061,0.050433,0.442659,385.444608
98546,2019-12-31,Singapore,Premium Sticker Mart,Kaggle,71327.570312,0.242061,0.329944,0.442659,2521.687661
98547,2019-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,71327.570312,0.242061,0.294004,0.442659,2247.008708
98548,2019-12-31,Singapore,Premium Sticker Mart,Kerneler,71327.570312,0.242061,0.149457,0.442659,1142.264872


In [15]:
import plotly.express as px

def plot_ratios(df, column):
    agg_sales = df.groupby(['date', column])['num_sold'].sum().reset_index()
    total_sales = df.groupby(['date'])['num_sold'].sum().reset_index()
    sales_ratio = pd.merge(
        agg_sales,
        total_sales,
        how='left',
        on='date',
        suffixes=('_agg','_total')
    )
    sales_ratio['ratio'] = sales_ratio['num_sold_agg'] / sales_ratio['num_sold_total']

    fig = px.line(
        sales_ratio,
        x='date',
        y='ratio',
        color=column,
        title=f"Time series of sales ratio aggregated by '{column}'.",
        template='simple_white',
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        font=dict(
            family="Helvetica",
            size=12,
            color="black"
        ),
        xaxis_title=f"Time",
        yaxis_title="Ratio",
        showlegend=True,
        margin=dict(l=30, r=30, t=60, b=30)
    )
    fig.show()

plot_ratios(pred_test, 'store')
plot_ratios(pred_test, 'country')
plot_ratios(pred_test, 'product')

In [16]:
px.line(
    pd.concat([
        agg_train,
        agg_test.rename(columns={'num_sold_agg':'num_sold'})
    ]),
    x='date',
    y='num_sold'
)

In [17]:
pd.DataFrame(
    pred_test['num_sold'].astype('int32').values,
    index=test.index
).to_csv('feature_engineering_flaml_2hrs.csv')