In [1]:
import pandas as pd
import load_datasets
import os
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np

# Load the meta features

In [6]:
OUTPUT_DIR = 'output'
training_names = ['lightgbm', 'xgboost', 'rf', 'linear']
X_train = []
X_test = []

for t in training_names:
    HDF = os.path.join(OUTPUT_DIR, t + '.h5')
    X_train.append(pd.read_hdf(HDF, key='val'))
    X_test.append(pd.read_hdf(HDF, key='test'))

In [7]:
df_X_train = pd.concat(X_train, axis=1)
df_X_test = pd.concat(X_test, axis=1)

In [8]:
df_X_train

Unnamed: 0,lightgbmval,xgboostval,rfval,linearval
0,0.020091,0.020430,0.032904,-0.001108
1,0.055717,0.029540,0.070650,-0.000976
2,0.233411,0.281526,0.267985,0.052926
3,0.246624,0.262361,0.283776,0.218397
4,0.017394,0.002105,0.032904,0.001057
...,...,...,...,...
238167,0.131691,0.148181,0.130867,0.049634
238168,0.144753,0.157528,0.150896,0.000904
238169,0.142401,0.163682,0.145907,0.013563
238170,0.022507,0.003364,0.066953,-0.000014


In [9]:
_, y_val = load_datasets.val()

# Train the meta Model

In [10]:
meta_model = LinearRegression()
meta_model.fit(df_X_train, y_val)

LinearRegression()

In [18]:
meta_model.coef_

array([ 0.8081588 ,  0.41086136, -0.32446391,  0.03528179])

# Make Predictions

In [13]:
DATA_DIR = 'data'

pred_test = meta_model.predict(df_X_test)

pred = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
pred['item_cnt_month'] = pred_test
pred['item_cnt_month'] = pred['item_cnt_month'].clip(lower=0, upper=20)
pred.to_csv(os.path.join(OUTPUT_DIR, 'stacking.csv'), index=False)

In [19]:
pred.head()

Unnamed: 0,ID,item_cnt_month
0,0,0.323341
1,1,0.223566
2,2,0.85878
3,3,0.419711
4,4,4.621639


Your public LB scores are: 0.879874

# Post-processing

In [22]:
items = pd.read_csv(os.path.join(DATA_DIR, "items.csv"))
raw_test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
def fix_fallout4(s):
    fallout4_items = items[items.item_name.str.contains('Fallout 4')].item_id.unique()
    fallout4_digital_items = items[(items.item_name.str.contains('Fallout 4')) & (items.item_category_id==31)].item_id.unique()
    fallout4_non_digital_items = items[(items.item_name.str.contains('Fallout 4')) & (items.item_category_id!=31)].item_id.unique()
    # print(fallout4_items, fallout4_digital_items)
    
    s.loc[raw_test[raw_test.item_id.isin(fallout4_items)]['ID']] = 20
    s.loc[raw_test[(raw_test.item_id.isin(fallout4_digital_items)) & (raw_test.shop_id!=55)]['ID']] = 0
    s.loc[raw_test[(raw_test.item_id.isin(fallout4_non_digital_items)) & (raw_test.shop_id==55)]['ID']] = 0
    return s

def fix_digital(s):
    cond_digital_items = (
        (items.item_name.str.contains("Цифровая версия")) 
        | (items.item_name.str.contains(r'\[Цифровая'))
        | (items.item_category_id.isin([31, 44, 54, 76, 78]))
    )
    cond_mix_items = (cond_digital_items) & (
        (items.item_name.str.contains("ИгроМир 2015"))
        | (items.item_name.str.contains("Комплект предзаказа"))
    )
    digital_items = items[cond_digital_items]
    non_digital_items = items[~cond_digital_items]
    mix_items = items[cond_mix_items]

    # internet shop sells only digital items
    sel_cond = (raw_test.shop_id==55) & (raw_test.item_id.isin(non_digital_items.item_id))
    # print((s.loc[raw_test[sel_cond]['ID']] ** 2).sum())
    s.loc[raw_test[sel_cond]['ID']] = 0
    # non-internet shops sell only non-digital or mixed items
    sel_cond = (raw_test.shop_id!=55) & (raw_test.item_id.isin(digital_items.item_id)) & ~(raw_test.item_id.isin(mix_items))
    # print((s.loc[raw_test[sel_cond]['ID']] ** 2).sum())
    s.loc[raw_test[sel_cond]['ID']] = 0    
    return s

s_final = pred.set_index("ID")["item_cnt_month"]
s_final = fix_fallout4(s_final)
s_final = fix_digital(s_final)

fn = os.path.join(OUTPUT_DIR, 'submission-final-2.csv')
s_final.to_csv(fn, index=True, index_label='ID', header=True)

In [23]:
s_final

ID
0         0.323341
1         0.223566
2         0.858780
3         0.419711
4         4.621639
            ...   
214195    0.142718
214196    0.037261
214197    0.043539
214198    0.026807
214199    0.053649
Name: item_cnt_month, Length: 214200, dtype: float64

Your public LB scores are: 0.828486