In [4]:
import pandas as pd
import os
import lightgbm as lgb

In [5]:
train = pd.read_csv("consumption_train.csv")
test = pd.read_csv("cold_start_test.csv")
meta = pd.read_csv("meta.csv")
submission_format = pd.read_csv("submission_format.csv")

In [6]:
train = train.set_index("_id")
test = test.set_index("_id")
#train.dtypes

In [7]:
train.head()

Unnamed: 0_level_0,series_id,timestamp,consumption,temperature
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,103088,2014-12-24 00:00:00,101842.233424,
1,103088,2014-12-24 01:00:00,105878.048906,
2,103088,2014-12-24 02:00:00,91619.105008,
3,103088,2014-12-24 03:00:00,94473.706203,
4,103088,2014-12-24 04:00:00,96976.755526,


In [8]:
test.head()

Unnamed: 0_level_0,series_id,timestamp,consumption,temperature
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,102781,2013-02-27 00:00:00,15295.740389,17.0
1,102781,2013-02-27 01:00:00,15163.209562,18.25
2,102781,2013-02-27 02:00:00,15022.264079,18.0
3,102781,2013-02-27 03:00:00,15370.420458,17.0
4,102781,2013-02-27 04:00:00,15303.103213,16.9


In [9]:
train_meta = train.join(meta.set_index("series_id"), on="series_id")
test_meta = test.join(meta.set_index("series_id"), on="series_id")

In [10]:
train_meta.dtypes

series_id                 int64
timestamp                object
consumption             float64
temperature             float64
surface                  object
base_temperature         object
monday_is_day_off          bool
tuesday_is_day_off         bool
wednesday_is_day_off       bool
thursday_is_day_off        bool
friday_is_day_off          bool
saturday_is_day_off        bool
sunday_is_day_off          bool
dtype: object

In [11]:

# surface definition we put an order to all of them 
surfaces = {'xx-small' : 0,'x-small' : 1,'small' :2,'medium' : 3,'large' :4,'x-large':5, 'xx-large':6}
train_meta["order_surface"] = train_meta['surface'].apply(lambda x : surfaces.get(x,-1))
test_meta["order_surface"] = test_meta['surface'].apply(lambda x : surfaces.get(x,-1))

In [12]:
base_temperatures = {"low":-1, "high": 1}
train_meta["order_base_temperature"] = train_meta['base_temperature'].apply(lambda x : base_temperatures.get(x,-1))
test_meta["order_base_temperature"] = test_meta['base_temperature'].apply(lambda x : base_temperatures.get(x,-1))

In [13]:
train_meta["timestamp_typed"] = train_meta.timestamp.apply(lambda x: pd.to_datetime(x))

In [14]:
test_meta["timestamp_typed"] = test_meta.timestamp.apply(lambda x: pd.to_datetime(x))

In [15]:
train_meta['year'] = train_meta.timestamp_typed.apply(lambda x : x.year)
train_meta['month'] = train_meta.timestamp_typed.apply(lambda x : x.month)
train_meta['day'] = train_meta.timestamp_typed.apply(lambda x : x.day)
train_meta['hours'] = train_meta.timestamp_typed.apply(lambda x : x.hour)

In [51]:
d = """monday_is_day_off
tuesday_is_day_off         
wednesday_is_day_off       
thursday_is_day_off        
friday_is_day_off          
saturday_is_day_off        
sunday_is_day_off""".split()

In [53]:
for i in d:
    train_meta[i] = train_meta[i].apply(lambda x : 1 if x else 0)
    test_meta[i] = test_meta[i].apply(lambda x : 1 if x else 0)

In [16]:
test_meta['year'] = test_meta.timestamp_typed.apply(lambda x : x.year)
test_meta['month'] = test_meta.timestamp_typed.apply(lambda x : x.month)
test_meta['day'] = test_meta.timestamp_typed.apply(lambda x : x.day)
test_meta['hours'] = test_meta.timestamp_typed.apply(lambda x : x.hour)

In [17]:
train_meta.columns

Index(['series_id', 'timestamp', 'consumption', 'temperature', 'surface',
       'base_temperature', 'monday_is_day_off', 'tuesday_is_day_off',
       'wednesday_is_day_off', 'thursday_is_day_off', 'friday_is_day_off',
       'saturday_is_day_off', 'sunday_is_day_off', 'order_surface',
       'order_base_temperature', 'timestamp_typed', 'year', 'month', 'day',
       'hours'],
      dtype='object')

In [54]:
y_train = train_meta["consumption"]
X_train = train_meta.drop(['series_id', 'timestamp','surface', 'temperature','consumption',"base_temperature",
                          "timestamp_typed"],axis=1)

In [55]:
feature_name = list(X_train.columns.values)

In [56]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [57]:
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)

In [58]:
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=1000,
                valid_sets=lgb_train,  # eval training data
                feature_name=feature_name)

[1]	training's l1: 88115.5
[2]	training's l1: 87022.6
[3]	training's l1: 85968.5
[4]	training's l1: 85020.4
[5]	training's l1: 84116.3
[6]	training's l1: 83328.7
[7]	training's l1: 82608.9
[8]	training's l1: 81906
[9]	training's l1: 81375.1
[10]	training's l1: 80878.5
[11]	training's l1: 80514.3
[12]	training's l1: 80188.8
[13]	training's l1: 79780.8
[14]	training's l1: 79629.4
[15]	training's l1: 79260.2
[16]	training's l1: 78945.6
[17]	training's l1: 78498.9
[18]	training's l1: 78277
[19]	training's l1: 77867.8
[20]	training's l1: 77622
[21]	training's l1: 77345.6
[22]	training's l1: 76980.2
[23]	training's l1: 76647.7
[24]	training's l1: 76431.9
[25]	training's l1: 76208.2
[26]	training's l1: 76135.7
[27]	training's l1: 75811.9
[28]	training's l1: 75607.3
[29]	training's l1: 75462.5
[30]	training's l1: 75396.9
[31]	training's l1: 75234.4
[32]	training's l1: 75100.2
[33]	training's l1: 74941.2
[34]	training's l1: 74763.7
[35]	training's l1: 74659.3
[36]	training's l1: 74485.4
[37]	tr

[302]	training's l1: 67762.7
[303]	training's l1: 67757.4
[304]	training's l1: 67745.5
[305]	training's l1: 67742.8
[306]	training's l1: 67734.4
[307]	training's l1: 67730.1
[308]	training's l1: 67727.8
[309]	training's l1: 67714.1
[310]	training's l1: 67701.3
[311]	training's l1: 67695.5
[312]	training's l1: 67674.1
[313]	training's l1: 67667.9
[314]	training's l1: 67663.3
[315]	training's l1: 67658.6
[316]	training's l1: 67643.9
[317]	training's l1: 67629.9
[318]	training's l1: 67616
[319]	training's l1: 67602.4
[320]	training's l1: 67601.1
[321]	training's l1: 67595.3
[322]	training's l1: 67593.9
[323]	training's l1: 67593.1
[324]	training's l1: 67592.4
[325]	training's l1: 67588.8
[326]	training's l1: 67580
[327]	training's l1: 67574.9
[328]	training's l1: 67573.3
[329]	training's l1: 67547.8
[330]	training's l1: 67546
[331]	training's l1: 67544.3
[332]	training's l1: 67539.1
[333]	training's l1: 67522
[334]	training's l1: 67501.3
[335]	training's l1: 67482
[336]	training's l1: 674

[589]	training's l1: 65949.7
[590]	training's l1: 65949
[591]	training's l1: 65942.6
[592]	training's l1: 65936.6
[593]	training's l1: 65934.1
[594]	training's l1: 65930.9
[595]	training's l1: 65926.3
[596]	training's l1: 65925.6
[597]	training's l1: 65925.1
[598]	training's l1: 65919
[599]	training's l1: 65918
[600]	training's l1: 65912.9
[601]	training's l1: 65906.7
[602]	training's l1: 65885.3
[603]	training's l1: 65866.6
[604]	training's l1: 65850.6
[605]	training's l1: 65840.9
[606]	training's l1: 65839.6
[607]	training's l1: 65825.6
[608]	training's l1: 65813.6
[609]	training's l1: 65803.4
[610]	training's l1: 65800.4
[611]	training's l1: 65797.5
[612]	training's l1: 65793.3
[613]	training's l1: 65781.8
[614]	training's l1: 65777.8
[615]	training's l1: 65773.4
[616]	training's l1: 65769.3
[617]	training's l1: 65768.2
[618]	training's l1: 65747.6
[619]	training's l1: 65743
[620]	training's l1: 65738
[621]	training's l1: 65733.2
[622]	training's l1: 65729.2
[623]	training's l1: 657

[877]	training's l1: 64726.1
[878]	training's l1: 64720.2
[879]	training's l1: 64718.4
[880]	training's l1: 64717.9
[881]	training's l1: 64712.2
[882]	training's l1: 64710.3
[883]	training's l1: 64710
[884]	training's l1: 64705.2
[885]	training's l1: 64704.9
[886]	training's l1: 64704.4
[887]	training's l1: 64704.1
[888]	training's l1: 64703.8
[889]	training's l1: 64703.5
[890]	training's l1: 64700.2
[891]	training's l1: 64699.6
[892]	training's l1: 64699
[893]	training's l1: 64698.5
[894]	training's l1: 64697.8
[895]	training's l1: 64697.4
[896]	training's l1: 64695.8
[897]	training's l1: 64692.8
[898]	training's l1: 64691.5
[899]	training's l1: 64690.4
[900]	training's l1: 64689.4
[901]	training's l1: 64684.8
[902]	training's l1: 64684.5
[903]	training's l1: 64683.5
[904]	training's l1: 64681.2
[905]	training's l1: 64680.9
[906]	training's l1: 64680.7
[907]	training's l1: 64680
[908]	training's l1: 64679.7
[909]	training's l1: 64673
[910]	training's l1: 64672.4
[911]	training's l1: 6

In [59]:
sub = pd.merge(test_meta, submission_format,  how='right', left_on=['series_id','timestamp'], right_on = ['series_id','timestamp'])

In [60]:
sub.dtypes

series_id                          int64
timestamp                         object
consumption_x                    float64
temperature_x                    float64
surface                           object
base_temperature                  object
monday_is_day_off                float64
tuesday_is_day_off               float64
wednesday_is_day_off             float64
thursday_is_day_off              float64
friday_is_day_off                float64
saturday_is_day_off              float64
sunday_is_day_off                float64
order_surface                    float64
order_base_temperature           float64
timestamp_typed           datetime64[ns]
year                             float64
month                            float64
day                              float64
hours                            float64
pred_id                            int64
temperature_y                    float64
consumption_y                    float64
prediction_window                 object
dtype: object

In [61]:
data = sub[feature_name]
predicts = gbm.predict(data)

In [62]:
submission_format.consumption = predicts

In [63]:
submission_format.to_csv("submission.csv", index=False)