In [1]:
from starter import *

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [82]:
pd.set_option('display.max_rows', 500)

## Load data

In [7]:
!ls

01_Basic_Modeling_Prophet.ipynb consumption_train.csv
02_Basic_Modeling_LSTM.ipynb    meta.csv
EDA.ipynb                       new_meta.csv
[34m__pycache__[m[m                     submission_format.csv
cold_start_test.csv


In [8]:
data = dict()

In [165]:
data['test'] = pd.read_csv('cold_start_test.csv', index_col=[0], parse_dates=[2])
data['train'] = pd.read_csv('consumption_train.csv', index_col=[0], parse_dates=[2])
data['meta'] = pd.read_csv('new_meta.csv', index_col=[0], converters={'days_off':str})
data['submission'] = pd.read_csv('new_submission_format.csv', index_col=[0], parse_dates=[2])

## Prepare data

In [243]:
df = data['train']
df['train'] = 1
df = df.append(data['test'])
df['train'].fillna(0, inplace=True)

In [235]:
data['meta'].days_off.value_counts()

0000011    1236
0000000     104
0000001      21
0000111      10
0000010       5
1111111       4
0011111       3
Name: days_off, dtype: int64

In [236]:
data['submission']

Unnamed: 0,series_id,timestamp,temperature,consumption,prediction_window,total_row
0,102781,2013-03-03 00:00:00,19.931250,0.0,daily,7
1,102781,2013-03-04 00:00:00,20.034375,0.0,daily,7
2,102781,2013-03-05 00:00:00,19.189583,0.0,daily,7
3,102781,2013-03-06 00:00:00,18.397917,0.0,daily,7
4,102781,2013-03-07 00:00:00,20.762500,0.0,daily,7
5,102781,2013-03-08 00:00:00,19.800000,0.0,daily,7
6,102781,2013-03-09 00:00:00,20.466667,0.0,daily,7
7,103342,2013-06-26 00:00:00,10.486607,0.0,weekly,2
8,103342,2013-07-03 00:00:00,10.006548,0.0,weekly,2
9,102969,2013-12-15 00:00:00,20.214583,0.0,daily,7


In [237]:
data['meta']

Unnamed: 0,series_id,surface,base_temperature,monday_is_day_off,tuesday_is_day_off,wednesday_is_day_off,thursday_is_day_off,friday_is_day_off,saturday_is_day_off,sunday_is_day_off,days_off,test,prediction_window,total_row
0,100003,x-large,low,0,0,0,0,0,1,1,0000011,0.0,,
1,100004,x-large,low,0,0,0,0,0,1,1,0000011,1.0,weekly,2.0
2,100006,x-small,low,0,0,0,0,0,1,1,0000011,0.0,,
3,100008,x-small,low,0,0,0,0,0,1,1,0000011,0.0,,
4,100010,x-small,low,0,0,0,0,0,1,1,0000011,1.0,hourly,24.0
5,100012,x-large,low,0,0,0,0,0,1,1,0000011,1.0,hourly,24.0
6,100017,medium,low,0,0,0,0,0,1,1,0000011,0.0,,
7,100020,medium,low,0,0,0,0,0,1,1,0000011,1.0,weekly,2.0
8,100021,x-small,low,0,0,0,0,0,1,1,0000011,0.0,,
9,100025,x-small,low,0,0,0,0,0,1,1,0000011,0.0,,


Create column for initial time

In [244]:
time_initial = df.drop_duplicates(subset='series_id')[['series_id','timestamp']].reset_index()
time_initial.rename(columns={'timestamp': 'time_initial', 'index':'index_initial'}, inplace=True)

In [245]:
df = df.merge(data['meta'], on='series_id', how='left')
df = df.merge(time_initial, on='series_id', how='left')

In [246]:
df['delta_time'] = df['timestamp'] - df['time_initial']
df['delta_index'] = df.index - df['index_initial']

In [247]:
df['delta_time'] = df['delta_time'].dt.seconds//3600

Create dummy variables

In [248]:
df = pd.concat([df.drop(['surface','base_temperature','days_off','test','prediction_window','total_row'], axis=1), 
           pd.get_dummies(df.surface, prefix='surface'), pd.get_dummies(df.base_temperature, prefix='base_temp'), pd.get_dummies(df.days_off, prefix='days_off')], axis=1)

In [249]:
train = df[df.train==1].drop(['series_id','temperature', 'timestamp', 'time_initial', 'train'], axis=1)
test = df[df.train==0].drop(['series_id','temperature','timestamp','time_initial','train'], axis=1)

In [250]:
train.head()

Unnamed: 0,consumption,monday_is_day_off,tuesday_is_day_off,wednesday_is_day_off,thursday_is_day_off,friday_is_day_off,saturday_is_day_off,sunday_is_day_off,index_initial,delta_time,...,surface_xx-small,base_temp_high,base_temp_low,days_off_0000000,days_off_0000001,days_off_0000010,days_off_0000011,days_off_0000111,days_off_0011111,days_off_1111111
0,101842.233424,0,0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1,105878.048906,0,0,0,0,0,1,1,0,1,...,0,0,1,0,0,0,1,0,0,0
2,91619.105008,0,0,0,0,0,1,1,0,2,...,0,0,1,0,0,0,1,0,0,0
3,94473.706203,0,0,0,0,0,1,1,0,3,...,0,0,1,0,0,0,1,0,0,0
4,96976.755526,0,0,0,0,0,1,1,0,4,...,0,0,1,0,0,0,1,0,0,0


In [251]:
test.head()

Unnamed: 0,consumption,monday_is_day_off,tuesday_is_day_off,wednesday_is_day_off,thursday_is_day_off,friday_is_day_off,saturday_is_day_off,sunday_is_day_off,index_initial,delta_time,...,surface_xx-small,base_temp_high,base_temp_low,days_off_0000000,days_off_0000001,days_off_0000010,days_off_0000011,days_off_0000111,days_off_0011111,days_off_1111111
509376,15295.740389,0,0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
509377,15163.209562,0,0,0,0,0,1,1,0,1,...,0,0,1,0,0,0,1,0,0,0
509378,15022.264079,0,0,0,0,0,1,1,0,2,...,0,0,1,0,0,0,1,0,0,0
509379,15370.420458,0,0,0,0,0,1,1,0,3,...,0,0,1,0,0,0,1,0,0,0
509380,15303.103213,0,0,0,0,0,1,1,0,4,...,0,0,1,0,0,0,1,0,0,0


Must figure out how to impute missing variables for temperature...skip for now

## Build Model

In [265]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [253]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

Standardize data

In [254]:
scaler = MinMaxScaler(feature_range=(-1,1))

In [257]:
X_train = train.drop(['consumption'], axis=1)
y_train = train['consumption']
X_test = test.drop(['consumption'], axis=1)
y_test = test['consumption']

In [279]:
X_train_scaled = scaler.fit_transform(X_train, y_train)
X_test_scaled = scaler.transform(X_test)

### Linear Regression

In [263]:
cross_validator = KFold(n_splits=5, shuffle=True, random_state=1)

In [285]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

In [287]:
mean_squared_error(y_test, y_pred)

7.223822276575534e+31

In [219]:
scores = cross_val_score(lr, X_scaled, y_train, scoring='neg_mean_squared_error', cv=cross_validator)
print(scores)

[-2.22831488e+10 -2.22628964e+10 -2.22599822e+10 -2.26515152e+10
 -2.23073822e+10]


### LASSO Regression

In [288]:
lasso = Lasso()
lasso.fit(X_train_scaled, y_train)
y_pred = lasso.predict(X_test_scaled)

In [289]:
mean_squared_error(y_test, y_pred)

5611460350965.574

In [220]:
scores = cross_val_score(lasso, X_scaled, y_train, scoring='neg_mean_squared_error', cv=cross_validator)
print(scores)

[-2.22829830e+10 -2.22631646e+10 -2.22598228e+10 -2.26517448e+10
 -2.23073792e+10]


### Ridge Regression

In [290]:
ridge = Ridge()
ridge.fit(X_train_scaled, y_train)
y_pred = ridge.predict(X_test_scaled)

In [291]:
mean_squared_error(y_test, y_pred)

5639172015276.223

In [221]:
scores = cross_val_score(ridge, X_scaled, y_train, scoring='neg_mean_squared_error', cv=cross_validator)
print(scores)

[-2.22829886e+10 -2.22631587e+10 -2.22598322e+10 -2.26517466e+10
 -2.23073678e+10]


### Random Forest

In [214]:
from sklearn.ensemble import RandomForestRegressor

In [292]:
rfr = RandomForestRegressor(n_estimators=20)
rfr.fit(X_train_scaled, y_train)
y_pred = rfr.predict(X_test_scaled)

In [293]:
mean_squared_error(y_test, y_pred)

323529803484.01666

In [230]:
scores = cross_val_score(rfr, X_scaled, y_train, scoring='neg_mean_squared_error', cv=cross_validator)
print(scores)

[-5.41404971e+08 -5.19038750e+08 -5.50335812e+08 -5.21253315e+08
 -5.74329259e+08]


### XGBoost

In [224]:
import xgboost as xgb

In [294]:
xgb_model = xgb.XGBRegressor(n_estimators=200)
xgb_model.fit(X_train_scaled, y_train)
y_pred = xgb_model.predict(X_test_scaled)

In [295]:
mean_squared_error(y_test, y_pred)

308376223115.3079

In [228]:
scores = cross_val_score(xgb_model, X_scaled, y_train, scoring='neg_mean_squared_error', cv=cross_validator)
print(scores)

[-8.81122723e+09 -8.58285970e+09 -8.79254331e+09 -8.85352100e+09
 -8.95018973e+09]
