In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
def seed_everything(seed):
    """Apply one seed to every RNG source this notebook touches directly.

    Stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable and seeds both NumPy's legacy global generator and Python's
    ``random`` module.

    Args:
        seed: Seed value passed to ``np.random.seed`` and ``random.seed``.
    """
    # NOTE(review): PYTHONHASHSEED is presumably only honoured by interpreters
    # started after this point, not by the running kernel -- confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

seed_everything(42)

In [3]:
# Read the training and test tables from the local ./dts directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dts/train.csv', './dts/test.csv')
)

In [4]:
# Preview the first five training rows to check the column layout.
train.head(n=5)

Unnamed: 0,ID,Exercise_Duration,Body_Temperature(F),BPM,Height(Feet),Height(Remainder_Inches),Weight(lb),Weight_Status,Gender,Age,Calories_Burned
0,TRAIN_0000,26.0,105.6,107.0,5.0,9.0,154.3,Normal Weight,F,45,166.0
1,TRAIN_0001,7.0,103.3,88.0,6.0,6.0,224.9,Overweight,M,50,33.0
2,TRAIN_0002,7.0,103.3,86.0,6.0,3.0,218.3,Overweight,M,29,23.0
3,TRAIN_0003,17.0,104.0,99.0,5.0,6.0,147.7,Normal Weight,F,33,91.0
4,TRAIN_0004,9.0,102.7,88.0,5.0,10.0,169.8,Normal Weight,M,38,32.0


In [5]:
# Target column for the regression.
train_y = train['Calories_Burned']

# Predictors: everything except the row identifier (and, for train, the target).
train_x = train.drop(columns=['ID', 'Calories_Burned'])
test_x = test.drop(columns='ID')

In [6]:
ordinal_features = ['Weight_Status','Gender']

for feature in ordinal_features:
    le = LabelEncoder()
    # The label -> integer mapping is learned from the training split only.
    train_x[feature] = le.fit_transform(train_x[feature])

    # Labels present only in the test split are appended to the encoder's
    # classes, so each one receives a new code after the known ones.
    unseen_labels = [
        candidate
        for candidate in np.unique(test_x[feature])
        if candidate not in le.classes_
    ]
    for candidate in unseen_labels:
        le.classes_ = np.append(le.classes_, candidate)

    test_x[feature] = le.transform(test_x[feature])

In [7]:
# Two baseline regressors, both built with the same fixed random_state.
model, model2 = (
    DecisionTreeRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
)

In [8]:
# Fit both baseline regressors on the full training split (no hold-out set is
# carved off here). The final call's return value is what the cell displays.
model.fit(train_x, train_y)
model2.fit(train_x, train_y)

In [9]:
# Test-set predictions from the decision tree (pred) and the random forest (pred2).
pred, pred2 = (estimator.predict(test_x) for estimator in (model, model2))

In [10]:
submission = pd.read_csv('./dts/sample_submission.csv')

# One output file per baseline; the same frame is reused, so after this cell
# `submission` holds the second model's predictions.
for predictions, out_path in ((pred, './submit.csv'), (pred2, './submit2.csv')):
    submission['Calories_Burned'] = predictions
    submission.to_csv(out_path, index=False)

In [15]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error as mse

In [13]:
# Four regressors to compare, each constructed with random_state=42.
gbm, lgbm, rfc, xgb = (
    estimator_cls(random_state=42)
    for estimator_cls in (
        GradientBoostingRegressor,
        LGBMRegressor,
        RandomForestRegressor,
        XGBRegressor,
    )
)

In [14]:
# Fit all four candidate regressors on the full training split.
# The final call's return value is what the cell displays.
gbm.fit(train_x, train_y)
lgbm.fit(train_x, train_y)
rfc.fit(train_x, train_y)
xgb.fit(train_x, train_y)

In [19]:
# Print each fitted model's RMSE on the training split.
#
# The root is taken explicitly with np.sqrt instead of passing squared=False:
# that keyword is deprecated in scikit-learn 1.4 and removed in 1.6, whereas
# sqrt(MSE) works on every version.
#
# NOTE(review): these scores are computed on the same rows the models were
# fitted on, so they measure fit to the training data, not generalisation.
for model_name, estimator in (('gbm', gbm), ('lgbm', lgbm), ('rfc', rfc), ('xgb', xgb)):
    print(model_name)
    print(np.sqrt(mse(train_y, estimator.predict(train_x))))

gbm
3.3384837382279366
lgbm
1.64810964633507
rfc
1.3107426342853632
xgb
1.1603452941446832


In [25]:
# Equal-weight average of the lgbm, rfc and xgb predictions on the TRAINING rows.
# An earlier variant also averaged in gbm:
# pred_blend = (gbm.predict(train_x)+lgbm.predict(train_x)+rfc.predict(train_x)+xgb.predict(train_x))/4
pred_blend = (lgbm.predict(train_x)+rfc.predict(train_x)+xgb.predict(train_x))/3
print('blended')
# np.sqrt(MSE) replaces mse(..., squared=False); the `squared` keyword is
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(np.sqrt(mse(train_y, pred_blend)))

blended
0.9829675129389263


In [26]:
submission = pd.read_csv('./dts/sample_submission.csv')

# The submission must contain predictions for the TEST rows. `pred_blend` was
# computed from train_x, so writing it here would put training-row predictions
# into the submission (or fail on a length mismatch). Recompute the same
# equal-weight lgbm/rfc/xgb blend on test_x instead.
test_blend = (lgbm.predict(test_x) + rfc.predict(test_x) + xgb.predict(test_x)) / 3
assert len(test_blend) == len(submission), (
    "blended predictions and sample submission have different row counts"
)

submission['Calories_Burned'] = test_blend
# Same path as the first baseline output, so this overwrites that file.
submission.to_csv('./submit.csv', index=False)