In [1]:
import pandas as pd
import numpy as np
import xgboost
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import copy
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

pd.set_option('display.max_rows', 500)

In [2]:
df = pd.read_csv("data/train_competition_2023.csv")

In [3]:
df.shape

(432600, 18)

Same preprocessing as in checkpoint 1

In [4]:
df['time']=pd.to_datetime(df['time']) 
df['hour']=df['time'].dt.hour
df['month']=df['time'].dt.month
df['year']=df['time'].dt.year

In [5]:
df.sort_values(['obs','time'])
mean_df=df.groupby('obs').first()[['num_0','num_1','num_2','cat_0','cat_1','cat_2','cat_3','cat_4','t_0','t_1','t_2','t_3','t_4','year','month','hour','y_1','y_2']]

In [6]:
mean_df_last=df.groupby('obs').last()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_mean=df.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_var=df.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_med=df.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]


  mean_df_mean=df.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_var=df.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_med=df.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]


In [7]:
mean_df[['t_0d','t_1d','t_2d','t_3d','t_4d']]=mean_df_last
mean_df[['t_0m','t_1m','t_2m','t_3m','t_4m']]=mean_df_mean
mean_df[['t_0v','t_1v','t_2v','t_3v','t_4v']]=mean_df_var
mean_df[['t_0med','t_1med','t_2med','t_3med','t_4med']]=mean_df_med

In [25]:
x_train, x_test, y_train, y_test = train_test_split(mean_df[['t_0','t_1','t_2','t_3','t_4','t_0d','t_1d','t_2d','t_3d','t_4d','t_0m','t_1m','t_2m','t_3m','t_4m','t_0med','t_1med','t_2med','t_3med','t_4med']], mean_df[['y_1','y_2']], test_size=0.2, random_state=13)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=13)
                                                    

### Pytorch 

In [26]:
x_train = torch.tensor(x_train.values, dtype=torch.float32)
x_val = torch.tensor(x_val.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32)
y_val = torch.tensor(y_val.values, dtype=torch.float32)

In [10]:
x_train.shape, y_train.shape

(torch.Size([8652, 20]), torch.Size([8652, 2]))

In [11]:
model = nn.Sequential(
    nn.Linear(20, 160),
    nn.ReLU(),
    nn.Linear(160, 80),
    nn.ReLU(),
    nn.Linear(80, 40),
    nn.ReLU(),
    nn.Linear(40, 20),
    nn.ReLU(),
    nn.Linear(20, 2)
)


loss_fn = nn.L1Loss()  # mean absolute error
optimizer = optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-5)

In [12]:
n_epochs = 1500   # number of epochs to run
batch_size = 90  # size of each batch
batch_start = torch.arange(0, len(x_train), batch_size)

best_mae = np.inf   # init to infinity
best_weights = None
history = []

### Train and Validation

In [13]:
for epoch in range(n_epochs):
    model.train()
    with tqdm.tqdm(batch_start, unit="batch", mininterval=0, disable=True) as bar: # Can set disable=False if you want to see the progress
        bar.set_description(f"Epoch {epoch}")
        for start in bar:
            # take a batch
            X_batch = x_train[start:start+batch_size]
            y_batch = y_train[start:start+batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass
            optimizer.zero_grad()
            loss.backward()
            # update weights
            optimizer.step()
            # print progress
            bar.set_postfix(mae=float(loss))
    # evaluate accuracy at end of each epoch
    model.eval()
    y_pred = model(x_val)
    mae = loss_fn(y_pred, y_val)
    mae = float(mae)
    history.append(mae)
    if mae < best_mae:
        best_mae = mae
        best_weights = copy.deepcopy(model.state_dict())

In [14]:
model.load_state_dict(best_weights)
print("MAE: %.2f" % best_mae)

MAE: 4.17


### Test MAE

In [27]:
model.eval()
y_pred_test = model(torch.tensor(x_test.values, dtype=torch.float32))
y_test = torch.tensor(y_test.values, dtype=torch.float32)

In [29]:
loss_fn(y_pred_test, y_test).detach()

tensor(4.1873)

### Using the Test Dataset to produce second batch of inferences

In [31]:
df_test = pd.read_csv("data/test_no_outcome.csv")
df_test['time']=pd.to_datetime(df_test['time']) 
df_test['hour']=df_test['time'].dt.hour
df_test['month']=df_test['time'].dt.month
df_test['year']=df_test['time'].dt.year
df_test.sort_values(['obs','time'],inplace=True)
mean_df_test=df_test.groupby('obs').first()[['num_0','num_1','num_2','cat_0','cat_1','cat_2','cat_3','cat_4','t_0','t_1','t_2','t_3','t_4','year','month','hour']]
mean_df_test_last=df_test.groupby('obs').last()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_test_mean=df_test.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_test_var=df_test.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_test_med=df_test.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]

mean_df_test[['t_0d','t_1d','t_2d','t_3d','t_4d']]=mean_df_test_last
mean_df_test[['t_0m','t_1m','t_2m','t_3m','t_4m']]=mean_df_test_mean
mean_df_test[['t_0med','t_1med','t_2med','t_3med','t_4med']]=mean_df_test_med

  mean_df_test_mean=df_test.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_test_var=df_test.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_test_med=df_test.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]


In [32]:
# make predictions using neural network
kaggle_x = mean_df_test[['t_0','t_1','t_2','t_3','t_4','t_0d','t_1d','t_2d','t_3d','t_4d','t_0m','t_1m','t_2m','t_3m','t_4m','t_0med','t_1med','t_2med','t_3med','t_4med']]
kaggle_x = torch.tensor(kaggle_x.values, dtype=torch.float32)
kaggle_x.shape

torch.Size([3450, 20])

In [33]:
model.eval()
y_pred = model(kaggle_x)
y_pred

tensor([[ 40.9478, 105.1709],
        [ 31.8946, 100.5568],
        [ 34.9711,  95.5676],
        ...,
        [ 50.1896, 112.6436],
        [ 46.6133,  99.8016],
        [ 56.1247, 105.5437]], grad_fn=<AddmmBackward0>)

In [34]:
pred_df = pd.DataFrame(y_pred.detach().numpy(),index=mean_df_test.index,columns=['y_1','y_2'])

In [35]:
pred_df

Unnamed: 0_level_0,y_1,y_2
obs,Unnamed: 1_level_1,Unnamed: 2_level_1
18,40.947823,105.170876
19,31.894579,100.556824
20,34.971069,95.567619
21,34.118431,94.770287
22,33.065632,94.224350
...,...,...
17850,55.067841,87.783844
17862,58.339146,103.796722
17863,50.189644,112.643623
17864,46.613251,99.801582


In [36]:
pred_df.to_csv('submission_nn.csv')

### XGboost

In [37]:
df = pd.read_csv("data/train_competition_2023.csv")
df['time']=pd.to_datetime(df['time']) 
df['hour']=df['time'].dt.hour
df['month']=df['time'].dt.month
df['year']=df['time'].dt.year
df.sort_values(['obs','time'])
#check how many sub_id's per obs
# mean_df=df.groupby('obs').mean()[['num_0','num_1','num_2','cat_0','cat_1_2','cat_1_1','cat_1_5','cat_1_6','cat_2','cat_3','cat_4','t_0','t_1','t_2','t_3','t_4','y_1','y_2']]
mean_df=df.groupby('obs').first()[['num_0','num_1','num_2','cat_0','cat_1','cat_2','cat_3','cat_4','t_0','t_1','t_2','t_3','t_4','year','month','hour','y_1','y_2']]
mean_df_last=df.groupby('obs').last()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_mean=df.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_var=df.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_med=df.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]
mean_df[['t_0d','t_1d','t_2d','t_3d','t_4d']]=mean_df_last
mean_df[['t_0m','t_1m','t_2m','t_3m','t_4m']]=mean_df_mean
mean_df[['t_0v','t_1v','t_2v','t_3v','t_4v']]=mean_df_var
mean_df[['t_0med','t_1med','t_2med','t_3med','t_4med']]=mean_df_med
x_train, x_test, y_train, y_test = train_test_split(mean_df[['t_0','t_1','t_2','t_3','t_4','t_0d','t_1d','t_2d','t_3d','t_4d','t_0m','t_1m','t_2m','t_3m','t_4m','t_0v','t_1v','t_2v','t_3v','t_4v','t_0med','t_1med','t_2med','t_3med','t_4med']], mean_df[['y_1','y_2']], test_size=0.2, random_state=13)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=13)
                                                    

  mean_df_mean=df.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_var=df.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_med=df.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]


In [38]:
x_train.columns

Index(['t_0', 't_1', 't_2', 't_3', 't_4', 't_0d', 't_1d', 't_2d', 't_3d',
       't_4d', 't_0m', 't_1m', 't_2m', 't_3m', 't_4m', 't_0v', 't_1v', 't_2v',
       't_3v', 't_4v', 't_0med', 't_1med', 't_2med', 't_3med', 't_4med'],
      dtype='object')

In [39]:
x_train.shape

(8652, 25)

In [40]:
params = {
        'n_estimators': [500, 2000, 2500, 5000],
        'eta': [0.05, 0.1],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7]
        }

In [41]:
model = xgboost.XGBRegressor()

In [42]:
grid_cv = GridSearchCV(estimator=model,
            param_grid=params,
            cv=3,
            scoring = 'neg_mean_absolute_error',
            n_jobs = 4,
            verbose = 1, 
            return_train_score = True)

I used grid search to find the best parameters for the XGBoost model 

In [43]:
grid_cv.fit(x_train, y_train)
grid_cv.best_params_

Fitting 3 folds for each of 216 candidates, totalling 648 fits




{'colsample_bytree': 0.6,
 'eta': 0.05,
 'max_depth': 3,
 'n_estimators': 500,
 'subsample': 0.8}

In [44]:
# best model
model = xgboost.XGBRegressor(n_estimators=500, max_depth=3, eta=0.05, subsample=0.8, colsample_bytree=1, alpha= 5, gamma=5, reg_lambda=5)

In [45]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, x_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

XGBoost performed worse than the neural network

In [46]:
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 4.299 (0.128)


In [48]:
df_test = pd.read_csv("data/test_no_outcome.csv")
df_test['time']=pd.to_datetime(df_test['time']) 
df_test['hour']=df_test['time'].dt.hour
df_test['month']=df_test['time'].dt.month
df_test['year']=df_test['time'].dt.year
df_test.sort_values(['obs','time'],inplace=True)
mean_df_test=df_test.groupby('obs').first()[['num_0','num_1','num_2','cat_0','cat_1','cat_2','cat_3','cat_4','t_0','t_1','t_2','t_3','t_4','year','month','hour']]
mean_df_test_last=df_test.groupby('obs').last()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_test_mean=df_test.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_test_var=df.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
mean_df_test_med=df.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]


mean_df_test[['t_0d','t_1d','t_2d','t_3d','t_4d']]=mean_df_test_last
mean_df_test[['t_0m','t_1m','t_2m','t_3m','t_4m']]=mean_df_test_mean
mean_df_test[['t_0v','t_1v','t_2v','t_3v','t_4v']]=mean_df_test_var
mean_df_test[['t_0med','t_1med','t_2med','t_3med','t_4med']]=mean_df_test_med
# make predictions using neural network
kaggle_x = mean_df_test[['t_0','t_1','t_2','t_3','t_4','t_0d','t_1d','t_2d','t_3d','t_4d','t_0m','t_1m','t_2m','t_3m','t_4m','t_0v','t_1v','t_2v','t_3v','t_4v','t_0med','t_1med','t_2med','t_3med','t_4med']]
# kaggle_x = torch.tensor(kaggle_x.values, dtype=torch.float32)

  mean_df_test_mean=df_test.groupby('obs').mean()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_test_var=df.groupby('obs').var()[['t_0','t_1','t_2','t_3','t_4']]
  mean_df_test_med=df.groupby('obs').median()[['t_0','t_1','t_2','t_3','t_4']]


In [49]:
model.fit(x_train, y_train)

XGBRegressor(alpha=5, base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False, eta=0.05,
             eval_metric=None, feature_types=None, gamma=5, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.0500000007,
             max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, ...)

In [50]:
yhat = model.predict(kaggle_x)

In [51]:
xg_df = pd.DataFrame(yhat,index=mean_df_test.index,columns=['y_1','y_2'])

In [52]:
xg_df

Unnamed: 0_level_0,y_1,y_2
obs,Unnamed: 1_level_1,Unnamed: 2_level_1
18,47.632801,102.961861
19,34.066547,99.767471
20,39.727699,94.508415
21,38.666443,94.296005
22,40.546028,93.643608
...,...,...
17850,60.875542,85.320419
17862,61.855297,100.711723
17863,56.507080,109.092285
17864,54.376888,99.590324


In [53]:
torch_df = pd.DataFrame(y_pred.detach().numpy(),index=mean_df_test.index,columns=['y_1','y_2'])

Output from the ensembling of neural network and XGBoost predictions

In [54]:
fin_df = (xg_df + torch_df)/2

In [35]:
fin_df.to_csv('submission_nn_xgb.csv')