<a href="https://colab.research.google.com/github/jppgks/DL-from-Scratch-with-PyTorch/blob/main/Regression_with_Feedforward_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
from torch import nn
from torch.nn import functional as F

import pandas as pd

from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

from utils import train_loop, test_loop

In [3]:
import getpass
import os

# Ask for the Kaggle API credentials interactively rather than hardcoding them.
# getpass hides the key as it is typed; input() echoes the username into the
# cell output, so clear that output before sharing the notebook.
os.environ["KAGGLE_USERNAME"] = input()
os.environ["KAGGLE_KEY"] = getpass.getpass()

joppegeluykens
··········


## Data

In [4]:
# Download the competition files into the working directory; relies on the
# KAGGLE_USERNAME / KAGGLE_KEY environment variables set earlier in the notebook.
!kaggle competitions download -c tabular-playground-series-jan-2022

Downloading test.csv to /content
  0% 0.00/306k [00:00<?, ?B/s]
100% 306k/306k [00:00<00:00, 40.9MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/64.2k [00:00<?, ?B/s]
100% 64.2k/64.2k [00:00<00:00, 67.0MB/s]
Downloading train.csv.zip to /content
  0% 0.00/186k [00:00<?, ?B/s]
100% 186k/186k [00:00<00:00, 5.95MB/s]


In [5]:
!unzip train.csv

Archive:  train.csv.zip
  inflating: train.csv               


### Vectorize

In [6]:
# Read both splits, using the row_id column as the index.
train_csv_path = "/content/train.csv"
test_csv_path = "/content/test.csv"
df_train = pd.read_csv(train_csv_path, index_col="row_id")
df_test = pd.read_csv(test_csv_path, index_col="row_id")

In [7]:
def add_time_cols(df):
  """Return a copy of `df` with `weekday` and `month` columns derived from `date`.

  Args:
    df: DataFrame with a `date` column that `pd.to_datetime` can parse.

  Returns:
    A new DataFrame in which `date` is datetime-typed, `weekday` holds
    0 (Monday) through 6 (Sunday) and `month` holds 1 through 12. The input
    frame is left untouched, so re-running the calling cell is safe.
  """
  # Work on a copy: mutating the caller's frame makes notebook cells
  # non-idempotent and silently changes frames that were already displayed.
  out = df.copy()
  # Bracket access rather than attribute access, so the assignment always
  # targets the column and never a plain Python attribute on the frame.
  out["date"] = pd.to_datetime(out["date"])
  out["weekday"] = out["date"].dt.weekday
  out["month"] = out["date"].dt.month

  return out

In [8]:
# Derive the calendar features for both splits with the same helper.
df_train = df_train.pipe(add_time_cols)
df_test = df_test.pipe(add_time_cols)

In [9]:
# Peek at the first rows to confirm the weekday / month columns were added.
df_train.head()

Unnamed: 0_level_0,date,country,store,product,num_sold,weekday,month
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329,3,1
1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520,3,1
2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146,3,1
3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572,3,1
4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911,3,1


In [10]:
# (column selector, transformer) pairs, in the order their outputs are
# concatenated. The list-wrapped selectors hand OneHotEncoder a 2-D column,
# while the bare 'store' selector hands LabelBinarizer a 1-D column.
feature_defs = [
    (["weekday"], OneHotEncoder()),
    (["month"], OneHotEncoder()),
    (["country"], OneHotEncoder()),
    ("store", LabelBinarizer()),
    (["product"], OneHotEncoder()),
]
mapper = DataFrameMapper(feature_defs)

In [11]:
# Fit the encoders on the training split only, then reuse the fitted mapper
# for the test split. Re-fitting on df_test would derive the column layout
# from the test data, so the two matrices could disagree on which column
# means what (or on width) whenever the splits' category sets differ.
train_data = mapper.fit_transform(df_train)
test_data = mapper.transform(df_test)



In [12]:
# Inspect a single encoded row.
train_data[0]

array([0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0.])

In [13]:
# (rows, encoded feature columns) of the training matrix.
train_data.shape

(26298, 26)

In [14]:
# The column count should equal train_data's, since the same model consumes both.
test_data.shape

(6570, 26)

In [15]:
# Pair every encoded row with its num_sold target, both as float32 tensors.
features = torch.from_numpy(train_data).float()
targets = torch.from_numpy(df_train["num_sold"].values).float()
full_ds = torch.utils.data.TensorDataset(features, targets)

# 80/20 train/validation split. A dedicated seeded generator pins the split,
# so Restart & Run All reproduces the same validation set; the full dataset
# keeps its own name instead of being overwritten by the training subset.
num_train = int(.8 * len(full_ds))
num_val = len(full_ds) - num_train
train_ds, val_ds = torch.utils.data.random_split(
    full_ds, [num_train, num_val], generator=torch.Generator().manual_seed(42)
)

In [16]:
# Sizes of the training and validation subsets.
len(train_ds), len(val_ds)

(21038, 5260)

## Model

In [17]:
class SalesRegressor(nn.Module):
  """Feedforward regressor: one hidden ReLU layer followed by a single linear output.

  Args:
    in_features: Width of each input row. Defaults to 26, the layer width
      this model was originally hard-coded to.
    hidden_size: Number of hidden units. Defaults to 512.
  """

  def __init__(self, in_features=26, hidden_size=512):
    super().__init__()
    self.linear1 = nn.Linear(in_features, hidden_size)
    self.out = nn.Linear(hidden_size, 1)

  def forward(self, inputs):
    """Map `inputs` of shape (..., in_features) to predictions of shape (..., 1)."""
    hidden = F.relu(self.linear1(inputs))
    num_sold = self.out(hidden)

    return num_sold

In [18]:
# Fresh, untrained model instance.
model = SalesRegressor()

In [19]:
# Smoke test: push the first encoded training row through the untrained model.
train_tensor = torch.from_numpy(train_data).float()
pred = model(train_tensor[0])
pred

tensor([-0.0665], grad_fn=<AddBackward0>)

## Optimization

In [20]:
# Training hyperparameters, gathered in one place so they are easy to tune.
epochs = 50           # number of passes of the training loop
batch_size = 64       # rows per batch handed to the DataLoaders
learning_rate = 1e-3  # lr passed to the optimizer

In [21]:
# Reshuffle the training rows every epoch so the mini-batches differ between
# passes; the validation loader keeps a fixed order because it is only used
# for evaluation. batch_size is passed by keyword for readability.
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_ds, batch_size=batch_size)

In [22]:
# Pull a single mini-batch to sanity-check the forward pass and loss before training.
X, y = next(iter(train_dataloader))

In [23]:
# Forward pass on the sample batch.
preds = model(X)

In [24]:
# Mean squared error as the regression objective.
loss = nn.MSELoss()

In [25]:
# squeeze() drops the trailing size-1 output dimension so the predictions
# line up element-wise with the 1-D target batch y.
loss(preds.squeeze(), y)

tensor(308166.2500, grad_fn=<MseLossBackward0>)

In [26]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [27]:
# Move the model's parameters onto the selected device.
model = model.to(device)

In [28]:
# RMSprop over all model parameters, using the learning rate configured above.
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)

In [None]:
# Each epoch calls train_loop on the training loader, then test_loop on the
# validation loader (both helpers come from the local utils module).
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train_loop(train_dataloader, model, loss, optimizer, device)
    test_loop(val_dataloader, model, loss, device)
print("Done!")

## Inference

In [None]:
# The model lives on `device`, so the inputs have to be moved there as well;
# otherwise the forward pass raises a device-mismatch error whenever CUDA is
# in use. no_grad() skips building the autograd graph for inference, and
# .cpu() brings the result back so the later NumPy conversion works.
model.eval()
with torch.no_grad():
    preds = model(torch.from_numpy(test_data).float().to(device)).cpu()

In [None]:
# .cpu() keeps the NumPy conversion valid when preds sits on a GPU (Tensor.numpy()
# only works for CPU tensors); it is a no-op when preds is already on the CPU.
df_pred = pd.Series(preds.squeeze().detach().cpu().numpy(), name="num_sold", index=df_test.index).to_frame()

In [None]:
# Quick look at the first predictions before writing them out.
df_pred.head()

In [None]:
# Save the predictions (df_test's index plus the num_sold column) for upload.
df_pred.to_csv("/content/submission.csv")

In [None]:
# Disabled by default: uncomment to upload submission.csv to the competition.
#!kaggle competitions submit -f /content/submission.csv -m "nn submission" tabular-playground-series-jan-2022