<a href="https://colab.research.google.com/github/juanprida/pytorch-practice/blob/master/nn_toy_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

In [2]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the CSV paths read later in this
# notebook live below that mount point. NOTE(review): force_remount=True
# presumably re-mounts even when the drive is already mounted -- confirm.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# Prefer the first CUDA device when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [6]:
# Both competition files sit in the same Drive folder; name it once.
DATA_DIR = "/content/gdrive/MyDrive/Colab Notebooks/tabular_playground_series_sep_2022"

pdf_train_raw = pd.read_csv(f"{DATA_DIR}/train.csv")
pdf_test = pd.read_csv(f"{DATA_DIR}/test.csv")

In [7]:
def add_dateparts(
    df: pd.DataFrame,
    date: str
    ) -> pd.DataFrame:
    """Replace a date column with its calendar components.

    Args:
        df: Input frame; it is copied, never modified.
        date: Name of the column holding values parseable by ``pd.to_datetime``.

    Returns:
        A copy of ``df`` with ``year``, ``quarter``, ``month``, ``dayofweek``
        and ``dayofmonth`` columns added and the ``date`` column removed.
    """

    # Output column name -> attribute of the ``.dt`` accessor that provides it.
    part_accessors = {
        'year': 'year',
        'quarter': 'quarter',
        'month': 'month',
        'dayofweek': 'dayofweek',
        'dayofmonth': 'day',
    }

    enriched = df.copy()
    enriched[date] = pd.to_datetime(enriched[date])
    for part_name, accessor in part_accessors.items():
        enriched[part_name] = getattr(enriched[date].dt, accessor)

    # The raw timestamp is no longer needed once its parts are extracted.
    return enriched.drop(columns=[date])

def concat_frames(
    pdf_train: pd.DataFrame,
    pdf_test : pd.DataFrame, y: str
    ) -> pd.DataFrame:
    """Stack the train and test frames into one frame tagged by origin.

    Args:
        pdf_train: Training frame (expected to contain the target column ``y``).
        pdf_test: Test frame; its target column is created/overwritten with NaN.
        y: Name of the target column.

    Returns:
        The row-wise concatenation of both frames with an extra ``partition``
        column holding ``'train'`` or ``'test'``. Original indices are kept and
        neither input frame is modified.
    """

    pdf_tr, pdf_te = pdf_train.copy(), pdf_test.copy()
    pdf_tr['partition'], pdf_te['partition'] = 'train', 'test'
    # The test rows have no known target; np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.0.
    pdf_te[y] = np.nan

    # Concatenate the annotated copies -- concatenating the raw inputs would
    # silently drop the partition tag and the NaN target added above.
    return pd.concat([pdf_tr, pdf_te])

def preprocess(
    settings_dict: dict,
    pdf_fit: pd.DataFrame,
    pdf_transform: pd.DataFrame
    ) -> pd.DataFrame:
    """Encode columns of ``pdf_transform`` with encoders fitted on ``pdf_fit``.

    Args:
        settings_dict: Maps an arbitrary label to a sequence whose first item
            is the list of columns to encode and whose second item is an
            sklearn-style encoder exposing ``fit``, ``transform`` and
            ``get_feature_names_out``. The encoders are fitted in place.
        pdf_fit: Frame the encoders are fitted on.
        pdf_transform: Frame to encode; it is copied, never modified.

    Returns:
        A copy of ``pdf_transform`` where, for every entry, the listed columns
        are removed and the encoded columns are appended under the names
        reported by ``get_feature_names_out()``.
    """

    pdf_out = pdf_transform.copy()

    for setting in settings_dict.values():
        cols, encoder = setting[0], setting[1]

        # Column selection already yields a new frame, so no defensive copy of
        # pdf_fit is required before fitting.
        encoder.fit(pdf_fit[cols])
        encoded = encoder.transform(pdf_out[cols])

        # Some encoders (e.g. OneHotEncoder with default settings) return a
        # scipy sparse matrix, which cannot be assigned to DataFrame columns.
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()

        # Swap the raw columns for their encoded counterparts.
        pdf_out = pdf_out.drop(columns=cols)
        pdf_out[list(encoder.get_feature_names_out())] = encoded

    return pdf_out

def split_cols(
    pdf: pd.DataFrame,
    X_cols: list,
    y_cols: list
    ) -> tuple:
    """Split a frame into feature and target NumPy arrays.

    Args:
        pdf: Source frame.
        X_cols: Names of the feature columns.
        y_cols: Names of the target columns.

    Returns:
        A ``(X, y)`` tuple of NumPy arrays (not DataFrames) obtained by
        selecting ``X_cols`` and ``y_cols`` respectively. Because ``y_cols`` is
        a list, ``y`` stays two-dimensional even for a single target column.
    """

    features = pdf[X_cols].to_numpy()
    targets = pdf[y_cols].to_numpy()
    return features, targets

In [8]:
# Expand the raw date column into calendar features before encoding.
pdf_train = add_dateparts(pdf_train_raw, 'date')

categorical_columns = ['year', 'quarter', 'month', 'dayofweek', 'dayofmonth', 'country', 'store', 'product']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; try the current spelling first and fall back so
# the cell runs on both old and new kernels.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

preprocess_settings = {'categorical_columns': (categorical_columns, one_hot_encoder)}
# The encoder is fitted on, and applied to, the same training frame.
pdf_train = preprocess(preprocess_settings, pdf_train, pdf_train)

# Every remaining column except the row identifier and the target is a feature.
X_cols = [c for c in pdf_train if c not in ['row_id', 'num_sold']]
y_cols = ['num_sold']

X, y = split_cols(pdf_train, X_cols, y_cols)
# Fixed random_state keeps the train/validation split reproducible.
X_train, X_val, y_train, y_val  = train_test_split(X, y, random_state=97)

In [9]:
# Linear-regression baseline: fit on the training split and report the
# mean squared error on the validation split.
reg = LinearRegression()
reg.fit(X_train, y_train)

val_residuals = reg.predict(X_val) - y_val
print(np.sum(val_residuals ** 2) / len(y_val))

3013.3236938170203


In [10]:
# Hyperparameters shared by the cells below.
# Step size handed to the SGD optimizer as `lr`.
learning_rate = 1e-3
# Passed to the SGD optimizer as `weight_decay`.
lambda_l2 = 1e-5
# Number of rows per mini-batch in both the train and validation DataLoaders.
batch_size = 16

In [17]:
# Convert the NumPy splits to float32 tensors (the dtype the model weights use)
# and place them on the selected device once, up front.
train_data = TensorDataset(
    torch.as_tensor(X_train, dtype=torch.float32).to(device),
    torch.as_tensor(y_train, dtype=torch.float32).to(device),
)
val_data = TensorDataset(
    torch.as_tensor(X_val, dtype=torch.float32).to(device),
    torch.as_tensor(y_val, dtype=torch.float32).to(device),
)

# Reshuffle the training rows every epoch so SGD does not see the exact same
# mini-batches each pass; the seeded generator keeps runs reproducible.
shuffle_generator = torch.Generator().manual_seed(97)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, generator=shuffle_generator)
# Validation order does not affect the metric, so it is left unshuffled.
val_dataloader = DataLoader(val_data, batch_size=batch_size)

In [31]:
class JuanNet(nn.Module):
    """One-hidden-layer fully connected regression network.

    Args:
        in_features: Width of the input layer. Defaults to 70, the value that
            was previously hard-coded (presumably the number of encoded feature
            columns -- confirm against ``X_train.shape[1]``).
        hidden_units: Width of the hidden layer. Defaults to 128.
        out_features: Number of outputs per row. Defaults to 1.
    """

    def __init__(self, in_features: int = 70, hidden_units: int = 128, out_features: int = 1):
        super().__init__()
        # Collapses every non-batch dimension into one before the linear stack.
        self.flatten = nn.Flatten()
        self.linear_model = nn.Sequential(
            nn.Linear(in_features, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, out_features),
        )

    def forward(self, X):
        """Return the network output for a batch ``X``."""
        X = self.flatten(X)
        return self.linear_model(X)

# Instantiate the network and move its parameters to the selected device.
model = JuanNet()
model = model.to(device)

# Mean squared error is the training objective.
criterion = nn.MSELoss()

# Plain SGD; weight_decay supplies the L2 penalty configured as lambda_l2.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    weight_decay=lambda_l2,
)

In [45]:
def train(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module to optimise; it is switched to training mode.
        dataloader: Iterable yielding ``(X, y)`` batches.
        criterion: Loss callable invoked as ``criterion(prediction, target)``.
        optimizer: Optimizer holding the model parameters.

    Returns:
        The mean of the per-batch losses as a float, or ``nan`` when the
        dataloader yields no batches.
    """
    # Put the module (and its submodules) in training mode.
    model.train()

    running_loss = 0.0
    n_batches = 0
    for X, y in dataloader:
        pred = model(X)
        # Loss functions take (input, target): the prediction goes first.
        loss = criterion(pred, y)

        # Backprop: clear stale gradients, compute new ones, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    return running_loss / n_batches if n_batches else float("nan")

def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Prints the mean of the per-batch losses (``nan`` when the dataloader
    yields no batches).

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        dataloader: Iterable yielding ``(X, y)`` batches.
        criterion: Loss callable invoked as ``criterion(prediction, target)``.

    Returns:
        A dict with keys ``'pred'``, ``'y'`` and ``'loss'``, each a list with
        one entry per batch (predictions, targets and the loss value returned
        by ``criterion``).
    """
    # Put the module (and its submodules) in evaluation mode.
    model.eval()

    total_loss = 0.0
    n_batches = 0
    out = {'pred': [], 'y': [], 'loss': []}
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            # Loss functions take (input, target): the prediction goes first.
            loss = criterion(pred, y)

            out['pred'].append(pred)
            out['y'].append(y)
            out['loss'].append(loss)

            total_loss += loss.item()
            n_batches += 1

    # Guard against an empty loader instead of raising ZeroDivisionError.
    print(total_loss / n_batches if n_batches else float('nan'))
    return out

In [46]:
# Number of full passes over the training data.
epochs = 5

for epoch_idx in range(epochs):
    print(f"epoch {epoch_idx}")
    # One optimisation pass over every training mini-batch.
    train(model, train_dataloader, criterion, optimizer)

print('Done')

epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
Done
