In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
from torch.utils.data import TensorDataset, Dataset, DataLoader, Subset
import random
from sklearn import preprocessing
import numpy as np

In [None]:
# Raw ratings: tab-separated file without a header row, so the column
# names are supplied here.
rating_df = pd.read_csv(
    'data/MovieLens100K/u.data',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
rating_df.head()

In [None]:
# User table: pipe-separated file without a header row.
user_df = pd.read_csv(
    'data/MovieLens100K/u.user',
    sep='|',
    names=['userId', 'age', 'gender', 'occupation', 'zip_code'],
)
print(user_df.nunique())

# Feature plan for the model built further down:
#   Deep side
#     - age:        kept as a continuous value
#     - gender:     M/F mapped to 1/0
#     - occupation: integer-coded here, embedded (8-D) by the network
#   Wide side
#     - crossed feature: occupation x genre

# Derive the user features: scaled age, binary gender, integer occupation id.
user_df['age_emb'] = user_df['age'] / 100
user_df['gender_emb'] = np.where(user_df['gender'] == 'M', 1, 0)
occupation_le = preprocessing.LabelEncoder()
occupation_le.fit(user_df['occupation'])
user_df['occupation_le'] = occupation_le.transform(user_df['occupation'])
user_df.head()

In [None]:
# Names of the 19 genre indicator columns in u.item.
genre_cols = [f'genre_{i}' for i in range(19)]

movie_df = pd.read_csv(
    'data/MovieLens100K/u.item',
    sep='|',
    encoding='latin-1',
    names=['movieId', 'title', 'release_date', 'video_release_date', 'IMDb_URL', *genre_cols],
)

# Collapse the indicator columns into a single categorical 'genre' column.
# A single label is needed to build the occupation x genre cross, which is
# one-hot encoded afterwards for the wide part of the model.
# idxmax returns the first column holding the row maximum, so a movie flagged
# with several genres is labelled with the lowest-numbered one.
movie_df['genre'] = (
    movie_df[genre_cols]
    .idxmax(axis='columns')
    .str.replace("genre_", "")
)
movie_df = movie_df.drop(columns=['title', 'release_date', 'video_release_date', 'IMDb_URL'])
movie_df.head()

In [None]:
# Attach the engineered user features to every rating row. A left join keeps
# all ratings even if a userId had no match in the user table.
user_movie_df = rating_df[['userId', 'movieId', 'rating']].merge(
    user_df[['userId', 'age_emb', 'gender_emb', 'occupation_le']],
    how='left',
    on='userId',
)
user_movie_df.head()

In [None]:
# Attach the movie features (genre indicators + collapsed 'genre') to every
# rating row.
#
# This cell reassigns user_movie_df from itself, so running it a second time
# would merge the movie columns again and pandas would rename the clashes to
# genre_x / genre_y, breaking later lookups of 'genre'. Dropping any movie
# columns already present makes the cell safe to re-run; on the first run
# nothing matches and the drop is a no-op.
user_movie_df = pd.merge(
    user_movie_df.drop(
        columns=[col for col in movie_df.columns if col != 'movieId'],
        errors='ignore',
    ),
    movie_df,
    on='movieId',
    how='left',
)
user_movie_df.head()

In [None]:
# Create crossed-features: one label per (occupation id, genre) pair,
# e.g. "3_7", then one-hot encode that label for the wide part of the model.
user_movie_df['occupation_genre'] = (
    user_movie_df['occupation_le'].astype(str) + "_" + user_movie_df['genre'].astype(str)
)
one_hot_crossed_df = pd.get_dummies(user_movie_df.occupation_genre, dtype=float)

# This cell reassigns user_movie_df from itself: without the drop below, a
# second run would attach the dummy columns again and pandas would rename
# them with _x / _y suffixes, so the names in one_hot_crossed_df.columns
# would no longer exist in user_movie_df. On the first run no column matches
# and the drop is a no-op.
user_movie_df = pd.merge(
    user_movie_df.drop(columns=one_hot_crossed_df.columns, errors='ignore'),
    one_hot_crossed_df,
    left_index=True,
    right_index=True,
)
user_movie_df.head()

In [None]:
# Column groups fed to the model.
# Wide input: the one-hot encoded occupation x genre crosses.
wide_features = one_hot_crossed_df.columns
# Integer ids looked up in the embedding table.
embedded_features = ['occupation_le']
# Deep input: age, gender, then the 19 genre indicator columns.
onehot_features = ['genre_' + str(i) for i in range(19)]
deep_features = ['age_emb', 'gender_emb'] + onehot_features

wide_dim = len(wide_features)
# nn.Embedding accepts indices in [0, num_embeddings), so the table must be
# sized by the largest id + 1. Counting distinct ids instead would undersize
# the table (and fail at lookup time) whenever some encoded occupation does
# not occur in the rating rows.
num_embeddings = int(user_movie_df['occupation_le'].max()) + 1
onehot_dim = len(onehot_features)
target = ['rating']

In [None]:
class DataFrameCustomDataset(Dataset):
    """Map-style dataset serving wide / embedded / deep inputs from a DataFrame.

    The four column groups are converted to tensors once, at construction
    time. Indexing the frame inside ``__getitem__`` (``df[cols].iloc[i]``)
    copies a whole column subset of the frame for every single sample, which
    makes each lookup cost proportional to the size of the frame.

    Because of that one-off conversion the dataset is a snapshot: changes made
    to ``df`` after construction are not reflected in the samples.

    Args:
        df: source frame holding every referenced column.
        wide_features: column labels of the wide input (cast to float32).
        embedded_features: column labels of the integer ids (cast to int64).
        deep_features: column labels of the deep input (cast to float32).
        target: column labels of the regression target (cast to float32).
    """

    def __init__(self, df, wide_features, embedded_features, deep_features, target):
        self.df = df
        self.wide_features = wide_features
        self.embedded_features = embedded_features
        self.deep_features = deep_features
        self.target = target

        # One tensor per column group, shape (len(df), number of columns).
        self._x_wide = self._columns_to_tensor(wide_features, torch.float)
        self._x_embedded = self._columns_to_tensor(embedded_features, torch.long)
        self._x_deep = self._columns_to_tensor(deep_features, torch.float)
        self._y = self._columns_to_tensor(target, torch.float)

    def _columns_to_tensor(self, columns, dtype):
        """Return the selected columns of ``self.df`` as a tensor of ``dtype``."""
        return torch.tensor(self.df[columns].to_numpy(), dtype=dtype)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``((x_wide, x_embedded, x_deep), y)`` for the given position.

        ``index`` is positional (as with ``iloc``). For an integer index each
        tensor is 1-D with one entry per column of its group; an out-of-range
        index raises ``IndexError``. Basic indexing returns views of the
        cached tensors, so callers should not modify them in place.
        """
        x_wide = self._x_wide[index]
        x_embedded = self._x_embedded[index]
        x_deep = self._x_deep[index]
        y = self._y[index]

        return (x_wide, x_embedded, x_deep), y

In [None]:
# Wrap the merged frame as a dataset and serve it in shuffled mini-batches of 64.
user_movie_dataset = DataFrameCustomDataset(
    df=user_movie_df,
    wide_features=wide_features,
    embedded_features=embedded_features,
    deep_features=deep_features,
    target=target,
)
user_movie_dataloader = DataLoader(
    dataset=user_movie_dataset,
    batch_size=64,
    shuffle=True,
)

In [None]:
# Deep network:
#   'Age': Continuous features
#   'Gender': Convert M/F to 0/1
#   'Occupation': Embedding into 8-D features
#   'Genre':    One hot encoded
# Wide network: 
#   occupation x genre
class WideDeepNetwork(nn.Module):
    """Wide & Deep regressor: a linear arm and an MLP arm whose outputs are summed.

    Args:
        wide_dim: width of the wide (one-hot crossed) input.
        num_embeddings: number of rows in the embedding table.
        onehot_dim: number of one-hot columns inside the deep input.
        deep_dim: number of remaining (dense) columns inside the deep input.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, wide_dim, num_embeddings, onehot_dim, deep_dim=2, embedding_dim=8):
        super().__init__()
        # Wide arm: a single linear unit over the crossed features.
        self.wide = nn.Linear(wide_dim, 1, bias=True)
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)

        # Deep arm: three Linear -> BatchNorm -> ReLU stages, then a scalar head.
        # The first width is the concatenation of embedding and deep input.
        widths = [deep_dim + embedding_dim + onehot_dim, 64, 32, 16]
        tower = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            tower.append(nn.Linear(fan_in, fan_out, bias=True))
            tower.append(nn.BatchNorm1d(fan_out))
            tower.append(nn.ReLU())
        tower.append(nn.Linear(widths[-1], 1, bias=True))
        self.deep = nn.Sequential(*tower)

    def forward(self, x):
        """Return wide output + deep output for ``x = (x_wide, x_embedded, x_deep)``.

        x_wide feeds the linear arm. x_embedded holds the ids for the
        embedding table; its dimension 1 is squeezed away after the lookup, so
        a (batch, 1) id tensor yields (batch, embedding_dim) vectors. Those
        vectors are concatenated in front of x_deep along dimension 1 and
        passed through the MLP.
        """
        wide_in, ids, dense_in = x
        id_vectors = self.embedding(ids).squeeze(1)
        deep_in = torch.cat((id_vectors, dense_in), dim=1)
        return self.wide(wide_in) + self.deep(deep_in)

In [None]:
widedeep = WideDeepNetwork(wide_dim, num_embeddings, onehot_dim)
epochs = 100
optimizer = optim.Adam(widedeep.parameters(), lr=1e-3)

# The network contains BatchNorm layers, which behave differently in train
# and eval mode; set the mode explicitly so that re-running this cell after
# an evaluation step still trains with batch statistics.
widedeep.train()

for epoch in range(epochs):
    # Accumulate the squared error weighted by batch size so that the epoch
    # figure is a true per-sample mean, even when the last batch is smaller.
    loss_sum = 0.0
    sample_count = 0
    for (x, y) in user_movie_dataloader:
        y_output = widedeep(x)
        loss = F.mse_loss(y_output, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_size = y.size(0)
        loss_sum += loss.item() * batch_size
        sample_count += batch_size

    # One clearly labelled line per epoch. Printing every batch under the same
    # "Loss=" label as the epoch total made the log ambiguous and very long.
    epoch_loss = loss_sum / max(sample_count, 1)
    print(f"Epoch {epoch + 1}/{epochs}: mean MSE loss={epoch_loss:.4f}")