# Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.get_data import *
from IPython.display import clear_output
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

# Load data

In [54]:
%load_ext autoreload
%autoreload 2

b_data, u_data, train_reviews = get_training_data(verbose=True)
valid_reviews = get_validation_reviews()

# due to quirks with how PyTorch works, I have to subtract one from the stars, 
#   so that they range from [0, 4]
train_reviews['stars'] -= 1
valid_reviews['stars'] -= 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
TYPE: boolean. Changing False -> 0, True -> 1.
Detected NaN in column. Replacing with mean of non-NaN values.

TYPE: boolean. Changing False -> 0, True -> 1.
Detected NaN in column. Replacing with mean of non-NaN values.

TYPE: boolean. Changing False -> 0, True -> 1.
Detected NaN in column. Replacing with mean of non-NaN values.

TYPE: boolean. Changing False -> 0, True -> 1.
Detected NaN in column. Replacing with mean of non-NaN values.

TYPE: numeric.
Detected NaN in column. Replacing with mean of non-NaN values.

TYPE: boolean. Changing False -> 0, True -> 1.
Detected NaN in column. Replacing with mean of non-NaN values.

TYPE: string. Doing one-hot encoding.



# Set up neural net

## Hyperparameters

In [30]:
# Fixed
input_size = len(b_data.columns) + len(u_data.columns)
output_size = 5

# Hyperparameters / architecture choices
num_epochs = 10
batch_size = 100
learning_rate = 0.001

hidden_size = 100
num_hidden_layers = 2
activation = nn.ReLU

## Network code

In [77]:
class FFNN(nn.Module):
    """
    Feed forward neural network.
    """
    
    def __init__(self, in_size, hidden_size, num_hidden_layers, out_size, activation):
        super(FFNN, self).__init__()
        
        self.in_size = in_size
        self.out_size = out_size
        
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        
        layers = OrderedDict()
        layers['linear_in'] = nn.Linear(in_size, hidden_size)
        layers['activ_in'] = activation()
        
        for i in range(0, num_hidden_layers-1):
            layers['linear_%d' % i] = nn.Linear(hidden_size, hidden_size)
            layers['activ_%d' % i] = activation()
            
        layers['linear_out'] = nn.Linear(hidden_size, out_size)
        
        self.model = nn.Sequential(layers)
        
    def forward(self, x):
#         for i, layer in enumerate(self.model.children()):
#             x = layer(x)
            
        return self.model(x)
    
    def predict(self, x, return_probs=False):
        """
        Returns class predictions for each data point in x - a numpy array of shape
            (len(x), ).
        
        If return_probs=True, returns class probabilities instead - as a numpy array 
            of size ( len(x), num_classes ).
        """
        N = x.shape[0]
        
        if isinstance(x, Variable):
            scores = self.model(x).data.numpy()
        elif isinstance(x, torch.Tensor):
            scores = self.model(x).numpy()
        else:
            scores = self.model(torch.FloatTensor(x)).numpy()
            
        probs = np.exp(scores)
        probs /= probs.sum(axis=1).reshape(N, 1)
        
        if return_probs:
            return probs
        
        predictions = probs.argmax(axis=1)
        return predictions
        

## Helper functions for training

In [68]:
def make_arrays(b_data, u_data, reviews):
    """
    Returns X, y as numpy arrays.
    """
    Nr = len(reviews['stars'])
    Db, Du = len(b_data.columns), len(u_data.columns)
    X = np.zeros((Nr, Db+Du))
    y = np.zeros(Nr)
    
    for j, (i, review) in enumerate(reviews.iterrows()):
        u_id = review['user_id']
        b_id = review['business_id']
        y[j] = review['stars']
        
        X[j, :Db] = b_data.loc[b_id].values
        X[j, Db:] = u_data.loc[u_id].values
        
    return X, y

def one_hot_encode(y):
    y_out = np.zeros((y.shape[0], 5))
    
    for i, c in enumerate(y):
        y_out[i, c] = 1
        
    return y_out

def calculate_accuracy(model, reviews, b_data, u_data):
    X, y = make_arrays(b_data, u_data, reviews)
    
    y_pred = np.zeros_like(y)
    for i in range(0, X.shape[0], 100):
        x_in = X[i:i+100]
        
        y_out = model.predict(x_in)
        
        y_pred[i:i+100] = y_out
        
    accuracy = np.mean(np.where(y_pred == y, 1, 0))
    return accuracy

def calculate_mse(model, reviews, b_data, u_data):
    X, y = make_arrays(b_data, u_data, reviews)
    
    y_pred = np.zeros_like(y)
    for i in range(0, X.shape[0], 100):
        x_in = X[i:i+100]
        
        y_out = model.predict(x_in)
        
        y_pred[i:i+100] = y_out
        
    mse = np.mean((y - y_pred)**2)
    return mse

# Train network

In [63]:
model = FFNN(input_size, hidden_size, num_hidden_layers, output_size, activation)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

stats = np.zeros((num_epochs, 3))
N = len(train_reviews['stars'])
num_batches = int(np.ceil(N / batch_size))

for epoch in range(num_epochs):
    # shuffle review order
    shuf_idx = np.random.permutation(N)
    
    print('Epoch [%d/%d]' % (epoch+1, num_epochs))
    losses = []
    for b_num in range(num_batches):
        if (b_num % 500) == 0:
            print('Batch [%d/%d]' % (b_num+1, num_batches))
        # Create batch
        batch_idx = shuf_idx[b_num:b_num+100]
        batch_reviews = train_reviews.iloc[batch_idx]
        
        X_batch, y_batch = make_arrays(b_data, u_data, batch_reviews)
        X_batch = Variable(torch.FloatTensor(X_batch))
        y_batch = torch.LongTensor(y_batch)
        
        # forward pass; compute loss
        outputs = model(X_batch)
        try:
            loss = criterion(outputs, y_batch)
        except RuntimeError:
            print(np.unique(y_batch.numpy()))
            print(epoch, b_num)
            raise
        
        # Backward pass; update parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        losses.append(loss.data.item())
        
    avg_loss = np.mean(losses)
    stats[epoch, 0] = avg_loss
    print('avg_loss: ', avg_loss)
        
#     train_accuracy = calculate_accuracy(model, train_reviews, b_data, u_data)
#     stats[epoch, 1] = train_accuracy
    
#     valid_accuracy = calculate_accuracy(model, valid_reviews, b_data, u_data)
#     stats[epoch, 2] = valid_accuracy
    
    clear_output()
    

    

# See results

In [81]:
#train_mse = calculate_mse(model, train_reviews, b_data, u_data)
#valid_mse = calculate_mse(model, valid_reviews, b_data, u_data)

# mse = calculate_mse(model, train_reviews.iloc[:2], b_data, u_data)

def predict2(self, x, return_probs=False):
    """
    Returns class predictions for each data point in x - a numpy array of shape
        (len(x), ).

    If return_probs=True, returns class probabilities instead - as a numpy array 
        of size ( len(x), num_classes ).
    """
    N = x.shape[0]

    if isinstance(x, Variable):
        scores = self.model(x).data.numpy()
    elif isinstance(x, torch.Tensor):
        scores = self.model(x).numpy()
    else:
        scores = self.model(torch.FloatTensor(x)).data.numpy()

    probs = np.exp(scores)
    probs /= probs.sum(axis=1).reshape(N, 1)

    if return_probs:
        return probs

    predictions = probs.argmax(axis=1)
    return predictions

def calculate_mse2(model, reviews, b_data, u_data):
    X, y = make_arrays(b_data, u_data, reviews)
    
    y_pred = np.zeros_like(y)
    for i in range(0, X.shape[0], 100):
        x_in = X[i:i+100]
        
        y_out = predict2(model, x_in)
        
        y_pred[i:i+100] = y_out
        
    mse = np.mean((y - y_pred)**2)
    return mse

train_mse = calculate_mse2(model, train_reviews, b_data, u_data)
valid_mse = calculate_mse2(model, valid_reviews, b_data, u_data)

print('Final training MSE: ', train_mse)
print('Final validation MSE: ', valid_mse)

Final training MSE:  1.5281165131263645
Final validation MSE:  1.7213091838568604


