In [1]:
import glob
import re
import numpy as np
import pandas as pd

<h3>File Parser</h3>
<p>My function for injesting the Netflix dataset text files:</p>

In [49]:
files = glob.glob('/Users/frjo6001/Desktop/netflix-prize-data/combined_*.txt')

def netflixDataParse( file_path ):
    movieExt = re.compile('(\d+)\:\n')
    dataExt  = re.compile('(\d+),(\d+),(\d{4}\-\d{2}-\d{2})')
    
    last_movie = None
    data=[]
    for line in open(file_path).readlines():
        movie = movieExt.findall(line)
        review= dataExt.findall(line)
        if len(movie) > 0: last_movie = int(movie[0])
        elif len(review) > 0:
            review = review[0]
            data.append((int(review[0]), last_movie, int(review[1]), review[2]))
    
    return pd.DataFrame(data=data, columns=['userID', 'movieID', 'rating','date'])
        

In [137]:
%time d=netflixDataParse(files[0])

CPU times: user 1min 58s, sys: 18 s, total: 2min 16s
Wall time: 2min 21s


This works, but text file i/o is slow. We can spead this up by going to a binary file format!

In [51]:
sorted(files)

['/Users/frjo6001/Desktop/netflix-prize-data/combined_data_1.txt',
 '/Users/frjo6001/Desktop/netflix-prize-data/combined_data_2.txt',
 '/Users/frjo6001/Desktop/netflix-prize-data/combined_data_3.txt',
 '/Users/frjo6001/Desktop/netflix-prize-data/combined_data_4.txt']

In [52]:
p='/Users/frjo6001/Desktop/netflix-prize-data/data_bin/{0}_data/df_{0}_{1}.pkl'

for i,f in enumerate(sorted(files)):
    d = netflixDataParse(f).astype({'rating':np.uint8, 
                                    'userID':np.uint32, 
                                    'movieID':np.uint32}
    indx = np.arange(d.shape[0])
    
    np.random.seed(0)
    np.random.shuffle(indx)
    point = round(indx.shape[0]*.8)
    
    df_train, df_test = d.iloc[0:point], d.iloc[point:]
    df_train.to_pickle(p.format('train',i),compression='bz2')
    df_test.to_pickle(p.format('test',i),compression='bz2')

<h3>Pytorch Dataloader ABC</h3>
<p>To use PyTorch built in tools for data loading we need to implement our own "\__len__" and "\__getitem__" methods. See doccumentation in torch.utils.data.dataset.Dataset
</p>

In [108]:
import torch
import torch.utils.data

class NetflixDS( torch.utils.data.dataset.Dataset ):
    def __init__(self, path):
        super().__init__()
        self.path = path
        self.df   = None
    
    def load(self): 
        d = pd.read_pickle(self.path, compression='bz2')
        self.df = d[['userID','movieID','rating']]
        self.df = self.df.astype({'rating':np.uint8, 
                                  'userID':np.uint32, 
                                  'movieID':np.uint32}
                                )
    def clear(self):
        del self.df
        self.df = None
        
    def __len__(self):
        if not isinstance(self.df, pd.DataFrame ):
            self.load()
        return self.df.shape[0]
    
    def __getitem__(self, index):
        if not isinstance(self.df, pd.DataFrame ):
            self.load()          
        return self.df.iloc[index].to_dict()

In [109]:
bin_file_paths = glob.glob('/Users/frjo6001/Desktop/netflix-prize-data/data_bin/train_data/*.pkl')
df = torch.utils.data.ConcatDataset([NetflixDS(d) for d in bin_file_paths])

In [110]:
unique_users  = set()
unique_movies = set()

for ds in df.datasets:
    unique_users = unique_users.union( set(ds.df.userID.unique()) )
    unique_movies = unique_movies.union( set(ds.df.movieID.unique()) )
    
unique_users = {u:i for i,u in enumerate(unique_users)}
unique_movies = {u:i for i,u in enumerate(unique_movies)}

for ds in df.datasets:
    ds.df.userID = ds.df.userID.apply(lambda x: unique_users[x])
    ds.df.movieID = ds.df.movieID.apply(lambda x: unique_movies[x])

In [111]:
dataLoader = torch.utils.data.DataLoader(df, batch_size=500, num_workers=2)

In [112]:
class NetflixRecom(torch.nn.Module):
    def __init__(self, unique_users, unique_movies):
        super().__init__()
        self.userEmbed  = torch.nn.Embedding(len(unique_users)+1,8)
        self.movieEmbed = torch.nn.Embedding(len(unique_movies)+1,8)
        
        self.batch_norm = torch.nn.BatchNorm1d(1)
    
    def forward(self,**b):
        e_u = self.userEmbed(b['userID'].view(-1,1))
        e_m = self.movieEmbed(b['movieID'].view(-1,1))
        
        p = self.batch_norm(torch.matmul( e_u, e_m.transpose(1,2)))
        
        return p
        

In [113]:
myMod = NetflixRecom(unique_users, unique_movies)
opt   = torch.optim.SGD(myMod.parameters(),lr=1.e-3)

loss_fn = torch.nn.MSELoss()

In [114]:
import tqdm
netloss = 0
for i, b in enumerate(tqdm.tqdm_notebook(dataLoader)):
    preds = myMod(**b)
    loss = loss_fn(preds.view(-1,1),b['rating'].unsqueeze(-1).float())
    loss.backward()
    netloss += loss.item()
    if i%10000==0: print("Step {} of {}: {}".format(i+1,len(dataLoader), round(netloss/(i+1),2)))
    opt.step()
    myMod.zero_grad()
    

HBox(children=(IntProgress(value=0, max=160769), HTML(value='')))

Step 1 of 160769: 12.29
Step 10001 of 160769: 1.51
Step 20001 of 160769: 1.35
Step 30001 of 160769: 1.27
Step 40001 of 160769: 1.25


Process Process-63:
Process Process-64:
Traceback (most recent call last):
  File "/Users/frjo6001/.pyenv/versions/3.6.5/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()


KeyboardInterrupt: 

  File "/Users/frjo6001/.pyenv/versions/3.6.5/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/frjo6001/.pyenv/versions/3.6.5/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/Users/frjo6001/.pyenv/versions/3.6.5/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 57, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/Users/frjo6001/.pyenv/versions/3.6.5/lib/python3.6/site-packages/torch/utils/data/dataset.py", line 81, in __getitem__
    return self.datasets[dataset_idx][sample_idx]
  File "<ipython-input-108-e0fcb1636d31>", line 27, in __getitem__
    return self.df.iloc[index].to_dict()
  File "/Users/frjo6001/.pyenv/versions/3.6.5/lib/python3.6/site-packages/pandas/core/indexing.py", line 1478, in __getitem__
    return self._getitem_axis(maybe_callable, axis=

In [123]:
torch.cud

TypeError: 'module' object is not callable