In [115]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch as tr

## MovieLens 1M Preprocessing

The first dataset that will be used to train and test recommendation systems is the MovieLens 1M dataset. Fields for the dataset must be convereted into vectors in order to be properly inputed into a Transformer model. Thus, ennumerating non-numerical variables is essential for the second part, input embeddings. 

#### Users

In [116]:
users = pd.read_csv("data/users.dat", sep="::", header=None, engine="python", encoding="latin-1")
users.columns = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
users['Gender'] = users['Gender'].apply(lambda sex: sex == 'M')
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,False,1,10,48067
1,2,True,56,16,70072
2,3,True,25,15,55117
3,4,True,45,7,2460
4,5,True,25,20,55455


In [117]:
print("Number of users x features:", users.shape)

Number of users x features: (6040, 5)


#### Movies

In [118]:
movies = pd.read_csv("data/movies.dat", sep="::", names=["MovieID", "Movie", "Genre"], encoding='latin-1')
movies['Genre'] = movies['Genre'].copy().str.split("|")


new_movies = movies.explode('Genre')
ennumerate_genre = pd.crosstab(new_movies.Movie, new_movies.Genre).reset_index()
movie_indicies = movies[['MovieID', "Movie"]].copy()
enumerated_movies = movie_indicies.merge(ennumerate_genre, on='Movie')
enumerated_movies.head()

Unnamed: 0,MovieID,Movie,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Ratings

In [119]:
np.ones(4)

array([1., 1., 1., 1.])

In [120]:
ratings = pd.read_csv("data/ratings.dat", sep="::", names=['UserID', 'MovieID', 'Rating', 'time'],encoding='latin-1')
ratings['interaction'] = np.ones(len(ratings))
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,time,interaction
0,1,1193,5,978300760,1.0
1,1,661,3,978302109,1.0
2,1,914,3,978301968,1.0
3,1,3408,4,978300275,1.0
4,1,2355,5,978824291,1.0


#### Combined

In [121]:
ratings_plus_movie = ratings.merge(enumerated_movies, on='MovieID')
movies_combined = users.merge(ratings_plus_movie, on='UserID')

### Can get rid of Movie probably
movies_combined.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code,MovieID,Rating,time,interaction,Movie,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,False,1,10,48067,1193,5,978300760,1.0,One Flew Over the Cuckoo's Nest (1975),...,0,0,0,0,0,0,0,0,0,0
1,1,False,1,10,48067,661,3,978302109,1.0,James and the Giant Peach (1996),...,0,0,0,1,0,0,0,0,0,0
2,1,False,1,10,48067,914,3,978301968,1.0,My Fair Lady (1964),...,0,0,0,1,0,1,0,0,0,0
3,1,False,1,10,48067,3408,4,978300275,1.0,Erin Brockovich (2000),...,0,0,0,0,0,0,0,0,0,0
4,1,False,1,10,48067,2355,5,978824291,1.0,"Bug's Life, A (1998)",...,0,0,0,0,0,0,0,0,0,0


## Train-Test Split

Following from the train test split process of BERT4Rec, each movie watched by a single user will be grouped and order to become the historical session for that particular user. For each user, we will take out the most recent movie that was watched and save it for the testing set, and we will remove the second most recent movie to be used for validation set. 

In [127]:
test_val = movies_combined.sort_values(by='time').groupby('UserID').tail(2).reset_index(drop=True)
test = test_val.groupby('UserID').tail(1).reset_index(drop=True)
val = test_val.groupby('UserID').head(1).reset_index(drop=True)

## Input Embeddings

After the process of getting the data into the proper format, it is time to process the fields so that the Transformer model can be properly trained.