In [1]:
import os 
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px

ROOT_DIR = os.getcwd()
DATA_DIR = os.path.join(ROOT_DIR, 'ml-1m/')
STATE_SIZE = 10

In [2]:
#Loading datasets
ratings_list = [i.strip().split("::") for i in open(os.path.join(DATA_DIR,'ratings.dat'), 'r').readlines()]
users_list = [i.strip().split("::") for i in open(os.path.join(DATA_DIR,'users.dat'), 'r').readlines()]
movies_list = [i.strip().split("::") for i in open(os.path.join(DATA_DIR,'movies.dat'),encoding='latin-1').readlines()]
ratings_df = pd.DataFrame(ratings_list, columns = ['UserID', 'MovieID', 'Rating', 'Timestamp'], dtype = np.uint32)
movies_df = pd.DataFrame(movies_list, columns = ['MovieID', 'Title', 'Genres'])
movies_df['MovieID'] = movies_df['MovieID'].apply(pd.to_numeric)
users_df = pd.DataFrame(users_list, columns=['UserID','Gender','Age','Occupation','Zip-code'])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
movies_id_to_movies = {movie[0]: movie[1:] for movie in movies_list}
len(movies_list)

movies_df.head(5)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
genres = ['Action',
        'Adventure',
        'Animation',
        "Children's",
        'Comedy',
        'Crime',
        'Documentary',
        'Drama',
        'Fantasy',
        'Film-Noir',
        'Horror',
        'Musical',
        'Mystery',
        'Romance',
        'Sci-Fi',
        'Thriller',
        'War',
        'Western']

movies_genres_df = movies_df[['MovieID', 'Genres']]

def _split_and_index(string):
    string = string.split('|')
    for i, s in enumerate(string):
        string[i] = genres.index(s)
    return string

movies_genres_df['Genres'] = movies_genres_df['Genres'].map(lambda x : _split_and_index(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
movies_genres_df.head(5)
movies_genres_df.isnull().sum()
movies_genres_df['MovieID'].nunique()

3883

In [7]:
user_movie_rating_df = ratings_df[['UserID','MovieID','Rating']]
user_movie_rating_df.isnull().sum()

user_movie_rating_df['UserID'].nunique()

user_movie_rating_df['MovieID'].nunique()

3706

In [14]:
import torch.nn as nn
import torch
import torch.optim as optim

class Embedding_Network(nn.Module):
    def __init__(self, item_num,user_num, embedding_dim, padding_idx=0):
        super(Embedding_Network, self).__init__()
        self.item_num = user_num # item_num
        self.user_num = item_num # user_num
        self.embedding_dim = embedding_dim
        self.u_embedding = nn.Embedding(self.user_num, self.embedding_dim, padding_idx=padding_idx)
        self.m_embedding = nn.Embedding(self.item_num, self.embedding_dim, padding_idx=padding_idx)


        self.m_u_fc = nn.Linear(self.embedding_dim,1)
        #self.sig = nn.Sigmoid(inplace=True)


    def forward(self, data):
        uemb = self.u_embedding(data[0])
        memb = self.u_embedding(data[1])
        m_u = self.m_u_fc(torch.matmul(uemb,memb))
        return nn.Sigmoid(m_u)

In [17]:
movies_genres_dict = {m : g for m, g in zip(movies_genres_df["MovieID"],movies_genres_df["Genres"])}
positive_m_g_pairs = []
negative_m_g_pairs = []
for movie in movies_genres_df["MovieID"]:
    for i in range(len(genres)):
        if i in movies_genres_dict[movie]:
            positive_m_g_pairs.append((movie, i, 1))
        else:
            negative_m_g_pairs.append((movie, i, 0))

In [20]:
def generate_movie_genre_batch(positive_pairs, negative_pairs, batch_size, negative_ratio=0.5):
    
    batch = np.zeros((batch_size, 3))
    num_of_positive = batch_size-int(batch_size*negative_ratio)
    
    while True:
        idx = np.random.choice(len(positive_pairs), num_of_positive)
        positive_data = np.array(positive_pairs)[idx]
        for i, data in enumerate(positive_data):
            batch[i] = data
        
        idx = np.random.choice(len(negative_pairs), int(batch_size*negative_ratio))
        negative_data = np.array(negative_pairs)[idx]
        for i, data in enumerate(negative_data):
            batch[num_of_positive+i] = data
        
        np.random.shuffle(batch)
        yield batch[:,0], batch[:,1], batch[:,2]
        
user_movie_rating_df = user_movie_rating_df.apply(np.int32)
user_movie_rating_df.head()

modified_user_movie_rating_df = user_movie_rating_df.apply(np.int32)
index_names = modified_user_movie_rating_df[modified_user_movie_rating_df['Rating']<4].index
modified_user_movie_rating_df = modified_user_movie_rating_df.drop(index_names)
modified_user_movie_rating_df = modified_user_movie_rating_df.drop('Rating', axis=1)
u_m_pairs = modified_user_movie_rating_df.to_numpy()
u_m_pairs[:5]

positive_user_movie_dict = {u : [] for u in range(1, max(modified_user_movie_rating_df['UserID'])+1)}
for data in modified_user_movie_rating_df.iterrows():
    positive_user_movie_dict[data[1][0]].append(data[1][1])
positive_user_movie_dict[1]

[1193,
 3408,
 2355,
 1287,
 2804,
 594,
 919,
 595,
 938,
 2398,
 2918,
 1035,
 2791,
 2018,
 3105,
 2797,
 1270,
 527,
 48,
 1097,
 1721,
 1545,
 2294,
 3186,
 1566,
 588,
 1907,
 783,
 1836,
 1022,
 2762,
 150,
 1,
 1961,
 1962,
 2692,
 260,
 1028,
 1029,
 1207,
 2028,
 531,
 3114,
 608,
 1246]

In [23]:
def generate_user_movie_batch(positive_pairs, batch_size, negative_ratio=0.5):
    batch = np.zeros((batch_size, 3))
    positive_batch_size = batch_size - int(batch_size*negative_ratio)
    max_user_id = max(modified_user_movie_rating_df['UserID'])+1
    max_movie_id = max(modified_user_movie_rating_df['MovieID'])+1
    
    while True:
        idx = np.random.choice(len(positive_pairs), positive_batch_size)
        data = positive_pairs[idx]
        for i, d in enumerate(data):
            batch[i] = (d[0], d[1], 1)
        
        while i+1 < batch_size:
            u = np.random.randint(1, max_user_id)
            m = np.random.randint(1, max_movie_id)
            if m not in positive_user_movie_dict[u]:
                i += 1
                batch[i] = (u, m, 0)
        
        np.random.shuffle(batch)
        yield batch[:,0], batch[:,1], batch[:,2]

LEN_MOVIES = max(movies_genres_df["MovieID"])+1
LEN_GENRES = len(genres)+1
LEN_USERS = max(user_movie_rating_df['UserID'])+1
EMBEDDING_SIZE = 100

MAX_EPOCH = 150
INIT_USER_BATCH_SIZE = 64
FINAL_USER_BATCH_SIZE = 1024

Embedding_Network(
  (u_embedding): Embedding(3953, 100, padding_idx=0)
  (m_embedding): Embedding(6041, 100, padding_idx=0)
  (m_u_fc): Linear(in_features=100, out_features=1, bias=True)
)


In [25]:
import torch.nn.functional as F
class DenseNet(nn.Module):

    def __init__(self, n_users, n_items, n_factors, H1, D_out):
        """
        Simple Feedforward with Embeddings
        """
        super().__init__()
   	# user and item embedding layers
        self.user_factors = torch.nn.Embedding(n_users, n_factors,
                                               sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors,
                                               sparse=True)
   	# linear layers
        self.linear1 = torch.nn.Linear(n_factors*2, H1)
        self.linear2 = torch.nn.Linear(H1, D_out)

    def forward(self, users, items):
        users_embedding = self.user_factors(users)
        items_embedding = self.item_factors(items)
	# concatenate user and item embeddings to form input
        x = torch.cat([users_embedding, items_embedding], 1)
        h1_relu = F.relu(self.linear1(x))
        output_scores = self.linear2(h1_relu)
        return output_scores

    def predict(self, users, items):
        # return the score
        output_scores = self.forward(users, items)
        return output_scores

D_out = 1
H1 = 128
# u_m_model = Embedding_Network(item_num =LEN_MOVIES ,user_num=LEN_USERS ,embedding_dim=EMBEDDING_SIZE)
model = DenseNet(LEN_USERS ,LEN_MOVIES ,EMBEDDING_SIZE,H1,D_out)
print(model)

DenseNet(
  (user_factors): Embedding(6041, 100, sparse=True)
  (item_factors): Embedding(3953, 100, sparse=True)
  (linear1): Linear(in_features=200, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=1, bias=True)
)


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not numpy.ndarray

In [32]:
loss_fn = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

test_losses = []

for epoch in range(MAX_EPOCH):
    batch_size = INIT_USER_BATCH_SIZE * (epoch+1)
    if batch_size > FINAL_USER_BATCH_SIZE:
        batch_size = FINAL_USER_BATCH_SIZE
    u_m_generator = generate_user_movie_batch(u_m_pairs, batch_size)
    for step in range(len(user_movie_rating_df)//batch_size):
        # embedding layer update
        u_batch, m_batch, u_m_label_batch = next(u_m_generator)
        model.zero_grad()
        
        u_batch = torch.tensor(u_batch)
        m_batch = torch.tensor(m_batch)
        u_m_label_batch = torch.tensor(u_m_label_batch)
        ratings_scores = model(u_batch,m_batch)
        loss = loss_fn(ratings_scores,u_m_label_batch)
        loss.backward()
        optimizer.step()
    

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.DoubleTensor instead (while checking arguments for embedding)