In [1]:
import pandas as pd
import numpy as np

import torch
import numpy as np
import pandas as pd
import pickle
import multiprocessing
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


def get_cv_idxs(n, cv_idx=0, val_pct=0.2, seed=42):
    """ Get the index values of one validation fold of a data set.

    The positions ``0 .. n-1`` are shuffled with a generator seeded by
    ``seed``; the fold is the window of ``int(val_pct * n)`` shuffled
    positions starting at ``cv_idx * int(val_pct * n)``.

    Arguments:
        n : int, Total number of elements in the data set.
        cv_idx : int, fold number [idx_start = cv_idx*int(val_pct*n)]
        val_pct : (int, float), fraction of the data set used for validation
        seed : seed value for the RandomState that shuffles the positions

    Returns:
        np.ndarray of integer positions. It is shorter than the fold size
        (possibly empty) when the window runs past the end of the data.
    """
    n_val = int(val_pct * n)
    idx_start = cv_idx * n_val
    # A private RandomState produces the same permutation as seeding the
    # global generator, but does not reset numpy's global random state as a
    # side effect of asking for a validation split.
    idxs = np.random.RandomState(seed).permutation(n)
    return idxs[idx_start:idx_start + n_val]


def split_by_idx(idxs, *a):
    """
    Split each array passed as *a, to a pair of arrays like this (elements selected by idxs,  the remaining elements)
    This can be used to split multiple arrays containing training data to validation and training set.

    Selection is done with a boolean mask, so both halves keep the original
    element order and repeated entries in idxs select an element only once.

    :param idxs [int]: list of indexes selected (may be empty)
    :param a list: list of np.array, each array should have same amount of elements in the first dimension
    :return: list of tuples, each containing a split of corresponding array from *a.
            First element of each tuple is an array composed from elements selected by idxs,
            second element is an array of remaining elements.
            An empty list is returned when no arrays are passed.
    """
    if not a:
        return []

    positions = np.asarray(idxs)
    if positions.size == 0:
        # np.asarray([]) is a float array, which numpy refuses to use as an
        # index; an empty integer array selects nothing, as intended.
        positions = positions.astype(np.intp)

    mask = np.zeros(len(a[0]), dtype=bool)
    mask[positions] = True
    return [(o[mask], o[~mask]) for o in a]


class AutoEncoder(object):
    """
    Trains an Encoder/Decoder pair to reconstruct the rows of a DataFrame.

    The rows of ``data`` are split into a validation part (kept as a single
    tensor on the compute device) and a training part (served in shuffled
    batches of 64 through a DataLoader). Encoder and decoder each get their
    own Adam optimizer and are trained jointly against an MSE reconstruction
    loss. Losses recorded during training are kept in ``train_losses`` and
    ``val_losses``.
    """

    def __init__(self, data, validation_perc=0.2, lr=0.001,
                 intermediate_size=1000, encoded_size=100, device=None):
        """
        :param data: pandas DataFrame, one example per row
        :param validation_perc: fraction of rows held out for validation
        :param lr: learning rate for both Adam optimizers
        :param intermediate_size: width of the hidden layer in both nets
        :param encoded_size: width of the encoded representation
        :param device: torch device (or device string) to run on; defaults to
            CUDA when it is available and to the CPU otherwise
        """
        # Pick the device once so the class also works on machines without a GPU
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device)

        # create training dataloader and validation tensor
        self.data = data
        self.val_idxs = get_cv_idxs(n=data.shape[0], val_pct=validation_perc)
        [(val_frame, self.train)] = split_by_idx(self.val_idxs, data)
        self.dataset = AETrainingData(self.train)
        self.dataloader = DataLoader(self.dataset, batch_size=64, shuffle=True,
                                     num_workers=multiprocessing.cpu_count())
        self.val = self._to_tensor(val_frame)

        # instantiate the encoder and decoder nets
        size = data.shape[1]
        self.encoder = Encoder(
            size, intermediate_size, encoded_size).to(self.device)
        self.decoder = Decoder(
            size, intermediate_size, encoded_size).to(self.device)

        # instantiate the optimizers
        self.encoder_optimizer = optim.Adam(
            self.encoder.parameters(), lr=lr, weight_decay=1e-8)
        self.decoder_optimizer = optim.Adam(
            self.decoder.parameters(), lr=lr, weight_decay=1e-8)

        # instantiate the loss criterion
        self.criterion = nn.MSELoss(reduction='mean')

        self.train_losses = []
        self.val_losses = []

    def _to_tensor(self, frame):
        """Convert a DataFrame to a float32 tensor on this object's device."""
        return torch.from_numpy(frame.values).type(
            torch.FloatTensor).to(self.device)

    def train_step(self, input_tensor, target_tensor):
        """Run one optimisation step on a batch and return its loss as a float."""
        # clear the gradients in the optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Forward pass through
        encoded_representation = self.encoder(input_tensor)
        reconstruction = self.decoder(encoded_representation)

        # Compute the loss
        loss = self.criterion(reconstruction, target_tensor)

        # Compute the gradients
        loss.backward()

        # Step the optimizers to update the model weights
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        # Return the loss value to track training progress
        return loss.item()

    def reset(self, train=True):
        """Put both nets in training mode (train=True) or evaluation mode."""
        # due to dropout the network behaves differently in training and
        # evaluation modes
        self.encoder.train(train)
        self.decoder.train(train)

    def get_val_loss(self, input_tensor, target_tensor):
        """
        Return the reconstruction loss of input_tensor against target_tensor.

        Both nets are switched to evaluation mode and stay there; train_loop
        switches them back at the start of the next batch.
        """
        self.reset(train=False)
        # No gradients are needed for validation, so do not record a graph
        with torch.no_grad():
            decoded = self.decoder(self.encoder(input_tensor))
            loss = self.criterion(decoded, target_tensor)
        return loss.item()

    def train_loop(self, epochs, print_every_n_batches=50):
        """
        Train for ``epochs`` passes over the training DataLoader.

        At every batch index i > 0 that is a multiple of
        print_every_n_batches, the loss of that batch and the loss on the
        validation tensor are printed and appended to train_losses and
        val_losses.
        """
        # Cycle through epochs
        for epoch in range(epochs):
            print(f'Epoch {epoch + 1}/{epochs}')

            # Cycle through batches
            for i, batch in enumerate(self.dataloader):
                # get_val_loss leaves the nets in evaluation mode
                self.reset(train=True)

                input_tensor = batch['input'].to(self.device)
                target_tensor = batch['target'].to(self.device)

                loss = self.train_step(input_tensor, target_tensor)

                if i % print_every_n_batches == 0 and i != 0:
                    val_loss = self.get_val_loss(self.val, self.val)
                    print(f'train loss: {round(loss, 8)} | ' +
                          f'validation loss: {round(val_loss, 8)}')
                    self.train_losses.append(loss)
                    self.val_losses.append(val_loss)

    def get_encoded_representations(self):
        """Encode every row of ``data`` (training and validation rows alike)
        and return the encodings as a numpy array."""
        self.reset(train=False)
        with torch.no_grad():
            encodings = self.encoder(self._to_tensor(self.data))
        return encodings.cpu().numpy()

    def get_decoded_representations(self):
        """Encode and then decode every row of ``data`` and return the
        reconstructions as a numpy array."""
        self.reset(train=False)
        with torch.no_grad():
            encodings = self.encoder(self._to_tensor(self.data))
            decodings = self.decoder(encodings)
        return decodings.cpu().numpy()


class AETrainingData(Dataset):
    """
    Dataset wrapper that serves the rows of a dataframe to the auto encoder.
    Every row is handed out twice, as the input and as the reconstruction
    target, each converted to a float PyTorch tensor.
    """

    def __init__(self, x_train):
        self.x = x_train

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        """
        Returns the example at position idx as a dict of pytorch tensors.
        """
        # The auto encoder reconstructs its own input, so input and target
        # are both built from the same row
        row = self.x.iloc[idx].values

        return {
            'input': torch.from_numpy(row).type(torch.FloatTensor),
            'target': torch.from_numpy(row).type(torch.FloatTensor),
        }


class Encoder(nn.Module):
    """
    Two-stage dense network mapping an input vector to its encoding.
    Each stage is Linear -> BatchNorm1d -> in-place ReLU -> Dropout(0.2).
    """

    def __init__(self, input_size, intermediate_size, encoding_size):
        super().__init__()
        widths = [input_size, intermediate_size, encoding_size]
        stages = []
        # One identical stage per consecutive pair of layer widths
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stages.extend([
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.ReLU(True),
                nn.Dropout(0.2),
            ])
        self.encoder = nn.Sequential(*stages)

    def forward(self, x):
        return self.encoder(x)


class Decoder(nn.Module):
    """
    Dense network mapping an encoding back to a vector of output_size values.
    A hidden stage (Linear -> BatchNorm1d -> in-place ReLU -> Dropout(0.2)) is
    followed by an output stage (Linear -> BatchNorm1d -> Sigmoid).
    """

    def __init__(self, output_size, intermediate_size, encoding_size):
        super().__init__()
        hidden_stage = [
            nn.Linear(encoding_size, intermediate_size),
            nn.BatchNorm1d(intermediate_size),
            nn.ReLU(True),
            nn.Dropout(0.2),
        ]
        output_stage = [
            nn.Linear(intermediate_size, output_size),
            nn.BatchNorm1d(output_size),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*hidden_stage, *output_stage)

    def forward(self, x):
        return self.decoder(x)


In [2]:
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('new_df_matrix.pkl', 'rb') as fh:
    new_df1 = pickle.load(fh)

In [3]:
ndf=new_df1

In [4]:
# Scale every column by its own maximum. Reading from new_df1 keeps the cell
# independent of its own previous output, and the vectorised division avoids
# calling a Python lambda once per column.
ndf = new_df1.div(new_df1.max(axis=0), axis=1)

In [5]:
ndf2=ndf.T

In [6]:
ndf2.shape

(69878, 10677)

In [7]:
#ndf1.shape
ndf2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10667,10668,10669,10670,10671,10672,10673,10674,10675,10676
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.2,0.6,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Training Part I

In [None]:
u_data1=ndf2.head(40000)

In [None]:
ae1 = AutoEncoder(u_data1, validation_perc=0.1, lr=1e-3, intermediate_size=1000, encoded_size=100)

In [None]:
ae1.train_loop(epochs=40)

In [None]:
losses1 = pd.DataFrame(data=list(zip(ae1.train_losses, ae1.val_losses)), columns=['train_loss', 'validation_loss'])
# train_loop records one point at every batch index i > 0 that is a multiple
# of 50 (its default print_every_n_batches), so an epoch of
# len(ae1.dataloader) batches contributes (len(ae1.dataloader) - 1) // 50
# points. Derive the divisor instead of hard-coding it.
points_per_epoch1 = max((len(ae1.dataloader) - 1) // 50, 1)
losses1['epoch'] = (losses1.index + 1) / points_per_epoch1

In [None]:
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'svg'
fig, ax = plt.subplots()
ax.plot(losses1['epoch'], losses1['train_loss'])
ax.plot(losses1['epoch'], losses1['validation_loss'])
ax.set_ylabel('MSE loss')
ax.set_xlabel('epoch')
ax.set_title('autoencoder loss over time')
ax.legend()

In [None]:
encoded_1 = ae1.get_encoded_representations()

In [None]:
with open('autoen_userid_part1_embeddings.pkl', 'wb') as fh:
    pickle.dump(encoded_1, fh)

In [None]:
decoded_1 = ae1.get_decoded_representations()

In [None]:
with open('autoen_userid_part1_decoded_embeddings.pkl', 'wb') as fh:
    pickle.dump(decoded_1, fh)

# Training Part II

In [None]:
# Part II is every row after the first 40000 used in Part I; slicing from
# that position keeps the two parts complementary without a hard-coded count.
ndf3 = ndf2.iloc[40000:]

In [None]:
ae2 = AutoEncoder(ndf3, validation_perc=0.1, lr=1e-3, intermediate_size=1000, encoded_size=100)

In [None]:
ae2.train_loop(epochs=40)

In [None]:
losses2 = pd.DataFrame(data=list(zip(ae2.train_losses, ae2.val_losses)), columns=['train_loss', 'validation_loss'])
# train_loop records one point at every batch index i > 0 that is a multiple
# of 50 (its default print_every_n_batches), so an epoch of
# len(ae2.dataloader) batches contributes (len(ae2.dataloader) - 1) // 50
# points. Derive the divisor instead of hard-coding it.
points_per_epoch2 = max((len(ae2.dataloader) - 1) // 50, 1)
losses2['epoch'] = (losses2.index + 1) / points_per_epoch2

In [None]:
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'svg'
fig, ax = plt.subplots()
ax.plot(losses2['epoch'], losses2['train_loss'])
ax.plot(losses2['epoch'], losses2['validation_loss'])
ax.set_ylabel('MSE loss')
ax.set_xlabel('epoch')
ax.set_title('autoencoder loss over time')
ax.legend()

In [None]:
encoded_2 = ae2.get_encoded_representations()

In [None]:
with open('autoen_userid_part2_embeddings.pkl', 'wb') as fh:
    pickle.dump(encoded_2, fh)

In [None]:
decoded_2 = ae2.get_decoded_representations()

In [None]:
with open('autoen_userid_part2_decoded_embeddings.pkl', 'wb') as fh:
    pickle.dump(decoded_2, fh)

# Load pickle files

In [11]:
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('autoen_userid_part1_embeddings.pkl', 'rb') as fh:
    en1 = pickle.load(fh)

In [12]:
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('autoen_userid_part2_embeddings.pkl', 'rb') as fh:
    en2 = pickle.load(fh)

In [8]:
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('autoen_userid_part1_decoded_embeddings.pkl', 'rb') as fh:
    de1 = pickle.load(fh)

In [9]:
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('autoen_userid_part2_decoded_embeddings.pkl', 'rb') as fh:
    de2 = pickle.load(fh)

In [13]:
en1=pd.DataFrame(en1)
en2=pd.DataFrame(en2)

In [14]:
en1=en1.astype(np.float32)
en2=en2.astype(np.float32)

In [15]:
de1.shape

(40000, 10677)

In [16]:
de2.shape

(29878, 10677)

In [17]:
de1=pd.DataFrame(de1)
de2=pd.DataFrame(de2)

# Combining decoded Parts I and II

In [18]:
# de1 and de2 each carry their own 0-based index; ignore_index renumbers the
# stacked rows so that every row label is unique. pd.concat already returns
# a DataFrame, so no re-wrapping is needed.
new_de = pd.concat([de1, de2], axis=0, ignore_index=True)

In [19]:
new_de.shape

(69878, 10677)

In [20]:
new_de.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10667,10668,10669,10670,10671,10672,10673,10674,10675,10676
0,0.049732,0.22073,0.2628,0.140662,0.290778,0.340142,0.180855,0.832761,0.165737,0.543845,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
1,0.018958,0.030328,0.034226,0.029149,0.04196,0.063139,0.015043,0.138526,0.024242,0.03603,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
2,0.016304,0.023875,0.015128,0.025326,0.025916,0.024211,0.016716,0.174446,0.026571,0.026365,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
3,0.018492,0.430593,0.435108,0.376917,0.737678,0.806885,0.067799,0.724406,0.064136,0.793628,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
4,0.011427,0.042108,0.019168,0.023947,0.023361,0.022289,0.024191,0.089681,0.04427,0.100754,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054


In [21]:
new_usd=new_de.to_numpy()

In [22]:
new_usd.shape

(69878, 10677)

In [23]:
X=ndf2.values

In [24]:
# Keep the reconstructed scores only in cells where the original matrix X is 0
# (presumably the movies a user has not rated - confirm); every cell where X
# is non-zero is multiplied by False and becomes 0.
new_usd2=new_usd * (X==0)

In [None]:
# with open('autoen_userid_final_embeddings.pkl', 'wb') as fh:
#     pickle.dump(new_usd2, fh)

In [25]:
new_usd1=pd.DataFrame(new_usd2)

In [26]:
new_usd1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10667,10668,10669,10670,10671,10672,10673,10674,10675,10676
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
1,0.018958,0.030328,0.034226,0.029149,0.04196,0.063139,0.015043,0.138526,0.024242,0.03603,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
2,0.016304,0.023875,0.015128,0.025326,0.025916,0.024211,0.016716,0.174446,0.026571,0.026365,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
3,0.018492,0.430593,0.0,0.0,0.0,0.0,0.067799,0.724406,0.064136,0.0,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054
4,0.011427,0.042108,0.019168,0.023947,0.023361,0.022289,0.024191,0.089681,0.04427,0.100754,...,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054,0.015054


In [27]:
ndf2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10667,10668,10669,10670,10671,10672,10673,10674,10675,10676
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.2,0.6,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
print(new_usd1.values.min(), new_usd1.values.max())

0.0 0.99998736


In [None]:
# matrix of userid and ratingid before calculating the similarity score

In [29]:
new_m=pd.read_csv('movie_demographics.csv')

In [30]:
new_m.head()

Unnamed: 0,movieId,title,genres,ratings_count,avg_rating
0,0,Boomerang (1992),Comedy|Romance,2412,2.861318
1,1,"Net, The (1995)",Action|Crime|Thriller,14975,3.125209
2,2,Dumb & Dumber (1994),Comedy,17851,2.93695
3,3,Outbreak (1995),Action|Drama|Sci-Fi|Thriller,16075,3.418414
4,4,Stargate (1994),Action|Adventure|Sci-Fi,18925,3.349353


In [31]:
# Passing axis positionally to drop() was removed in pandas 2.0; name the
# columns explicitly. errors='ignore' lets the cell be re-run after the
# columns are already gone.
new_m = new_m.drop(columns=['ratings_count', 'avg_rating'], errors='ignore')


In [32]:
def recommender_for_user(user_id, interact_matrix, df_content, topn = 10):
    '''
    Recommend the highest-scoring movies for one user.

    :param user_id: index label of the user's row in interact_matrix
    :param interact_matrix: DataFrame with one row per user; its column
            labels are used as movieId values and its cells as scores
    :param df_content: DataFrame with a 'movieId' column; its other columns
            are attached to the result
    :param topn: maximum number of movies to return
    :return: DataFrame with 'movieId', 'rating' and the df_content columns,
            sorted by descending rating, limited to topn rows and to
            ratings greater than 0
    '''
    pred_scores = interact_matrix.loc[user_id].values

    # Take the movie ids from the matrix that is being scored rather than
    # from a notebook-level global, so the function works for any matrix.
    df_scores   = pd.DataFrame({'movieId': list(interact_matrix.columns),
                               'rating': pred_scores})

    df_rec = pd.merge(df_scores, df_content, on="movieId")
    df_rec = df_rec.sort_values('rating', ascending=False).head(topn)

    return df_rec[df_rec['rating'] > 0]

In [33]:
recommender_for_user(
    user_id         = 26000, 
    #interact_matrix = new_usd1,
    interact_matrix = ndf2,
    df_content      = new_m)

Unnamed: 0,movieId,rating,title,genres
798,798,1.0,"Lord of the Rings: The Fellowship of the Ring,...",Action|Adventure|Fantasy
64,64,1.0,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Fantasy
244,244,1.0,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller
268,268,1.0,Double Indemnity (1944),Crime|Drama|Film-Noir
2370,2370,1.0,Gentlemen Prefer Blondes (1953),Comedy|Musical|Romance
217,217,1.0,Singin' in the Rain (1952),Comedy|Musical|Romance
426,426,1.0,Grease (1978),Comedy|Musical|Romance
60,60,1.0,"Lord of the Rings: The Two Towers, The (2002)",Action|Adventure|Fantasy
2090,2090,1.0,Pride & Prejudice (2005),Drama|Romance
63,63,1.0,Pirates of the Caribbean: The Curse of the Bla...,Action|Adventure|Comedy|Fantasy


In [34]:
recommender_for_user(
    user_id         = 10070, 
    interact_matrix = new_usd1,
    #interact_matrix = ndf2,
    df_content      = new_m)

Unnamed: 0,movieId,rating,title,genres
1466,1466,0.379001,Fantasia (1940),Animation|Children|Fantasy|Musical
193,193,0.328887,Shakespeare in Love (1998),Comedy|Drama|Romance
426,426,0.325006,Grease (1978),Comedy|Musical|Romance
156,156,0.318019,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi
1185,1185,0.27421,Dumbo (1941),Animation|Children|Drama|Musical
177,177,0.269206,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi
3116,3116,0.267058,Sleeping Beauty (1959),Animation|Children|Musical
178,178,0.238268,"Princess Bride, The (1987)",Action|Adventure|Comedy|Fantasy|Romance
1434,1434,0.227323,"Hunchback of Notre Dame, The (1996)",Animation|Children|Musical
96,96,0.215917,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [35]:
recommender_for_user(
    user_id         = 10023, 
    #interact_matrix = new_usd1,
    interact_matrix = ndf2,
    df_content      = new_m)

Unnamed: 0,movieId,rating,title,genres
74,74,1.0,Apollo 13 (1995),Adventure|Drama
96,96,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,3,1.0,Outbreak (1995),Action|Drama|Sci-Fi|Thriller
4,4,1.0,Stargate (1994),Action|Adventure|Sci-Fi
81,81,1.0,"Santa Clause, The (1994)",Comedy|Drama|Fantasy
22,22,1.0,Braveheart (1995),Action|Drama|War
175,175,1.0,"Fugitive, The (1993)",Thriller
18,18,1.0,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
26,26,1.0,Dances with Wolves (1990),Adventure|Drama|Western
94,94,1.0,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance


In [None]:
10023
10027