<a href="https://colab.research.google.com/github/kairamilanifitria/DeepLearning/blob/main/Deep_Learning_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [1]:
# To store the data
import pandas as pd

# To do linear algebra
import numpy as np

# To create plots
import matplotlib.pyplot as plt

# To create interactive plots
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

# To shift lists
from collections import deque

# To compute similarities between vectors
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# To use recommender systems
import surprise as sp
from surprise.model_selection import cross_validate

# To create deep learning models
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# To create sparse matrices
from scipy.sparse import coo_matrix

# To light fm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k

# To stack sparse matrices
from scipy.sparse import vstack

Load Movie-Data

In [2]:
# Load data for all movies
movie_titles = pd.read_csv('/content/drive/MyDrive/research/movies/netflix/movie_titles.csv', on_bad_lines='skip',
                           encoding = 'ISO-8859-1',
                           header = None,
                           names = ['Id', 'Year', 'Name']).set_index('Id')

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.sample(5)

Shape Movie-Titles:	(17434, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
12219,2000.0,Dragon Ball Z: Vegeta Saga 1
14137,1995.0,Johnny Mnemonic
16369,1978.0,Little Feat: Rockpalast Live
14614,2003.0,Gate Keepers 21
2646,1999.0,Ice from the Sun


In [3]:
# Load a movie metadata dataset
movie_metadata = pd.read_csv('/content/drive/MyDrive/research/movies/movies_metadata.csv', low_memory=False)[['original_title', 'overview', 'vote_count']].set_index('original_title').dropna()
# Remove the long tail of rarly rated moves
movie_metadata = movie_metadata[movie_metadata['vote_count']>10].drop('vote_count', axis=1)

print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(5)

Shape Movie-Metadata:	(21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
Klimt,A portrait of Austrian artist Gustav Klimt who...
Экипаж,"A story about bravery, self-sacrifice and huma..."
दंगल,Dangal is an extraordinary true story based on...
Black Dynamite,This is the story of 1970s African-American ac...
Nomads,French anthropologist Jean-Charles Pommier and...


In [4]:
# Load a movie metadata dataset
movie_metadata = pd.read_csv('/content/drive/MyDrive/research/movies/movies_metadata.csv', low_memory=False)[['original_title', 'overview', 'vote_count']].set_index('original_title').dropna()
# Remove the long tail of rarly rated moves
movie_metadata = movie_metadata[movie_metadata['vote_count']>10].drop('vote_count', axis=1)

print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(5)

Shape Movie-Metadata:	(21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
At First Sight,A blind man has an operation to regain his sig...
Castaway,Middle-aged Gerald Kingsland advertises in a L...
The Cameraman,Buster is a tintype portrait photographer who ...
La battaglia di Algeri,Tracing the struggle of the Algerian Front de ...
Open Water,Two divers are left out at sea without a boat....


Load User-Data And Preprocess Data-Structure

In [5]:
# Load single data-file
df_raw = pd.read_csv('/content/drive/MyDrive/research/movies/netflix/combined_data_1.txt', header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])


# Find empty rows to slice dataframe for each movie
tmp_movies = df_raw[df_raw['Rating'].isna()]['User'].reset_index()
movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
shifted_movie_indices = deque(movie_indices)
shifted_movie_indices.rotate(-1)


# Gather all dataframes
user_data = []

# Iterate over all movies
for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):

    # Check if it is the last movie in the file
    if df_id_1<df_id_2:
        tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = df_raw.loc[df_id_1+1:].copy()

    # Create movie_id column
    tmp_df['Movie'] = movie_id

    # Append dataframe to list
    user_data.append(tmp_df)

# Combine all dataframes
df = pd.concat(user_data)
del user_data, df_raw, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
18433366,286783,4.0,2005-10-05,3526
10012314,968039,3.0,2004-08-27,1962
22396149,1432211,3.0,2005-06-03,4256
17950470,1773651,5.0,2005-10-02,3427
16906949,1246721,5.0,2001-01-21,3276


Movies Release

In [10]:
# Get data
data = movie_titles['Year'].value_counts().sort_index()

# Create trace
trace = go.Scatter(x = data.index,
                   y = data.values,
                   marker = dict(color = '#db0000'))
# Create layout
layout = dict(title = '{} Movies Grouped By Year Of Release'.format(movie_titles.shape[0]),
              xaxis = dict(title = 'Release Year'),
              yaxis = dict(title = 'Movies'))

# Create plot
fig = go.Figure(data=[trace], layout=layout)
fig.show()
fig.write_html("plot.html")
