# Netflix dataset

In [None]:
import os
import random
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy

from datetime import datetime
from itertools import combinations

import sys
sys.path.append('modules')

import netflix_module as netflix
import graph_module as graph
import loocv_module as cv

In [None]:
# Use a larger default font size for every figure produced below.
plt.rcParams['font.size'] = 16

In [None]:
# Directory holding the Netflix data files. It is used below through plain
# string concatenation (e.g. dir_path+'data.csv'), so it must end with a path
# separator. The location can be overridden with the NETFLIX_DATA_DIR
# environment variable; when that variable is unset the original local path
# is used, so existing setups keep working unchanged.
dir_path = os.path.join(
    os.environ.get(
        'NETFLIX_DATA_DIR',
        '/Users/eglantine.karle/Docs/These/Ranking/LS approach/Code/Revision/netflix_data/',
    ),
    '',  # joining with '' guarantees exactly one trailing separator
)

# Extract data (run only the first time to create data files)

In [None]:
# One-off extraction: merge the four raw rating files into a single 'data.csv'
# whose rows are "<movie id>,<content of the rating line>".
# The whole step is skipped when 'data.csv' already exists.
if not os.path.isfile(dir_path+'data.csv'):
    files = [
        dir_path+'combined_data_1.txt',
        dir_path+'combined_data_2.txt',
        dir_path+'combined_data_3.txt',
        dir_path+'combined_data_4.txt'
    ]
    # Write to a temporary file and rename it only once everything succeeded:
    # the guard above only tests for existence, so a partially written
    # 'data.csv' left behind by an interrupted run would otherwise be reused.
    tmp_path = dir_path+'data.csv.tmp'
    # Id of the movie the current rating lines belong to (None until the
    # first "<id>:" header has been read).
    movie_id = None
    with open(tmp_path, mode='w') as data:
        for file in files:
            print("Reading ratings from {}\n".format(file))
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Blank lines carry no rating; skip them instead of
                        # emitting a malformed "<movie id>," row.
                        continue
                    if line.endswith(':'):
                        # All below are ratings for this movie, until another movie appears.
                        movie_id = line.replace(':', '')
                    elif movie_id is None:
                        raise ValueError(
                            "Rating line found before any movie header in {}".format(file)
                        )
                    else:
                        # Prefix the rating line with the id of its movie.
                        data.write(movie_id + ',' + line + '\n')
    # Publish the finished file under its final name.
    os.replace(tmp_path, dir_path+'data.csv')

In [None]:
# Load the merged ratings; the file has no header row, so column names are
# supplied explicitly.
print("Creating the dataframe from data.csv file")
df = pd.read_csv(
    dir_path+'data.csv',
    sep=',',
    names=['movie', 'user', 'rating', 'date'],
)
# Parse the date strings, then keep only monthly resolution.
df['date'] = pd.to_datetime(df['date']).dt.to_period('M')

In [None]:
# Order the ratings chronologically (by monthly period). The `dates` array
# built further down keeps the order of first appearance, so this sort is what
# makes it chronological.
print('Sorting the dataframe by Date')
df = df.sort_values(by='date')

In [None]:
# Preview the first rows of the ratings table.
df.head()

In [None]:
# Distinct monthly periods, in order of first appearance in df.
dates = df['date'].unique()
dates

In [None]:
# Keep only the first two columns of the headerless, ';'-separated titles file.
# NOTE(review): these two columns are labelled 'Year' and 'Movie' — confirm
# this matches the actual column layout of movie_titles.csv.
movie_id = (
    pd.read_csv(dir_path+'movie_titles.csv', sep=';', header=None)
    .iloc[:, [0, 1]]
)
movie_id.columns = ['Year', 'Movie']
movie_id.head()

# Dataset with $N=100$ movies (run only the first time to create data files)

In [None]:
# Metadata of the 100 selected movies. Read from dir_path (rather than from a
# path relative to the working directory) so that this cell and the analysis
# section, which loads the same file from dir_path, always agree.
titles_100 = pd.read_csv(dir_path+'titles_100_movies.csv', sep=';')
titles_100.head()

In [None]:
# Movie identifiers to look up in the ratings table: the distinct ids of the
# titles file, each shifted by one.
l_movies = [title_id + 1 for title_id in titles_100['Id'].unique()]

In [None]:
# Create the list of dates to merge in order to get all connected graphs.
#
# Consecutive dates are accumulated into a group until the comparison graph of
# the group is connected; the next date then opens a new group.
# Outputs:
#   list_A       -- one (N, N) 0/1 adjacency matrix per group
#   merged_dates -- the list of dates forming each group (same order as list_A)
# The last group may remain disconnected if the dates run out first.

N = 100
df_100 = df.loc[df.movie.isin(l_movies)]
# Row/column of each movie in the adjacency matrices (its position in l_movies).
movie_pos = {movie: pos for pos, movie in enumerate(l_movies)}


def _comparison_graph(group_dates):
    """Return the (N, N) adjacency matrix linking every pair of distinct
    movies rated during any of the dates in `group_dates`."""
    rated = df_100.loc[df_100.date.isin(group_dates), 'movie'].unique()
    idx = np.array([movie_pos[movie] for movie in rated], dtype=int)
    adjacency = np.zeros((N, N))
    adjacency[np.ix_(idx, idx)] = 1
    # A movie is never compared with itself.
    adjacency[idx, idx] = 0
    return adjacency


list_A = []
merged_dates = []
d = []
for k, date in enumerate(dates):
    if not list_A or graph.connected(list_A[-1]):
        # No group yet, or the last group is connected: start a new group
        # with the current date.
        d = [date]
        list_A.append(_comparison_graph(d))
        merged_dates.append(d)
    else:
        # The last group is not connected yet: add the current date to it.
        # `d` is the very list stored in merged_dates[-1], so appending to it
        # updates merged_dates as well.
        d.append(date)
        list_A[-1] = _comparison_graph(d)

In [None]:
# Build, for every group of merged dates k:
#   Y[k, a, b] = mean rating of movie a minus mean rating of movie b in group k
#   A[k, a, b] = 1 when movies a and b were both rated in group k
# where a, b are positions in l_movies.
N = 100
T = len(merged_dates)
Y = np.zeros((T,N,N))
A = np.zeros((T,N,N))
df_100 = df.loc[df.movie.isin(l_movies)]
# Position of each movie in l_movies, looked up once instead of via list.index.
movie_pos = {movie: pos for pos, movie in enumerate(l_movies)}
for k,d in enumerate(merged_dates):
    df_d = df_100.loc[df_100.date.isin(d)] # Dataset at time d
    l_movies_d = df_d.movie.unique() # list of rated movies at time d

    # Mean score for each movie at time d, computed once per movie rather
    # than once per pair of movies.
    ratings_d = df_d['rating'].values
    movie_col_d = df_d['movie'].values
    mean_rating = {m: np.mean(ratings_d[movie_col_d == m]) for m in l_movies_d}

    for i, j in combinations(l_movies_d, 2):
        a = movie_pos[i]
        b = movie_pos[j]

        Y[k,a,b] = mean_rating[i] - mean_rating[j]
        Y[k,b,a] = -Y[k,a,b]
        A[k,a,b] = 1 # Movies i and j were compared at time d so we add an edge on the comparison graph
        A[k,b,a] = 1

In [None]:
import pickle

# Cache the results of the extraction section on disk. The files are written
# to dir_path because that is where the analysis section loads them from; the
# previous relative 'netflix_data/' prefix only matched when the notebook was
# started from the right working directory.
cached_objects = [
    ("y_merged_transync_100.txt", Y),
    ("a_merged_transync_100.txt", A),
    ("100_movies.txt", l_movies),
    ("merged_dates_100.txt", merged_dates),
]
for cache_name, cached_obj in cached_objects:
    with open(dir_path+cache_name, "wb") as cache_file:
        pickle.dump(cached_obj, cache_file)

# Analysis - 100 movies

In [None]:
import pickle


def _load_pickle(filename):
    """Unpickle and return the object stored in dir_path+filename.

    pickle can execute arbitrary code while loading, so this must only be
    used on trusted files.
    """
    with open(dir_path+filename, "rb") as handle:
        return pickle.load(handle)


Y = _load_pickle('y_merged_transync_100.txt')
A = _load_pickle('a_merged_transync_100.txt')
movies = _load_pickle('100_movies.txt')
merged_dates = _load_pickle("merged_dates_100.txt")

In [None]:
titles = pd.read_csv(dir_path+'titles_100_movies.csv', sep=';')
# Entries of `movies` are title ids shifted by one; undo the shift to select
# the matching rows of the titles table.
selected_ids = [m-1 for m in movies]
is_selected = titles['Id'].isin(selected_ids)
df_movies = pd.DataFrame({'Movies': titles.loc[is_selected, 'Title']})
df_movies

In [None]:
# Sizes of the first two axes of Y.
T, N = Y.shape[:2]
(T, N)

In [None]:
import graph_module as graph

# Connectivity of the union graph (adjacency matrices summed over the first axis) ...
union_graph = A.sum(axis=0)
print(graph.connected(union_graph))
# ... followed by the connectivity of each individual graph.
for t in range(T):
    print(graph.connected(A[t]))

In this case, the union graph is connected and all the merged graphs are connected.
We reduced the number of time points to 23 (by merging consecutive dates) to obtain this connectivity.

In [None]:
# Fraction of zero entries in each adjacency matrix A[t].
sparsity = [
    1.0 - np.count_nonzero(A[t]) / float(A[t].size)
    for t in range(T)
]
sparsity

The observed graphs here are dense.

## Analysis

In [None]:
import loocv_module as cv
import ls_module as ls

In [None]:
E = ls.penalty_E(N,T)

# Directory of the pickled eigenvectors/eigenvalues. It can be overridden with
# the EIGENPAIRS_E_DIR environment variable; when unset, the original local
# path is used. Joining with '' guarantees the trailing separator needed by
# the string concatenation below.
eig_dir = os.path.join(
    os.environ.get(
        'EIGENPAIRS_E_DIR',
        '/Users/eglantine.karle/Docs/These/Ranking/LS approach/Code/Revision/eigenpairs_E/',
    ),
    '',
)
# The files are keyed by the problem size (N, T).
eig_suffix = "_E_N"+str(N)+"_T"+str(T)+".txt"
# pickle can execute arbitrary code while loading: only use trusted files.
with open(eig_dir+"eigenvectors"+eig_suffix, "rb") as v:
    V_E = pickle.load(v)
with open(eig_dir+"eigenvalues"+eig_suffix, "rb") as e:
    eigs_E = pickle.load(e)

In [None]:
# Cross Validation for Upsets criterion
# Both the stdlib and the NumPy global RNGs are re-seeded first, so the order
# of the statements below matters for reproducibility.
random.seed(0)
np.random.seed(0)

# Candidate values: 20 evenly spaced points on each interval.
lambda_list_up = np.linspace(0,0.2,20)
tau_list_up = np.linspace(370,410,20)

# NOTE(review): each call returns a triple; from the names it is presumably
# (selected parameter, estimate, cross-validation error) — confirm in loocv_module.
result_dls_up = cv.cv_dls_transync_up(Y,A,E,lambda_list_up,num_loocv = 40)
result_dproj_up = cv.cv_dproj_transync_up(Y,A,E,V_E,eigs_E,tau_list_up,num_loocv = 40)
lam_up_dls,z_up_dls,error_up_dls = result_dls_up
tau_up_dproj,z_up_dproj,error_up_dproj = result_dproj_up
print(lam_up_dls,tau_up_dproj)

In [None]:
# Cross Validation for MSE criterion
# Both the stdlib and the NumPy global RNGs are re-seeded first, so the order
# of the statements below matters for reproducibility.
random.seed(0)
np.random.seed(0)

# Candidate values: 20 evenly spaced points on each interval.
lambda_list_mse = np.linspace(0,500,20)
tau_list_mse = np.linspace(1e-6,1,20)

# NOTE(review): each call returns a triple; from the names it is presumably
# (selected parameter, estimate, cross-validation error) — confirm in loocv_module.
result_dls_mse = cv.cv_dls_transync_mse(Y,A,E,lambda_list_mse,num_loocv = 40)
result_dproj_mse = cv.cv_dproj_transync_mse(Y,A,E,V_E,eigs_E,tau_list_mse,num_loocv = 40)
lam_mse_dls,z_mse_dls,error_mse_dls = result_dls_mse
tau_mse_dproj,z_mse_dproj,error_mse_dproj = result_dproj_mse
print(lam_mse_dls,tau_mse_dproj)

In [None]:
# Hard-coded (lambda, tau) pair for the MSE criterion; running this cell
# replaces whatever values lam_mse_dls / tau_mse_dproj held before.
# NOTE(review): presumably the values selected by an earlier cross-validation
# run, kept here to avoid recomputing it — confirm.
lam_mse_dls = 421.05263157894734
tau_mse_dproj = 0.2631586315789473

## Analysis for parameters $\tau^*$ and $\lambda^*$

In [None]:
# First date of every group of merged dates.
first_merged_dates = [group[0] for group in merged_dates]
# Titles of the selected movies (entries of `movies` are title ids shifted by one).
selected_ids = [m-1 for m in movies]
df_movies = pd.DataFrame({'Movies': titles.loc[titles['Id'].isin(selected_ids), 'Title']})

In [None]:
# Request all three estimators (flag names as expected by get_ranks_transync).
rank_flags = dict(ls_flag=True, dls_flag=True, dproj_flag=True)

# Rankings obtained with the parameters tuned for the MSE criterion ...
l_ls_mse, l_dls_mse, l_dproj_mse = netflix.get_ranks_transync(
    lam_mse_dls, tau_mse_dproj, Y, A, movies, titles, first_merged_dates,
    **rank_flags
)
# ... and with the parameters tuned for the upsets criterion.
l_ls_up, l_dls_up, l_dproj_up = netflix.get_ranks_transync(
    lam_up_dls, tau_up_dproj, Y, A, movies, titles, first_merged_dates,
    **rank_flags
)

In [None]:
# Optimal results for the MSE criterion: unpack the per-method outputs.
# The LS output has two components; the DLS and DProj outputs have a third one.
(z_ls_mse, df_ls_mse) = l_ls_mse
(z_dls_mse, df_dls_mse, lam_mse) = l_dls_mse
(z_dproj_mse, df_dproj_mse, tau_mse) = l_dproj_mse

In [None]:
# Optimal results for the upsets criterion: unpack the per-method outputs.
# The LS output has two components; the DLS and DProj outputs have a third one.
(z_ls_up, df_ls_up) = l_ls_up
(z_dls_up, df_dls_up, lam_up) = l_dls_up
(z_dproj_up, df_dproj_up, tau_up) = l_dproj_up

In [None]:
# Estimates obtained with the upsets-tuned parameters, in the order LS, DLS, DProj.
estimates_up = (z_ls_up, z_dls_up, z_dproj_up)

# Number of upsets for upsets results
upsets_ls_up, upsets_dls_up, upsets_dproj_up = [
    netflix.get_mean_upsets_transync(Y, A, z_hat) for z_hat in estimates_up
]

# MSE for upsets results
mse_ls_up, mse_dls_up, mse_dproj_up = [
    netflix.get_mse_upsets_transync(Y, A, z_hat) for z_hat in estimates_up
]

In [None]:
# Estimates obtained with the MSE-tuned parameters, in the order LS, DLS, DProj.
estimates_mse = (z_ls_mse, z_dls_mse, z_dproj_mse)

# Number of upsets for mse results
upsets_ls_mse, upsets_dls_mse, upsets_dproj_mse = [
    netflix.get_mean_upsets_transync(Y, A, z_hat) for z_hat in estimates_mse
]

# MSE for mse results
mse_ls_mse, mse_dls_mse, mse_dproj_mse = [
    netflix.get_mse_upsets_transync(Y, A, z_hat) for z_hat in estimates_mse
]

In [None]:
# Mean number of upsets for each method (LS, DLS, DProj), upsets-tuned parameters
tuple(np.mean(values) for values in (upsets_ls_up, upsets_dls_up, upsets_dproj_up))

In [None]:
# Mean MSE for each method (LS, DLS, DProj), MSE-tuned parameters
tuple(np.mean(values) for values in (mse_ls_mse, mse_dls_mse, mse_dproj_mse))

# Another performance criterion

## Check smoothness of the data

We define the ground truth vector $z^*$ such that
$$z^*_{t,i} = \frac{1}{|N_{t,i}|} \sum_{j \in N_{t,i}} y_{ij}(t)$$
where $N_{t,i}$ denotes the set of neighbours of $i$ at time $t$ and $|N_{t,i}|$ its size (we set $z^*_{t,i} = 0$ when $i$ has no neighbour at time $t$).

In [None]:
# Define ground truth: for each time t and movie i, the average of Y[t, i, :]
# over the neighbours of i in the comparison graph A[t].
z_star = np.zeros((T, N))
for t, i in np.ndindex(T, N):
    n_neighbours = np.sum(A[t, i, :])  # number of movies compared with i at time t
    if n_neighbours != 0:
        z_star[t, i] = np.sum(Y[t, i, :]) / n_neighbours
    # otherwise the entry keeps its initial value 0

In [None]:
# `res_path` (output directory for the figure) is not defined anywhere in the
# visible notebook, which makes this cell fail with NameError on a fresh
# kernel. Keep a previously defined value; otherwise save next to the notebook.
try:
    res_path
except NameError:
    res_path = ''

# Trajectories of the empirical ground truth for the first five movies.
fig,ax = plt.subplots(figsize=(8,6))
for i in range(5):
    ax.plot(z_star[:,i],label=df_movies.iloc[i]['Movies'])
ax.set_ylabel('Ground truth $z^{*,emp}_{t,i}$')
ax.set_xlabel('Time')
ax.set_title('Evolution of the strength of movies')
ax.legend(loc='best',frameon=False)

fig.savefig(res_path+'smoothness_netflix_5films.png')
plt.show()