In [2]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import networkx as nx
import utility as util

In [3]:
class Netflix_Ratings(object):
    
    def __init__(self,ratings_dir,film_title_csv):
        import pandas as pd
        import networkx as nx
        self.ratings_dir = ratings_dir
        self.film_title_csv = film_title_csv
        
        NetflixTitle = pd.read_excel(self.film_title_csv)
        NetflixTitle['Name2'] = NetflixTitle['Name2'].fillna('')
        NetflixTitle['Name3'] = NetflixTitle['Name3'].fillna('')
        NetflixTitle['Unnamed: 5'] = NetflixTitle['Unnamed: 5'].fillna('')
        NetflixTitle['Name'] = NetflixTitle.apply(self.combine, axis=1)
        self.names_df = NetflixTitle
        
        self.ID_to_name, self.name_to_ID = self.create_mappings(self.names_df)
        
    def combine(self,row):
        if row['Name2'] != '':
            row['Name'] = str(row['Name']) + ', ' + str(row['Name2'])
        if row['Name3'] != '':
            row['Name'] = row['Name'] + ', ' + str(row['Name3'])
        if row['Unnamed: 5'] != '':
            row['Unnamed: 5'] = row['Name'] + ', ' + str(row['Unnamed: 5'])
        return row['Name']
    
    def create_mappings(self,names_df):
        IDs = list(names_df['ID'])
        names = list(names_df['Name'].str.lower())
        ID_to_name = dict(zip(IDs,names))
        name_to_ID = dict(zip(names,IDs))
        return ID_to_name, name_to_ID
    
    def create_utility_matrix(self,G):
        '''
        Given a network G, this method will construct the utility matrix for the movies present in 
        the nodeset of G that are also within the ratings listed here.
        '''
        import csv
        import numpy as np
        import scipy.sparse as ss
        
        titles_in_matrix = [i for i in G.nodes()]
        ids_in_matrix = [self.name_to_ID[x] for x in titles_in_matrix]
        
        # Loop through the files for each movie, compile the ratings for each movie, and 
        # get all of the users who rated each movie. 
        ratings_dict = {}
        users = []
        for title in titles_in_matrix[:]: #REMOVE THROTTLE
            # Get the Netflix id of this movie, and the title of the review file.
            filename = f"{self.ratings_dir}mv_{self.name_to_ID[title]:07}.txt"
            # Build a nested dictionary, where the outer key is the title of the movie,
            # the inner key is the numeric identifier of the user, and the value is the
            # rating.
            ratings = {}
            with open(filename,'r') as f:
                reader = csv.reader(f)
                for i,row in enumerate(reader):
                    if i == 0:
                        continue
                    else:
                        users.append(row[0])
                        ratings[row[0]] = row[1]
            ratings_dict[title] = ratings
        users = list(set(users))
        
        # Make mappings for the movie title and user to index
        title_to_index = dict(zip(titles_in_matrix,range(len(titles_in_matrix))))
        index_to_title = dict(zip(range(len(titles_in_matrix)),titles_in_matrix))
        user_to_index = dict(zip(users,range(len(users))))
        index_to_user = dict(zip(range(len(users)),users))

        # Build the utility matrix. [j,i] where j is user and i is movie.
        um = np.full((len(users),len(titles_in_matrix)),0)
        um = ss.lil_matrix(um)
        for title in ratings_dict:
            i = title_to_index[title]
            for user in ratings_dict[title]:
                j = user_to_index[user]
                um[j,i] = ratings_dict[title][user]
        um = ss.csr_matrix(um)
        
        self.um = um
        self.index_to_title = index_to_title
        self.index_to_user = index_to_user
        self.title_to_index = title_to_index
        self.user_to_index = user_to_index

G = util.parse_nodes_edge_file('AllActorG.net')
ratings = Netflix_Ratings('training_set/','Netflix-Dataset/movie_titles_test.xls')
ratings.create_utility_matrix(G)

In [4]:
# create_utility_matrix(G)

In [26]:
class Network_Recommender(object):
    
    def __init__(self,G,ratings,test_size=0.2):
        self.G = G # graph
        self.ratings = ratings # ratings object
        self.U = self.ratings.um
        self.test_size = test_size
        
    def train_test_split(self,user_i,):
        """
        Given a user-vector at index user_i, will return with test_size percent as 0s.
        Will also return the indices of the test.
        
        Returns:
            - j,i,v for the sparse vector being tested, after replaced with testing points of 0 rating
            - list of tuples that contain the indicies (j,i) of the test elements from self.U
        """
        import scipy.sparse as ss
        import numpy as np
        
        test_size=self.test_size
                
        j,i,v = ss.find(self.U[user_i,:])
        np.random.seed(42)
        rand = np.random.uniform(size=j.size)
        test_i = []
        
        x = 0
        for jj,ii,r in zip(j,i,rand):
            if r < test_size:
                v[x] = 0
                test_i.append((jj,ii))
            x+=1
                
        #print(Uc)
        #print(test_i)
        
        return j,i,v,test_i
        

        
    def match_nodes(self,x,y):
        if x['rating'] == y['rating']:
            return True
        else:
            return False
        
    def ICA(self,user_i,):
        """
        Parameters:
            - user_i: index of the user to perform ICA with
        Returns:
            - vector of predictions for user_i
            - mae score of this vector
        """
        import networkx as nx
        import numpy as np
        
        j,i,v,test_i = self.train_test_split(user_i)
        mean_rating = np.average(v)
        G_ = self.G.copy()
        
        # Set initial ratings from the user. These include any testing points.
        for x,ii in enumerate(i):
            if v[x] != 0:
                G_.nodes[self.ratings.index_to_title[ii]]['rating'] = v[x]
                
        # Initialize the rest of the nodes:
        for n in G_.nodes():
            if 'rating' not in G_.nodes[n]:
                G_.nodes[n]['rating'] = 0

                
        # Start algorithm by initializing the nest time step and a time tracker.
        G_plus = G_.copy()
        num_zeros_ = -999
        t = 0
        still_changing = True
        
        
        while still_changing:
            print(f"timestep: {t}")
            
            # Recall that 'n' in this case is actually the title of the movie
            for n in G_.nodes():
                if G_.nodes[n]['rating'] == 0:
                # To get the edges of each node, use G_[], to get the node props, use G_.nodes[]
                    nns = G_[n]
                    # Check each neighbor for ratings, and store them.
                    nn_ratings = []
                    nn_weights = []
                    for nn in nns:
                        if G_.nodes[nn]['rating'] != 0:
                            nn_ratings.append(G_.nodes[nn]['rating'])
                            nn_weights.append(G_[n][nn]['weight'])
                    # If any ratings existed in the neighbors, update this node with the avg rating between neighbors.
                    if nn_ratings:
                        G_plus.nodes[n]['rating'] = np.average(nn_ratings,weights=nn_weights)

            # Check if the graph is still changing:
            ratings_list = list(nx.get_node_attributes(G_plus,'rating').values())
            num_zeros_plus = ratings_list.count(0.0)
            print(f"number of zeroes remaining: {num_zeros_plus}")
            if num_zeros_plus == num_zeros_:
                still_changing = False
            
            # Update pparameters for next iteration.
            t+=1
            num_zeros_ = num_zeros_plus
            G_ = G_plus.copy()
            
        # END ITERATION
        
        # Fill in all remaining zeros with the average rating from this user
        # and convert them back into a vector.
        outvec = np.zeros(self.U.shape[1])
        for n in G_.nodes():
            if G_.nodes[n]['rating'] == 0:
                G_.nodes[n]['rating'] = mean_rating
            outvec[self.ratings.title_to_index[n]] = G_.nodes[n]['rating']
            
        # Finally, compute MAE if there are any test_i
        if test_i:
            err = []
            for indices in test_i:
                err.append(self.U[indices[0],indices[1]] - outvec[indices[1]])
                #print(self.U[indices[0],indices[1]],outvec[indices[1]],err)
            mae = np.sum(np.abs(np.array(err)))/len(test_i)
                    
            
        return outvec, mae
        
            
        
        
    


In [27]:
ica = Network_Recommender(G,ratings)
o,m = ica.ICA(0)

timestep: 0
number of zeroes remaining: 1980
timestep: 1
number of zeroes remaining: 163
timestep: 2
number of zeroes remaining: 37
timestep: 3
number of zeroes remaining: 32
timestep: 4
number of zeroes remaining: 32


In [28]:
m

1.9629629629629628

In [None]:
ratings.index_to_title