In [1]:
import networkx as nx
from csv import reader
import pandas as pd
import random
from random import sample
import numpy as np


keywords_path = "Downloads/keywords.csv/keywords.csv"
movies_metadata_path = "Downloads/movies_metadata.csv/movies_metadata.csv"
credits_path = "Downloads/credits.csv/credits.csv"

movie_list = [] 

def readGenres(movies_metadata_path):
    metadata = pd.read_csv(movies_metadata_path, low_memory=False)
    genres = {}
    all_genres = []
    for idx in range(len(metadata)):
        g = []
        for i in metadata.iloc[idx]["genres"].split("'name': '")[1:]:
            g.append(i.split("'")[0])
            all_genres.append(i.split("'")[0])
        genres[metadata.iloc[idx]["id"]] = g
        
    return genres, all_genres

    
def readMovies(movies_metadata_path, num_movies): 

    count = 0

    movieGraph = nx.DiGraph()

    nodeDict = {}

    firstRow = True


    with open(movies_metadata_path, 'r', encoding="utf8") as read_obj:
        csv_reader = reader(read_obj)
        for row in csv_reader:
            if firstRow != True:
                nodeInfo = {}
                movie_title, movie_id, genre = row[20], row[5], row[3] 
                movieGraph.add_node(movie_id)
                movie_list.append((movie_id, movie_title))
                nodeInfo["node_type"] = "movie"
                nodeInfo["movie_title"] = movie_title 
                nodeDict[movie_id] = nodeInfo
                nx.set_node_attributes(movieGraph, nodeDict) 

                count += 1
                if count == num_movies:
                    break 

            else:
                firstRow = False 
                
                
    return movieGraph
            




In [2]:
def addGenreEdges(movieGraph, genres, all_genres, num_genres): 
    
    nodeDict = {}
    for genre in all_genres:
        movieGraph.add_node(genre)
        nodeInfo = {}
        nodeInfo["node_type"] = "genre"
        nodeDict[genre] = nodeInfo
        
    nx.set_node_attributes(movieGraph, nodeDict)

        

    count = 0

    for key in genres.keys():
        count +=1
        movieNode = key 
        genres_list = genres[key] 
        for genre in genres_list:
            movieGraph.add_edge(movieNode, genre)

        if count == num_genres:
            break
    

##pos = nx.circular_layout(movieGraph)
##nx.draw(movieGraph, with_labels=True, font_weight='bold', pos=pos)

In [3]:

def readKeywords(keywords_path, movieGraph):

    keywords_dict = {}
    all_keywords = []
    keywords = pd.read_csv(keywords_path, low_memory=False)

    for idx in range(len(keywords)):
        k = []
        for i in keywords.iloc[idx]["keywords"].split("'name': '")[1:]:
            k.append(i.split("'")[0])
            all_keywords.append(i.split("'")[0])
        keywords_dict[keywords.iloc[idx]["id"]] = k
        
    return keywords_dict, all_keywords 


def addKeywordsEdges(movieGraph, keywords_dict, all_keywords, num_keywords):
    
    
    nodeDict = {}

    for keyword in all_keywords:
        movieGraph.add_node(keyword)
        nodeInfo = {}
        nodeInfo["node_type"] = "keyword"
        nodeDict[keyword] = nodeInfo
        
    nx.set_node_attributes(movieGraph, nodeDict)

    count = 0

    for key in keywords_dict.keys():
        count +=1
        movieNode = str(key)
        keywords_list = keywords_dict[key]
        for keyword in keywords_list:
            movieGraph.add_edge(movieNode, keyword)

        if count == num_keywords:
            break



In [4]:

def readDirectors(credits_path, movieGraph):

    credits = pd.read_csv(credits_path, low_memory=False)
    director_dict = {}
    all_directors = []
    for idx in range(len(credits)):
        d = []
        for i in credits.iloc[idx]["crew"].split("\'job\': \'Director\', \'name\': \'")[1:]:
            d.append(i.split("'")[0])
            all_directors.append(i.split("'")[0])
            director_dict[credits.iloc[idx]["id"]] = d
            
            
    return director_dict, all_directors
            
        
        
def addDirectorEdges(movieGraph, director_dict, all_directors, num_directors):
    count = 0
    
    nodeDict = {}


    for director in all_directors:
        movieGraph.add_node(director) 
        nodeInfo = {}
        nodeInfo["node_type"] = "keyword"
        nodeDict[director] = nodeInfo
        
    nx.set_node_attributes(movieGraph, nodeDict)


    for key in director_dict.keys():
        count +=1
        movieNode = str(key)

        for director in director_dict[key]:
             movieGraph.add_edge(movieNode, director)

        if count == num_directors:
            break
        


In [5]:
movieGraph = readMovies(movies_metadata_path, 1500)

genre_dict, all_genres = readGenres(movies_metadata_path) 

addGenreEdges(movieGraph, genre_dict, all_genres, 100)

keywords_dict, all_keywords = readKeywords(keywords_path, movieGraph)

addKeywordsEdges(movieGraph, keywords_dict, all_keywords, 100)

director_dict, all_directors = readDirectors(credits_path, movieGraph)

addDirectorEdges(movieGraph, director_dict, all_directors, 100) 

movieList = list(movieGraph.nodes) 

print (len(movieList))







41004


In [6]:
def get_user_liked():
    user_choice_list = []
    # get random list from database to present to user
    random_list = random.sample(movie_list, 30)
    print("Movie List: ", list(random_list))
   
    # ask user to choose three
    print("Choose three movies you like the most from the list above: ")
    input1 = input("Choice #1: ")
    input2 = input("Choice #2: ")
    input3 = input("Choice #3: ")
   
    # take user input and add to list
    user_choice_list.append(input1)
    user_choice_list.append(input2)
    user_choice_list.append(input3)
    print("list: ", user_choice_list)
    
    sample_list = [] 
    
    for movie in random_list:
        if movie[1] in user_choice_list:
            sample_list.append(movie) 
            
   
    # get random movie to start with
    start_movie = sample(sample_list, 1)
    
    start_movie_id = start_movie[0]
    
    print (start_movie)
    
    ##print("start movie: ", start_movie[1]) 
   
    return start_movie_id



In [7]:

# personalized PageRank algorithm
def personalized_PageRank(movieGraph, start_movie, run_len):
    nodes_visited = {}
    # start traveler on movie generated above
    current_movie = start_movie
    for i in range(run_len):
        # generate random percentage
        percentage = np.random.random()
       
        # random walk from start movie
        if percentage > .15:
            out_edges = list(movieGraph.out_edges())
            # send traveler to random node
            if len(out_edges) > 0:
                # choose randomly and send traveler to that node
                rand_idx = np.random.randint(len(out_edges))
                current_movie = out_edges[rand_idx][1]
               
                # if movie has been visited
                if current_movie in nodes_visited.keys():
                    nodes_visited[current_movie] += 1
               
                # if movie has NOT been visited
                else:
                    nodes_visited[current_movie] = 1
                    
        else:
             # if movie has been visited
                if start_movie in nodes_visited.keys():
                    nodes_visited[start_movie] += 1
               
                # if movie has NOT been visited
                else:
                    nodes_visited[start_movie] = 1
                    

            
            
     
    # calculating probability for each node being visited
    for node in movieGraph.nodes():
        # if node is in dictionary
        if node in nodes_visited.keys():
            nodes_visited[node] = nodes_visited[node] / run_len
        # if node is NOT in dictionary
        else:
            nodes_visited[node] = 0
            
    return nodes_visited 

    

In [8]:
start_movie = get_user_liked()

print(start_movie)

movies_to_sort = [] 

nodes_visited = personalized_PageRank(movieGraph, start_movie, 1000)

for node in nodes_visited:
    
    if movieGraph.nodes[node]['node_type'] == 'movie':
        movie_title = movieGraph.nodes[node]['movie_title']  
        movies_to_sort.append((node, movie_title)) 
        
        
print(nodes_visited)

movies_to_sort = sorted(movies_to_sort, key=lambda movie: nodes_visited[movie[0]]) 



##print(movies_to_sort[0:10])






Movie List:  [('15139', 'The Pagemaster'), ('9529', 'Candyman'), ('1892', 'Return of the Jedi'), ('26141', 'Fatal Instinct'), ('32646', 'Safe'), ('659', 'The Tin Drum'), ('429', 'The Good, the Bad and the Ugly'), ('1413', 'M. Butterfly'), ('32622', 'Angela'), ('9405', 'Double Team'), ('315', 'Faster, Pussycat! Kill! Kill!'), ('1092', 'The Third Man'), ('17015', 'Persuasion'), ('9879', 'Striptease'), ('522', 'Ed Wood'), ('76996', 'Love and Other Catastrophes'), ('680', 'Pulp Fiction'), ('38251', 'Mediterraneo'), ('61563', 'Touch'), ('630', 'The Wizard of Oz'), ('18', 'The Fifth Element'), ('31911', 'Mr. Wonderful'), ('16633', 'Somewhere in Time'), ('10525', 'Forget Paris'), ('25796', 'That Old Feeling'), ('9268', 'Eraser'), ('11780', 'Rob Roy'), ('173', '20,000 Leagues Under the Sea'), ('9716', 'Everyone Says I Love You'), ('116690', 'The Ballad of Narayama')]
Choose three movies you like the most from the list above: 


Choice #1:  Ed Wood
Choice #2:  Ed Wood
Choice #3:  Ed Wood


list:  ['Ed Wood', 'Ed Wood', 'Ed Wood']
[('522', 'Ed Wood')]
('522', 'Ed Wood')


KeyError: ('522', 'Ed Wood')