# Imports

    

In [64]:
import pandas as pd
import numpy as np
import csv
import torch
from torch.utils.data import Dataset, DataLoader
import os
import itertools

# Convert Netflix CSV Datafile into Dictionary

In [66]:
def Csv2Dictionary(csv_file):
    """
    Collects all cast and director data from Netflix movies in a Python dictionary 
 
    Inputs
    ----------
    csv_file : *.csv file
 
    Returns
    -----
    Dictionary with each person (actor/director) listed as the *key* 
    and everyone (all other actors and directors) they have worked with as the *values*
    """    
    ### read in csv file
    df = pd.read_csv(csv_file)

    ### select only movies and tv shows originating from the USA
    ### (this is an attempt to narrow the dataset to select only from actors connected to Kevin Bacon)
    USA_df = df[df['country'].str.match('United States', na=False)]

    ### separate the actors listed in cast 
    ### Note: this doesn't work very well and makes a lot of mistakes
    cast = USA_df["cast"]
    actors = cast.str.split(", ")

    ### Combine actors and directors
    directors = USA_df["director"]

    actors_dir = actors
    for ind in actors.index:
        if isinstance(actors[ind], list):
            actors_dir[ind].append(str(directors[ind]))
        else:
            actors_dir[ind] = [str(actors[ind])]
            actors_dir[ind].append(str(directors[ind]))

    actor_list = [ [k for k in z if not 'nan' in k] for z in actors_dir]

    all_actors = []

    i=0
    for movie in actor_list:
        for cast_member in movie:
            all_actors.append(cast_member)
            i =i+1

    ### determine the unique set
    unique_actors = np.unique(all_actors)

    ### Create dictionary
    d ={}
    ### each actor/director is the key of the dictionary and values are all of the actors/directors they've worked with
    for actor in unique_actors:
        d[actor] = []
    ### get the values, i.e. the coworkers for each person
    for actor in unique_actors:
        l = []
        res = []
        for movie in actor_list:
            if actor in movie: 
                    l.append([a for a in movie if actor not in a])
            flat_list = list(itertools.chain(*l))
            [res.append(x) for x in flat_list if x not in res]

        for act_i in res:
            d[actor].append(act_i)
    return d

# Breadth First Search for a Graph

References :  https://www.geeksforgeeks.org/building-an-undirected-graph-and-finding-shortest-path-using-dictionaries-in-python/

In [70]:
def BFS_SP(actor_dict, start, goal):
    
    """
    Author: murtuza_chawala, 2021
    Based on Breadth First Search, or BFS, for a Graph.
    https://auth.geeksforgeeks.org/user/murtuza_chawala/articles
    Reference: https://www.geeksforgeeks.org/building-an-undirected-graph-and-finding-shortest-path-using-dictionaries-in-python/
    """
    explored = []
     
    # Queue for traversing the graph in the BFS
    queue = [[start]]
     
    # If the desired node is reached
    if start == goal:
        return int(0) ### same actor, some distance = 0
     
    # Loop to traverse the graph with the help of the queue
    while queue:
        path = queue.pop(0) 
        actor = path[-1]
         
        # Condition to check if the current node is not visited
        try: 
            if actor not in explored:
                costars = actor_dict[actor] ## find all of the costars for that actor

                # Loop to iterate over the neighbors of the node
                for costar in costars:
                    new_path = list(path)
                    new_path.append(costar)
                    queue.append(new_path)

                    # Condition to check if the neighbor node is the goal
                    if costar == goal:
                        #print("Shortest path = ", *new_path)
                        return len(new_path)-1
                explored.append(actor)
        except KeyError as actor:
            print('Key Not Found in Dictionary:', actor)
            
    # Condition when the nodes are not connected
    return float('NaN')

# Find isolated nodes to remove from the graph

Reference: https://www.python-course.eu/graphs_python.php

In [6]:


def find_isolated_nodes(graph):
    """ returns a set of isolated nodes. 
        Reference: https://www.python-course.eu/graphs_python.php
    """
    isolated = set()
    for node in graph:
        #print(node, graph[node])
        if not graph[node]:
            isolated.add(node)
    return isolated


# Group a list into batches

References: https://docs.python.org/3/library/itertools.html#itertools-recipes

In [7]:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """
    Reference: https://docs.python.org/3/library/itertools.html#itertools-recipes
    """
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)



# Generate Symmetric 'Distances' Matrices of Actors and Director 

In [62]:
def generate_batch_iterator(input_dict, batch_size=12):
    """
    Generates an iterator over person-to-person distance matrices computed from the Netflix dataset
 
    Parameters
    ----------
    input_dict : dictionary of actors [keys] and co-stars [values]
    batch_size : int
 
    Yields
    -----
    iterable of training batches, where:
        each 'batch' is a distance matrix of size 'batch_size' squared, where:
          each row/column of the matrix is associated with a person
          The "distance" between any two persons is the shortest number of hops between them
 
    In each batch, the distance matrix is represented by a numpy array.  The batch is accompanied by
    a list of strings (the persons for whom the matrix was computed.  The diagonal should be all
    zeros because each person's distance from themself is zero.
    """
    import random    
    
    ### find isolated nodes to remove from graph
    iso_actors = find_isolated_nodes(input_dict)
    isolist = list(iso_actors)
    fulllist = list(input_dict.keys())
    connected_actors = [ elem for elem in fulllist if elem not in isolist] ### only include connected actors
    
    ### Now break up the data into shuffled batches
    list_of_actors = list(grouper(connected_actors, batch_size))
    random.shuffle(list_of_actors)

    ### For each batch, generate the distance matrix (symmetric matrix with zeros on the diagonal)
    for some_rando_actors in list_of_actors:    
        import time
        tic = time.perf_counter()
        dist_matrix = np.asarray([ [BFS_SP(input_dict, actor_i, actor_j) for j, actor_j 
                                    in enumerate(some_rando_actors)] for i, actor_i in enumerate(some_rando_actors) ])
        toc = time.perf_counter()
        distance_df = pd.DataFrame(dist_matrix, index =some_rando_actors , columns = some_rando_actors)
        data_array = distance_df.to_numpy()
        data_labels = list(distance_df.columns)
        yield distance_df 

# Create Dictionary

This part is slow, approx. 9 minutes

In [None]:
import time
tic = time.perf_counter()
dictionary = Csv2Dictionary("netflix_titles.csv")
toc = time.perf_counter()
print(f"Created distance matrix in {(toc - tic)/60} minutes")

# Choose number of epochs and iterate through the batches

In [54]:
total_epochs = 2

In [72]:
from tabulate import tabulate

for epoch in range(total_epochs):
    iterator = generate_batch_iterator(dictionary, batch_size=6)
    for ind, batch in enumerate(iterator):
        print("Batch #", ind, "##########################")
        print( tabulate(batch, headers='keys', tablefmt='psql') ) 

Batch # 0 ##########################
+-----------------+-------------------+----------------+--------------+--------------+-----------------+-------------+
|                 |   Cortez Chappell |   Cortney Palm |   Cory Doran |   Cory DuVal |   Cory Hardrict |   Cory Hart |
|-----------------+-------------------+----------------+--------------+--------------+-----------------+-------------|
| Cortez Chappell |                 0 |            nan |          nan |          nan |             nan |         nan |
| Cortney Palm    |               nan |              0 |            6 |            5 |               4 |           5 |
| Cory Doran      |               nan |              6 |            0 |            7 |               6 |           7 |
| Cory DuVal      |               nan |              5 |            7 |            0 |               4 |           5 |
| Cory Hardrict   |               nan |              4 |            6 |            4 |               0 |           4 |
| Cory Hart

Batch # 6 ##########################
+-----------------+----------------+-----------------+-------------------+-----------------+-----------------+-------------------+
|                 |   Adrian Hough |   Adrian Lester |   Adrian Martinez |   Adrian Pasdar |   Adrian Petriw |   Adriana Barraza |
|-----------------+----------------+-----------------+-------------------+-----------------+-----------------+-------------------|
| Adrian Hough    |              0 |               4 |                 4 |               4 |               4 |                 5 |
| Adrian Lester   |              4 |               0 |                 4 |               4 |               4 |                 4 |
| Adrian Martinez |              4 |               4 |                 0 |               4 |               4 |                 4 |
| Adrian Pasdar   |              4 |               4 |                 4 |               0 |               3 |                 4 |
| Adrian Petriw   |              4 |          

KeyboardInterrupt: 

In [68]:

dist_mat = generate_batch_iterator(dictionary, batch_size=12)



In [69]:
next(dist_mat)

Unnamed: 0,Genneya Walton,Geno Segers,Geoff Bell,Geoff Dolan,Geoff Hughes,Geoff Lazer Ramsey,Geoff Pierson,Geoff Stults,Geoffrey Arend,Geoffrey Jones,Geoffrey McGivern,"Geoffrey Orthwein, Andrew Sullivan"
Genneya Walton,0.0,4.0,3.0,6.0,4.0,,3.0,3.0,4.0,4.0,5.0,5.0
Geno Segers,4.0,0.0,5.0,6.0,5.0,,3.0,4.0,5.0,4.0,5.0,5.0
Geoff Bell,3.0,5.0,0.0,6.0,5.0,,4.0,4.0,5.0,4.0,6.0,5.0
Geoff Dolan,6.0,6.0,6.0,0.0,7.0,,5.0,5.0,6.0,5.0,6.0,6.0
Geoff Hughes,4.0,5.0,5.0,7.0,0.0,,5.0,4.0,5.0,5.0,7.0,6.0
Geoff Lazer Ramsey,,,,,,0.0,,,,,,
Geoff Pierson,3.0,3.0,4.0,5.0,5.0,,0.0,3.0,4.0,2.0,4.0,4.0
Geoff Stults,3.0,4.0,4.0,5.0,4.0,,3.0,0.0,4.0,4.0,4.0,4.0
Geoffrey Arend,4.0,5.0,5.0,6.0,5.0,,4.0,4.0,0.0,5.0,6.0,5.0
Geoffrey Jones,4.0,4.0,4.0,5.0,5.0,,2.0,4.0,5.0,0.0,5.0,4.0
