## Exercise 1. Visualizing Movie Subgraphs

In [7]:
%matplotlib inline

In [2]:
import json
from pandas import json_normalize
import random

import numpy as np
import pandas as pd
import networkx as nx

In [4]:
g = nx.Graph() # Build the graph

In [5]:
with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        # Load the movie from this line
        this_movie = json.loads(line)
            
        # Create a node for every actor
        for actor_id,actor_name in this_movie['actors']:
            g.add_node(actor_id, name=actor_name)
            
        # Iterate through the list of actors, generating all pairs
        #. Starting with the first actor in the list, generate pairs with all subsequent actors
        #. then continue to second actor in the list and repeat
        i = 0 # Counter in the list
        for left_actor_id,left_actor_name in this_movie['actors']:
            for right_actor_id,right_actor_name in this_movie['actors'][i+1:]:
                # Get the current weight, if it exists
                current_weight = g.get_edge_data(left_actor_id, right_actor_id, default={"weight":0})["weight"]
                
                # Add an edge for these actors
                g.add_edge(left_actor_id, right_actor_id, weight=current_weight+1)
                
            i += 1 # increment the counter

In [None]:
print("Nodes:", len(g.nodes))

In [None]:
# If you want to explore this graph in Gephi or some other
#. graph analysis tool, NetworkX makes it easy to export data.
#. Here, we use the GraphML format, which Gephi can read 
#. natively, to keep node attributes like Actor Name
nx.write_graphml(g, "actors.graphml")

In [None]:
top_k = 10 # how many of the most central nodes to print

In [None]:
# Calculate degree centrality for all nodes
centrality_degree = nx.degree_centrality(g)

# sort node-centrality dictionary by metric, and reverse to get top elements first
for u in sorted(centrality_degree, key=centrality_degree.get, reverse=True)[:top_k]:
    print(u, g.nodes[u]['name'], centrality_degree[u])

In [6]:
#nx.write_graphml(nx.ego_graph(g, "nm0000206", radius=3),"fghjhgf.graphml")
c = nx.ego_graph(g, "nm0262635", radius=3, undirected=True)

nx.write_graphml(c,"inst414_week5_exercise.graphml")

## Exercise 2(a). Finding Similar Actors based on Genre

### 1. Create a data frame, where each row corresponds to an actor, each column represents a genre, and each cell captures how many times that row’s actor has appeared in that column’s genre

In [6]:
#with open('imdb_movies_2000to2022.prolific.json', 'rb') as f:

    

with open("imdb_movies_2000to2022.prolific.json") as f:
    content = f.readlines()

    data = [eval(c) for c in content]
    data = pd.DataFrame(data)

In [7]:
data.head()

Unnamed: 0,imdb_id,title,year,runtime,genres,actors,rating
0,tt0035423,Kate & Leopold,2001,118,"[Comedy, Fantasy, Romance]","[[nm0000212, Meg Ryan], [nm0413168, Hugh Jackm...","{'avg': 6.4, 'votes': 85923}"
1,tt0088751,The Naked Monster,2005,100,"[Comedy, Horror, Sci-Fi]","[[nm0864851, Kenneth Tobey], [nm0828288, Brink...","{'avg': 5.3, 'votes': 328}"
2,tt0096056,Crime and Punishment,2002,126,[Drama],"[[nm0000417, Crispin Glover], [nm0000603, Vane...","{'avg': 5.6, 'votes': 830}"
3,tt0113092,For the Cause,2000,100,"[Action, Adventure, Drama]","[[nm0001002, Dean Cain], [nm0001299, Thomas Ia...","{'avg': 3.4, 'votes': 829}"
4,tt0116391,Gang,2000,167,"[Action, Crime, Drama]","[[nm0006763, Jackie Shroff], [nm0007113, Nana ...","{'avg': 6.2, 'votes': 257}"


In [219]:
# Create a dictionary having genres as keys and actor IDs as values (array)

actor_genre = dict()
actor_id_list = []

with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
     
    for line in in_file:
        
        this_movie = json.loads(line)
        genres = this_movie['genres']
        actor_id = [item[0] for item in this_movie['actors']]
        #print(genres)
        #print(t)
        for genre in genres:
            #actor_genre.setdefault(genre, [])
            for x in actor_id:
                
                actor_genre.setdefault(genre, []).append(x)
                if x not in actor_id_list:
                    actor_id_list.append(x)
        #break
        
#actor_genre

In [221]:
len(actor_id_list)

33609

In [222]:

df = pd.DataFrame([actor_genre], columns=actor_genre.keys(), index=actor_id_list)
df

Unnamed: 0,Comedy,Fantasy,Romance,Horror,Sci-Fi,Drama,Action,Adventure,Crime,Mystery,...,Music,Western,War,History,Musical,Documentary,Unnamed: 18,Short,News,Reality-TV
nm0000212,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm0413168,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm0000630,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm0005227,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm0864851,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm10592896,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm7216750,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."
nm0936300,"[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0000212, nm0413168, nm0000630, nm0005227, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0864851, nm0828288, nm0933983, nm0329491, n...","[nm0000417, nm0000603, nm0000457, nm0452288, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0001002, nm0001299, nm0923529, nm0936365, n...","[nm0006763, nm0007113, nm0310173, nm0412917, n...","[nm0427470, nm0001293, nm0001062, nm0732133, n...",...,"[nm0507271, nm0000569, nm0316079, nm0105672, n...","[nm0380231, nm0553436, nm0575184, nm0836071, n...","[nm0000190, nm0000200, nm0000172, nm0000954, n...","[nm0000126, nm0339304, nm0238057, nm0177020, n...","[nm0451321, nm0706787, nm0802103, nm2528716, n...","[nm0001075, nm0015358, nm0003919, nm1018163, n...","[nm0949350, nm0317570, nm1011760, nm0649330, n...","[nm0929014, nm0000799, nm0444665, nm0519725]","[nm0544611, nm0000117, nm0682495, nm0001683, n...","[nm1552873, nm3531579, nm2137078, nm0005294, n..."


In [225]:
df = pd.DataFrame([actor_genre], columns=actor_genre.keys(), index=actor_id_list)
df

df1 = df.goupby.count()
df1

AttributeError: 'DataFrame' object has no attribute 'goupby'