# Lab - Movie (Graph) Analysis

Muyuan Zhang

July 20 2023

In [None]:
import json
import re
import statistics
from collections import Counter

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import networkx as nx

## Step 1: Ingest the Data
### Task 1

In [None]:
# Read the dataset line by line: each line is expected to hold one JSON
# document. Lines that fail to parse (including blank lines) are skipped.
movie_list = []

with open('imdbClean.json', 'r', errors='ignore') as handle:
    for raw_line in handle:
        try:
            parsed = json.loads(raw_line.strip())
        except json.JSONDecodeError:
            # Not valid JSON -- ignore this line and move on.
            continue
        # Keep every successfully parsed movie object.
        movie_list.append(parsed)

### Task 2

In [None]:
# Compile the case-insensitive pattern once instead of once per movie.
usa_pattern = re.compile(r'USA', re.IGNORECASE)

# Keep only movies whose 'Country' field mentions USA; movies without a
# 'Country' key are left out.
us_movies_list = [movie for movie in movie_list
                  if 'Country' in movie and usa_pattern.search(movie['Country'])]

print("The total number of movies is", len(movie_list))
print("The number of US movies is", len(us_movies_list))

## Step 2: Kevin Bacon
### Create a graph object

In [None]:
# Small preview graph built from the first 10 US movies only:
# nodes are actors, and two actors are linked when they share a movie.
movieGraph = nx.Graph()

for film in us_movies_list[:10]:
    # 'Actors' is a comma-separated string; 'N/A' marks a missing entry.
    cast = [name for name in film['Actors'].split(', ') if name != 'N/A']

    movieGraph.add_nodes_from(cast)
    # Link every pair of distinct cast members.
    movieGraph.add_edges_from(
        (name, other) for name in cast for other in cast if name != other
    )

layout = nx.kamada_kawai_layout(movieGraph)
nx.draw(movieGraph, with_labels=True, pos=layout, font_size=7, node_size=50)
plt.title("Movie Graph")
plt.show()

### Q1: Which actor has been in the most movies?

In [None]:
# Build the full co-star graph over all US movies (later cells reuse
# movieGraph) and, in the same pass, count how many movies each actor
# is credited in.
movieGraph = nx.Graph()
movie_counts = Counter()

for cur_movie in us_movies_list:
    # 'Actors' is a comma-separated string; 'N/A' marks a missing entry.
    actors = [actor for actor in cur_movie['Actors'].split(', ') if actor != 'N/A']

    movie_counts.update(actors)
    movieGraph.add_nodes_from(actors)
    # Link every pair of distinct cast members of this movie.
    movieGraph.add_edges_from(
        (actor, costar) for actor in actors for costar in actors if actor != costar
    )

# "Most movies" must be answered from the per-actor movie count. The graph
# degree is the number of distinct costars, which is a different quantity.
max_movies = max(movie_counts.values())
actor_in_most_movies = [actor for actor, count in movie_counts.items() if count == max_movies]
print(actor_in_most_movies, "has been in the most movies.")

### Q2: How many movies was Kevin Bacon in? How many costars has he had?

In [None]:
# Tally, for every actor, the number of US movies that credit them.
actor_movies = {}

for film in us_movies_list:
    for name in film['Actors'].split(', '):
        # 'N/A' marks a missing actor entry, not a real person.
        if name == 'N/A':
            continue
        actor_movies[name] = actor_movies.get(name, 0) + 1

# Movie count comes from the tally above; the costar count is Kevin Bacon's
# degree in the co-star graph built earlier.
bacon_movie_count = actor_movies['Kevin Bacon']
bacon_costar_count = movieGraph.degree('Kevin Bacon')
print("Kevin Bacon was in", bacon_movie_count, "movies and has had",
      bacon_costar_count, "costars.")

### Q3: What is the median number of costars an actor has had?

In [None]:
# Degree of a node in the co-star graph = number of distinct costars.
degrees_list = [degree for _, degree in movieGraph.degree()]

print("The median number of costars an actor has had is", statistics.median(degrees_list))

# One bin per integer degree. The bin edges must run up to max + 1 (hence
# range(..., max + 2)): the last histogram bin is closed on both sides, so
# stopping the edges at max would merge the two largest degrees into one bar.
plt.hist(degrees_list, bins=range(min(degrees_list), max(degrees_list) + 2))
plt.xlabel("# of costars")
plt.ylabel("# of actors")
plt.title("Distribution of Actors' Costars")
plt.show()

### Q4: Is the "6 degrees of separation" from Kevin Bacon theory true? If not, provide counterexamples.

In [None]:
# Shortest-path length from Kevin Bacon to every actor that can be reached
# from him (Kevin Bacon himself is included at distance 0).
path_lengths = nx.shortest_path_length(movieGraph, source='Kevin Bacon')
path_lengths_list = list(path_lengths.values())

# One bin per integer length. The bin edges must run up to max + 1 (hence
# range(..., max + 2)): the last histogram bin is closed on both sides, so
# stopping the edges at max would merge the two longest lengths into one bar.
plt.hist(path_lengths_list, bins=range(min(path_lengths_list),
                                       max(path_lengths_list) + 2))
plt.xlabel("Length of path")
plt.ylabel("# of actors")
plt.title("Distribution of lengths of path between Kevin Bacon and everyone else")
plt.show()

paths_shorter_than_7 = sum(1 for path_length in path_lengths_list if path_length <= 6)
percentage = paths_shorter_than_7 / len(path_lengths_list)
# Scale to percent before rounding so the output has no float noise
# (e.g. 99.87 rather than 99.87000000000001).
print(round(percentage * 100, 2), "% of the paths are shorter than 7.")
if percentage > 0.99:
    print("The 6 degrees of separation from Kevin Bacon theory is very likely true.")

# Counterexamples: actors farther than 6 steps away, and actors in the graph
# that have no path to Kevin Bacon at all (these never appear in path_lengths).
far_actors = [actor for actor, length in path_lengths.items() if length > 6]
unreachable_actors = [actor for actor in movieGraph if actor not in path_lengths]
print("Actors more than 6 steps away from Kevin Bacon:", far_actors)
print(len(unreachable_actors), "actors have no path to Kevin Bacon at all, e.g.",
      unreachable_actors[:5])


## Step 3: Other Actor Analysis

In [None]:
# Subgraph induced by Kevin Bacon and his direct costars.
kevin_bacon_subgraph = nx.subgraph(movieGraph, list(movieGraph.neighbors('Kevin Bacon')) + ['Kevin Bacon'])

# Color nodes by their degree inside the subgraph. Kevin Bacon is linked to
# every other node here, so his value is capped at one above the runner-up,
# presumably to keep his degree from stretching the color scale.
node_degrees_colors_map = dict(kevin_bacon_subgraph.degree())
second_largest_value = sorted(node_degrees_colors_map.values())[-2]
node_degrees_colors_map['Kevin Bacon'] = second_largest_value + 1

# Color values listed in the subgraph's own node order, which is the order
# nx.draw uses when no explicit nodelist is given.
node_colors = [node_degrees_colors_map[node] for node in kevin_bacon_subgraph]

cmap = plt.cm.Reds
scalar_map = plt.cm.ScalarMappable(
    norm=mcolors.Normalize(vmin=min(node_colors), vmax=max(node_colors)), cmap=cmap)
scalar_map.set_array([])

plt.figure(figsize=(22, 18))
nx.draw(kevin_bacon_subgraph, with_labels=True, pos=nx.circular_layout(kevin_bacon_subgraph),
        node_color=node_colors, cmap=cmap)
# scalar_map is not attached to any Axes, so tell colorbar explicitly which
# Axes to take space from; newer Matplotlib versions no longer guess.
cbar = plt.colorbar(scalar_map, ax=plt.gca())
cbar.set_label("Node degree")
plt.title("Kevin Bacon subgraph circular layout")
plt.show()

plt.figure(figsize=(22, 18))
nx.draw(kevin_bacon_subgraph, with_labels=True, pos=nx.spring_layout(kevin_bacon_subgraph), font_size=10,
        node_color=node_colors, cmap=cmap, width=0.5)
cbar = plt.colorbar(scalar_map, ax=plt.gca())
cbar.set_label("Node degree")
plt.title("Kevin Bacon subgraph spring layout")
plt.show()

# PageRank score of every actor in the full co-star graph, as
# (actor, score) pairs ordered from highest to lowest score.
pagerank_scores = nx.pagerank(movieGraph)
pagerank_actors = sorted(pagerank_scores.items(), key=lambda pair: pair[1], reverse=True)

top_five = pagerank_actors[:5]
bottom_five = pagerank_actors[-5:]
print("5 highest scoring actors:", top_five)
print("5 lowest scoring actors:", bottom_five)

Q: What does the page rank tell you here?

A: PageRank scores each actor by the rank that flows to them from their costars: every actor's rank is shared out among their costars along the edges of the graph. As a result, actors with more costars generally have a higher PageRank, and among actors with the same number of costars, those who have collaborated with more influential (higher-ranked) actors are ranked higher.