In [None]:
%load_ext lab_black

In [None]:
import ast
import itertools
from collections import Counter
from operator import itemgetter

import networkx as nx
import numpy as np
import pandas as pd

# display up to 100 columns when a DataFrame is rendered
pd.set_option("display.max_columns", 100)

In [None]:
# cast/crew records per movie (columns used below: id, cast, crew)
credits = pd.read_csv("../data/credits.csv")
# per-movie metadata (columns used below: id, revenue, release_date)
meta = pd.read_csv("../data/movies_metadata.csv")

### select subset of movies

To restrict the size of the resulting graph, we only keep movies released in 2000 or later that generated more than 5 million in revenue.

In [None]:
# parse release_date into datetimes; errors="coerce" turns unparseable values into NaT
meta.release_date = pd.to_datetime(meta.release_date, errors="coerce")

In [None]:
# keep movies released in 2000 or later with more than 5 million in revenue
subset = meta.loc[
    (meta.release_date.dt.year >= 2000) & (meta.revenue > 5_000_000)
].copy()
# ids are cast to int so they can be matched against credits.id in the merge below
subset["id"] = subset["id"].astype(int)

In [None]:
# number of selected movies per release year, in chronological order
subset.release_date.dt.year.value_counts().sort_index()

In [None]:
# number of rows in the selection
len(subset)

In [None]:
# Keep only the credits of the selected movies.
# Either table can list the same movie id more than once; an inner merge would
# then repeat those movies (and repeat them again on every re-run of this cell),
# which later inflates the edge weights of the cooperation graph.
# Each id is therefore reduced to its first row before merging.
credits = credits.drop_duplicates(subset="id").merge(
    subset[["id"]].drop_duplicates(), on="id", how="inner", validate="one_to_one"
)

### general information about credits

In [None]:
# preview the first rows of the filtered credits
credits.head()

In [None]:
# rows whose id already occurred in an earlier row
print("number of duplicates: ", credits.id.duplicated().sum())
# number of different movie ids
print("number of distinct movies: ", credits.id.nunique())

In [None]:
# each entry of `cast` describes a list of dictionaries
# (still stored as a string here; it is parsed in the preprocessing step below)
# each dictionary contains information about one cast member
credits.cast[0]

### preprocess

In [None]:
# convert the stringified lists of dictionaries into real Python objects;
# values that are already parsed are left untouched, so re-running this cell
# does not fail on the second pass
credits.cast = credits.cast.apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
credits.crew = credits.crew.apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# cdf ("cast and crew" DataFrame): for every movie id, the names of the people
# who worked on that movie
cdf = credits[["id"]].copy()

# pull the "name" field out of every cast / crew dictionary
cdf["cast_list"] = credits.cast.map(
    lambda members: [member["name"] for member in members]
)
cdf["crew_list"] = credits.crew.map(
    lambda members: [member["name"] for member in members]
)
# everybody involved in the movie: cast names first, then crew names
cdf["people"] = cdf.cast_list + cdf.crew_list

In [None]:
# same entry as inspected before preprocessing, now parsed into Python objects
credits.cast[0]

### create cooperation graph

People are identified by their name only, i.e. we assume that no two distinct people share the same name.

In [None]:
# explode() puts every name on its own row, nunique() then counts the different names
print(f"number of distinct cast people: {cdf.cast_list.explode().nunique()}")
print(f"number of distinct crew people: {cdf.crew_list.explode().nunique()}")
print(f"number of distinct people: {cdf.people.explode().nunique()}")

In [None]:
# the 20 names that occur most often across all cast lists
cdf.cast_list.explode().value_counts()[:20]

In [None]:
def get_graph(_s, map_names=False):
    """Build an undirected, weighted cooperation graph from lists of people.

    Parameters
    ----------
    _s : pd.Series of lists
        Each element lists the people who have worked together (e.g. on one movie).
        The input is not modified.
    map_names : bool, default False
        If False, the names themselves are the graph nodes.
        If True, every name is replaced by an integer id (its position in order of
        first appearance); the ids are the nodes and the original name is stored
        in the node attribute ``"name"``.

    Returns
    -------
    nx.Graph
        One node per distinct person and one edge per pair of people that share
        at least one list. The edge attribute ``"weight"`` is the number of lists
        in which both people appear.
    """
    # A person can be listed more than once in the same list (e.g. as cast and as
    # crew of one movie). De-duplicate per list, keeping first-seen order, so that
    # nobody gets a self-loop and no pair is counted twice for the same list.
    teams = [list(dict.fromkeys(team)) for team in _s]

    # distinct people in order of first appearance (empty lists contribute nothing)
    names = list(dict.fromkeys(itertools.chain.from_iterable(teams)))

    if map_names:
        # map each name to an index and replace the names in every list
        name_to_id = {name: idx for idx, name in enumerate(names)}
        teams = [[name_to_id[name] for name in team] for team in teams]

    # every unordered pair within a list is one cooperation; sorting the pair makes
    # (A, B) and (B, A) the same key, and the count over all lists is the weight
    edge_weights = Counter(
        tuple(sorted(pair))
        for team in teams
        for pair in itertools.combinations(team, 2)
    )

    # `directed=False` is only stored as a graph attribute; it is kept so that
    # G.graph stays the same as before
    G = nx.Graph(directed=False)
    if map_names:
        # nodes must use the same ids as the edges; keep the name as an attribute
        G.add_nodes_from((idx, {"name": name}) for idx, name in enumerate(names))
    else:
        G.add_nodes_from(names)
    G.add_weighted_edges_from(
        (first, second, weight) for (first, second), weight in edge_weights.items()
    )

    return G

In [None]:
# input for the graph: everyone who worked on a movie (cast and crew)
# for a cast-only graph use `cdf.cast_list.copy()` instead
s = cdf.people.copy()

In [None]:
%%time
# build the cooperation graph from the selected lists of people
G = get_graph(s)

### General graph information 

In [None]:
# sanity check: the graph is created as nx.Graph, so this is expected to show False
G.is_directed()

In [None]:
# remember every node's degree as the node attribute "degree"
degree_dict = {node: degree for node, degree in G.degree()}
nx.set_node_attributes(G, values=degree_dict, name="degree")

#### summary statistics

In [None]:
print("number of nodes:", G.number_of_nodes())
print("number of edges:", G.number_of_edges())
# every edge adds one to the degree of each of its two end points,
# hence the factor 2 in the average degree
cc = 2 * G.number_of_edges() / G.number_of_nodes()
print("average node degree:", cc)
print("density of network:", nx.density(G))

In [None]:
%%time
# average clustering coefficient of the graph (no edge weights are passed here)
# timings observed on earlier runs:
# 12 seconds for 320k edges and 12k nodes (windows core i5)
# 1min 21s for 1600k edges and 40k nodes (windows core i5)
# 35 sec for 1680k edges and 42k nodes (macbook pro)
# 10min for 8726k edges and 85k nodes (macbook pro)
nx.average_clustering(G)

In [None]:
# %%time
# # 2min for 320k edges and 12k nodes
# nx.average_clustering(G, weight = 'weight')

#### look at individual star

In [None]:
# person to look at (has to be a node of G); "Jackie Chan" is another example
star = "Scarlett Johansson"

In [None]:
# attributes stored on the star's node (e.g. the "degree" attribute set above)
print(f"{star} : {G.nodes[star]}")

In [None]:
# everyone who is directly connected to the star
colab_nodes = list(G.neighbors(star))
# the star's edges including their attribute dictionaries (data=True)
colab_edges = list(G.edges(star, data=True))

In [None]:
# first ten of the star's edges
colab_edges[:10]

### find most influential people

In [None]:
# (node, degree) pairs ordered from the highest degree to the lowest
# (`itemgetter` is imported in the import cell at the top of the notebook)
sorted_degree = sorted(degree_dict.items(), key=itemgetter(1), reverse=True)

In [None]:
# sorted_degree is in descending order, so the first 20 entries have the highest degree
print("Top 20 nodes by degree:")
for d in sorted_degree[:20]:
    print(d)

### pagerank

In [None]:
%%time
# PageRank that takes the edge attribute "weight" into account
# timings observed on earlier runs:
# 50.6 seconds for 1600k edges and 40k nodes
# 17 sec for 85k nodes, 8726k edges
pr = nx.pagerank(G, weight = 'weight')

In [None]:
# the 20 people with the highest weighted PageRank
sorted(pr.items(), key=lambda item: item[1], reverse=True)[:20]

In [None]:
# the 10 people with the lowest weighted PageRank
sorted(pr.items(), key=lambda item: item[1], reverse=True)[-10:]

In [None]:
# PageRank without edge weights (weight=None)
pr_unweighted = nx.pagerank(G, weight=None)

In [None]:
# the 20 people with the highest unweighted PageRank
sorted(pr_unweighted.items(), key=lambda item: item[1], reverse=True)[:20]

### get jobs of each person

In [None]:
# NOTE(review): job_dict is not referenced anywhere else in the visible notebook
# (the cells below build `jobs` instead) - confirm whether it can be removed
job_dict = {}

In [None]:
def extract_crew(s):
    """Collect the distinct (name, job) pairs from a Series of crew lists.

    Parameters
    ----------
    s : pd.Series
        Each element is a list of dictionaries that contain at least the keys
        "name" and "job".

    Returns
    -------
    pd.DataFrame
        Columns "name" and "job" (both values converted with ``str``), one row per
        distinct pair in order of first appearance. If there are no pairs at all,
        an empty frame with both columns is returned.
    """
    # Tuples are hashable, so a dict can de-duplicate the pairs in a single pass
    # while keeping the order in which they were first seen.
    pairs = dict.fromkeys(
        (str(member["name"]), str(member["job"]))
        for members in s
        for member in members
    )
    # Passing the column names to the constructor also works without any rows.
    return pd.DataFrame(list(pairs), columns=["name", "job"])

In [None]:
%%time
# 2min
# distinct (name, job) pairs of the crew
crew_jobs = extract_crew(credits.crew)
# every distinct cast member gets the generic job label "cast"
cast_jobs = cdf.cast_list.explode().drop_duplicates().to_frame()
cast_jobs.columns = ["name"]
cast_jobs["job"] = "cast"

In [None]:
# name -> list of all jobs recorded for that person (crew jobs plus "cast")
jobs = (
    pd.concat([crew_jobs, cast_jobs])
    .groupby("name")["job"]
    .apply(list)
    .to_dict()
)

### pagerank with additional output

In [None]:
# the 20 (name, score) pairs with the highest weighted PageRank
a = sorted(pr.items(), key=lambda item: item[1], reverse=True)[:20]

In [None]:
# t is a (name, score) pair; print it together with the jobs recorded for that name
for t in a:
    print(t, jobs[t[0]])

### community detection

In [None]:
%%time
# communities found by greedy modularity maximization, using the edge attribute "weight"
c = nx.algorithms.community.greedy_modularity_communities(G, weight = "weight")