## Exercise 1. Clustering Actors in the Actor-Genre Matrix

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import json

In [4]:
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        
        this_movie = json.loads(line)
                    
        
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name
            
        
        for actor_id,actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            
            actor_genre_map[actor_id] = this_actors_genres
            
        
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })
 

In [5]:
# Check the output for actor Hugh Jackman, actor ID nm0413168
actor_genre_map['nm0413168']

{'Comedy': 7,
 'Fantasy': 3,
 'Romance': 5,
 'Action': 14,
 'Adventure': 11,
 'Sci-Fi': 10,
 'Crime': 6,
 'Thriller': 2,
 'Animation': 4,
 'Drama': 12,
 'Mystery': 5,
 'Biography': 4,
 'Musical': 2,
 'History': 1}

In [6]:
# Get all actors as an index for a dataframe
index = actor_genre_map.keys()

# Get the genre-counts for each actor in the index
rows = [actor_genre_map[k] for k in index]

# Create the data frame from these rows, with the actors as index
df = pd.DataFrame(rows, index=index)

# Fill NAs with zero, as NA means the actor has not starred in that genre
df = df.fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Apply k-Means with a Fixed K

In [7]:
from sklearn.cluster import KMeans

In [8]:
k = 8

In [9]:
cluster_model = KMeans(n_clusters=k)

In [10]:
cluster_model.fit(df)

KMeans()

In [11]:
cluster_labels = cluster_model.predict(df)
actor_cluster_df = pd.DataFrame(cluster_labels, index=df.index, columns=["cluster"])

In [12]:
actor_cluster_df["cluster"].value_counts()

2    29034
0     2774
5      953
3      295
1      290
7      132
6      123
4        8
Name: cluster, dtype: int64

In [13]:
for cluster,actors in actor_cluster_df.groupby("cluster"):
    print("Cluster:", cluster, "Size:", actors.shape[0])
    
    for a_id in actors.sample(5).index:
        print("\t", a_id, actor_name_map[a_id])

Cluster: 0 Size: 2774
	 nm1232226 Alexis Dziena
	 nm1426805 Salman Shahid
	 nm0989182 Tamsin Egerton
	 nm7567974 Finn Little
	 nm0936298 Robert Wisdom
Cluster: 1 Size: 290
	 nm0662504 Molly Parker
	 nm0000671 Billy Bob Thornton
	 nm0000438 Ed Harris
	 nm0000286 Stephen Baldwin
	 nm0451600 Anupam Kher
Cluster: 2 Size: 29034
	 nm0000296 Robert Beltran
	 nm4651633 Phil Dunster
	 nm0008417 Kian Abedini
	 nm0485894 Robert Lang
	 nm9502531 Allen Lorenzo
Cluster: 3 Size: 295
	 nm0771414 Martin Starr
	 nm0004395 Adam Scott
	 nm0000702 Reese Witherspoon
	 nm0005227 Breckin Meyer
	 nm0000502 Christopher Lloyd
Cluster: 4 Size: 8
	 nm2278431 Joe Hammerstone
	 nm0001744 Tom Sizemore
	 nm0000616 Eric Roberts
	 nm0222881 Tony Devon
	 nm0001803 Danny Trejo
Cluster: 5 Size: 953
	 nm1551922 Columbus Short
	 nm6171194 Cody Renee Cameron
	 nm0526019 Diego Luna
	 nm0265670 Frankie Faison
	 nm2489045 Gregory Blair
Cluster: 6 Size: 123
	 nm0186225 Barbara Crampton
	 nm3046228 Billy Blair
	 nm0499614 Ari Lehm

## Exercise 2. Choosing k in k-means via the Elbow Method

In [18]:
for k in range(2,32):
    labels = KMeans(n_clusters=k).fit_predict(df)
    score = silhouette_score(df, labels)
    plot(k, score)

NameError: name 'silhouette_score' is not defined