## Exercise 1. Finding the most likely cluster given a particular genre (30 minutes)

In [8]:
import json

import pandas as pd
import numpy as np

In [9]:
# Lookup tables filled from the movie file below.
actor_name_map = {}   # actor id -> actor name
movie_actor_map = {}  # imdb id -> {"movie": title, "actors": set of actor ids, "genres": genre list}
actor_genre_map = {}  # actor id -> {genre: number of that actor's movies carrying the genre}


# The file is read as JSON Lines: one movie object per line.
# The encoding is pinned so the result does not depend on the platform's default
# text encoding (assumes the file is UTF-8, the standard encoding for JSON).
with open("imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # A blank line (for example a trailing newline at the end of the file) is not
        # a movie record and json.loads would raise on it, so skip it.
        if not line.strip():
            continue

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # One pass over the cast: record the actor's name and credit this
        # movie's genres to the actor.
        for actor_id, actor_name in this_movie["actors"]:
            actor_name_map[actor_id] = actor_name

            # setdefault stores the (possibly new) per-actor dict in the map,
            # so no separate write-back is needed.
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})

            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie["actors"]},
            "genres": this_movie["genres"],
        }

In [10]:
# Movie-to-cluster assignments; later cells read the "movie_id" and "cluster" columns.
cluster_df = pd.read_csv(filepath_or_buffer="movie_to_cluster.csv")

In [11]:
# Number of movies per cluster, largest cluster first
# (sort=True / ascending=False are the defaults, spelled out for the reader).
cluster_df["cluster"].value_counts(sort=True, ascending=False)

6     4119
2     3616
0     3507
13    1551
1     1380
3     1252
8     1072
4      845
14     750
7      638
10     553
9      483
5      318
12     295
11     194
15      47
Name: cluster, dtype: int64

In [12]:
# Prior probability of each cluster: its share of all rows in cluster_df.
# Computed once and reused for both the lookup dict and the display frame.
cluster_pr_series = cluster_df["cluster"].value_counts() / cluster_df.shape[0]

cluster_pr_map = cluster_pr_series.to_dict()   # cluster id -> Pr(cluster)
cluster_pr_df = cluster_pr_series.to_frame()   # same values as a one-column frame

cluster_pr_df

Unnamed: 0,cluster
6,0.199758
2,0.175364
0,0.170078
13,0.075218
1,0.066925
3,0.060718
8,0.051988
4,0.04098
14,0.036372
7,0.030941


### Assess Genre-Specific Cluster Probabilities

In [13]:
# Tally, per genre, how many movies carry that genre label.
genre_counts = {}
for movie_info in movie_actor_map.values():
    for genre in movie_info["genres"]:
        if genre in genre_counts:
            genre_counts[genre] += 1
        else:
            genre_counts[genre] = 1

# Turn each count into the fraction of all movies that have the genre.
# A movie can list several genres, so these fractions need not sum to 1.
n_movies = len(movie_actor_map)
genre_prs = [(genre, g_count / n_movies) for genre, g_count in genre_counts.items()]

# Keep both a frame (for display) and a plain dict (for lookups by genre name).
genre_prs_df = pd.DataFrame(genre_prs, columns=["genre", "probability"])
genre_pr_map = dict(zip(genre_prs_df["genre"], genre_prs_df["probability"]))

genre_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,genre,probability
5,Drama,0.49258
0,Comedy,0.291804
10,Thriller,0.19418
6,Action,0.181523
3,Horror,0.149224
8,Crime,0.134481
2,Romance,0.12226
7,Adventure,0.080844
9,Mystery,0.074442
4,Sci-Fi,0.051164


In [14]:
# Genre to condition on: the cells below compute Pr(Cluster | target_genre).
# It is matched by exact membership against each movie's "genres" list and is
# also used as a key into genre_pr_map, so it must match a genre label exactly.
target_genre = "Sci-Fi"

In [15]:
# Joint probability Pr(Genre = target_genre, Cluster = c) for every cluster c.
# groupby yields the clusters in ascending ID order, so position i of the list
# holds the value for the i-th smallest cluster ID.
n_clustered_movies = cluster_df.shape[0]
per_cluster_prs = []

for cluster_id, group in cluster_df.groupby("cluster"):
    cluster_size = group.shape[0]

    # How many of this cluster's movies are tagged with the target genre
    this_cluster_genre_count = sum(
        target_genre in movie_actor_map[m]["genres"] for m in group["movie_id"]
    )

    # Conditional probability of the target genre given this cluster
    ## Pr(X=Genre | Y=Cluster <ID>)
    pr_genre_given_cluster = this_cluster_genre_count / cluster_size

    # Scale by the cluster's share of all movies to get the joint probability
    ## Pr(X=Genre, Y=Cluster <ID>) = Pr(X=Genre | Y=Cluster <ID>) * Pr(Y=Cluster <ID>)
    joint_pr_genre_cluster = pr_genre_given_cluster * cluster_size / n_clustered_movies

    # Track the joint (not the conditional) probability; the next cell divides
    ## it by Pr(Genre) to obtain the posterior
    per_cluster_prs.append(joint_pr_genre_cluster)
    

In [16]:
# For each cluster ID, calculate the posterior probability given the target genre.
#
# per_cluster_prs holds one joint probability Pr(Genre, Cluster) per cluster, in
# groupby order (ascending cluster ID, rows with a missing cluster excluded).
# Pair the values with the real cluster IDs rather than with list positions, so
# the printed labels stay correct even if the IDs are not exactly 0..K-1.
cluster_ids = sorted(cluster_df["cluster"].dropna().unique())
assert len(cluster_ids) == len(per_cluster_prs), \
    "per_cluster_prs is out of sync with cluster_df; re-run the previous cell"

# NOTE(review): genre_pr_map is computed over every movie in movie_actor_map while
# the joint probabilities are computed over the rows of cluster_df; the posteriors
# sum to 1 only when both cover the same set of movies.
for cluster_id, cluster_genre_pr in zip(cluster_ids, per_cluster_prs):

    # Bayes' rule: Pr(Cluster | Genre) = Pr(Genre, Cluster) / Pr(Genre)
    pr_cluster_given_genre = cluster_genre_pr / genre_pr_map[target_genre]

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_genre), "\t", pr_cluster_given_genre)
    

Pr[Cluster 00 | Sci-Fi]: 	 0.14407582938388624
Pr[Cluster 01 | Sci-Fi]: 	 0.08815165876777252
Pr[Cluster 02 | Sci-Fi]: 	 0.14597156398104266
Pr[Cluster 03 | Sci-Fi]: 	 0.058767772511848344
Pr[Cluster 04 | Sci-Fi]: 	 0.04549763033175355
Pr[Cluster 05 | Sci-Fi]: 	 0.022748815165876776
Pr[Cluster 06 | Sci-Fi]: 	 0.20284360189573458
Pr[Cluster 07 | Sci-Fi]: 	 0.04170616113744076
Pr[Cluster 08 | Sci-Fi]: 	 0.03981042654028436
Pr[Cluster 09 | Sci-Fi]: 	 0.025592417061611375
Pr[Cluster 10 | Sci-Fi]: 	 0.025592417061611375
Pr[Cluster 11 | Sci-Fi]: 	 0.016113744075829384
Pr[Cluster 12 | Sci-Fi]: 	 0.022748815165876776
Pr[Cluster 13 | Sci-Fi]: 	 0.06824644549763033
Pr[Cluster 14 | Sci-Fi]: 	 0.050236966824644555
Pr[Cluster 15 | Sci-Fi]: 	 0.0018957345971563982


## Exercise 2. Finding the most likely cluster given a particular actor (30 minutes)

In [17]:
# Tally, per actor, how many movies list that actor.
# Each movie's "actors" entry is a set, so an actor counts once per movie.
actor_counts = {}
for movie_info in movie_actor_map.values():
    for actor in movie_info["actors"]:
        if actor in actor_counts:
            actor_counts[actor] += 1
        else:
            actor_counts[actor] = 1

# Turn each count into the fraction of all movies the actor appears in.
n_movies = len(movie_actor_map)
actor_prs = [(actor, a_count / n_movies) for actor, a_count in actor_counts.items()]

# Keep both a frame (for display) and a plain dict (for lookups by actor id).
actor_prs_df = pd.DataFrame(actor_prs, columns=["actor", "probability"])
actor_pr_map = dict(zip(actor_prs_df["actor"], actor_prs_df["probability"]))

actor_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,actor,probability
661,nm0000616,0.009408
258,nm0000514,0.004753
1033,nm0001744,0.004704
1205,nm0001803,0.004268
126,nm0222881,0.003637
...,...,...
15304,nm12444764,0.000048
15303,nm2877285,0.000048
15302,nm10843335,0.000048
15301,nm12015373,0.000048


In [18]:
# Actor to condition on: the cells below compute Pr(Cluster | target_actor_id).
# The value is an actor ID (as stored in each movie's "actors" set and used as a
# key into actor_pr_map), not a display name.
target_actor_id = 'nm0000115' # Nic Cage

In [19]:
# Joint probability Pr(Actor = target_actor_id, Cluster = c) for every cluster c.
# groupby yields the clusters in ascending ID order, so position i of the list
# holds the value for the i-th smallest cluster ID.
n_clustered_movies = cluster_df.shape[0]
per_cluster_prs = []

for cluster_id, group in cluster_df.groupby("cluster"):
    cluster_size = group.shape[0]

    # How many of this cluster's movies feature the target actor
    this_cluster_actor_count = sum(
        target_actor_id in movie_actor_map[m]["actors"] for m in group["movie_id"]
    )

    # Conditional probability of the target actor given this cluster
    ## Pr(X=Actor | Y=Cluster <ID>)
    pr_actor_given_cluster = this_cluster_actor_count / cluster_size

    # Scale by the cluster's share of all movies to get the joint probability
    ## Pr(X=Actor, Y=Cluster <ID>) = Pr(X=Actor | Y=Cluster <ID>) * Pr(Y=Cluster <ID>)
    joint_pr_actor_cluster = pr_actor_given_cluster * cluster_size / n_clustered_movies

    # Track the joint (not the conditional) probability; the next cell divides
    ## it by Pr(Actor) to obtain the posterior
    per_cluster_prs.append(joint_pr_actor_cluster)

In [21]:
# For each cluster ID, calculate the posterior probability given the target actor.
#
# per_cluster_prs holds one joint probability Pr(Actor, Cluster) per cluster, in
# groupby order (ascending cluster ID, rows with a missing cluster excluded).
# Pair the values with the real cluster IDs rather than with list positions, so
# the printed labels stay correct even if the IDs are not exactly 0..K-1.
cluster_ids = sorted(cluster_df["cluster"].dropna().unique())
assert len(cluster_ids) == len(per_cluster_prs), \
    "per_cluster_prs is out of sync with cluster_df; re-run the previous cell"

# NOTE(review): actor_pr_map is computed over every movie in movie_actor_map while
# the joint probabilities are computed over the rows of cluster_df; the posteriors
# sum to 1 only when both cover the same set of movies.
for cluster_id, cluster_actor_pr in zip(cluster_ids, per_cluster_prs):

    # Bayes' rule: Pr(Cluster | Actor) = Pr(Actor, Cluster) / Pr(Actor)
    pr_cluster_given_actor = cluster_actor_pr / actor_pr_map[target_actor_id]

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_actor_id), "\t", pr_cluster_given_actor)
    

Pr[Cluster 00 | nm0000115]: 	 0.0
Pr[Cluster 01 | nm0000115]: 	 0.0
Pr[Cluster 02 | nm0000115]: 	 0.0
Pr[Cluster 03 | nm0000115]: 	 0.0
Pr[Cluster 04 | nm0000115]: 	 0.0
Pr[Cluster 05 | nm0000115]: 	 0.0
Pr[Cluster 06 | nm0000115]: 	 0.0
Pr[Cluster 07 | nm0000115]: 	 0.0
Pr[Cluster 08 | nm0000115]: 	 0.0
Pr[Cluster 09 | nm0000115]: 	 0.0
Pr[Cluster 10 | nm0000115]: 	 0.0
Pr[Cluster 11 | nm0000115]: 	 0.0
Pr[Cluster 12 | nm0000115]: 	 0.9836065573770492
Pr[Cluster 13 | nm0000115]: 	 0.0
Pr[Cluster 14 | nm0000115]: 	 0.01639344262295082
Pr[Cluster 15 | nm0000115]: 	 0.0
