In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.read_csv('titles_mod2.csv')

movie_actors = defaultdict(list) # Dictionary that stores the actors in each movie as a list
actors = defaultdict(list) # Dictionary that stores the imdb scores for an actor's appearances

for row in df.index: # Get actor's name, then add the imdb score of that show to actor's list
    row_actors = df['actor_id'][row].strip('[]')
    for row_actor in row_actors.split(','):
        actor_id = row_actor.strip(" '").strip('"')
        if actor_id != '': # Remove "blank" actors
            movie_actors[df['id'][row]].append(actor_id)
            actors[actor_id].append(df['imdb_score'][row])

avg_actors = defaultdict(float)

for actor in actors.keys():
    avg_actors[actor] = (sum(actors[actor]) / len(actors[actor])) # Average out the scores

avg_movies = defaultdict(float) # Dictionary to find the average "actor score" based on the average
                                # avg_imdb score of all actors in the movie

indexes = []

for movie in movie_actors.keys():
    scoreSum = 0
    actorAppearances = 0
    for actor in movie_actors[movie]:
        scoreSum += avg_actors[actor]
        actorAppearances += len(actors[actor])

    avg_movies[movie] = scoreSum / len(movie_actors[movie])

    if actorAppearances / len(movie_actors[movie]) > 5: # Keep track of the movies that have a high
                                                        # high enough appearance rate. Pick 5 as
                                                        # we want to look at actors with a career
        indexes.append(movie)


movie_avg_actors = pd.Series(avg_movies)
tmdb_scores = pd.Series(df['imdb_score'])

In [3]:
scores = df[['id','imdb_score']].copy()
scores = scores.set_index('id')
actor_movie_scores = pd.concat([movie_avg_actors, scores], axis=1)
actor_movie_scores = actor_movie_scores.filter(indexes, axis=0)
actor_movie_scores.index.name = 'id'
actor_movie_scores.columns = ['actor_score', 'imdb_score']
print(len(actor_movie_scores.index) / 5340)
actor_movie_scores[['imdb_score', 'actor_score']].corr(method='pearson')


0.13651685393258428


Unnamed: 0,imdb_score,actor_score
imdb_score,1.0,0.443248
actor_score,0.443248,1.0


In [4]:
movie_directors = defaultdict(list) # Dictionary that stores the directors in each movie as a list
directors = defaultdict(list) # Dictionary that stores the imdb scores for an director's appearances

for row in df.index: # Get director's name, then add the imdb score of that show to director's list
    row_directors = df['director_id'][row].strip('[]{}')
    for row_director in row_directors.split(','):
        director_id = row_director.strip(" '").strip('"')
        if director_id != '': # Remove "blank" directors
            movie_directors[df['id'][row]].append(director_id)
            directors[director_id].append(df['imdb_score'][row])

avg_directors = defaultdict(float)

for director in directors.keys():
    avg_directors[director] = (sum(directors[director]) / len(directors[director])) # Average out the scores

avg_movies = defaultdict(float) # Dictionary to find the average "director score" based on the average
                                # avg_imdb score of all directors in the movie

dir_indexes = []

for movie in movie_directors.keys():
    scoreSum = 0
    for director in movie_directors[movie]:
        scoreSum += avg_directors[director]
    
    avg_movies[movie] = scoreSum / len(movie_directors[movie])

    if len(movie_directors[movie]) >= 2: # Keep track of the movies that have a high
                                        # high enough appearance rate. Pick 2 as
                                        # we want to look at directors with more data, but less than
                                        # actors as it's rarer for a director to direct multiple movies
        dir_indexes.append(movie)

movie_avg_directors = pd.Series(avg_movies)
tmdb_scores = pd.Series(df['imdb_score'])

dir_scores = df[['id','imdb_score']].copy()
dir_scores = dir_scores.set_index('id')
director_movie_scores = pd.concat([movie_avg_directors, dir_scores], axis=1)
director_movie_scores = director_movie_scores.filter(dir_indexes, axis=0)
director_movie_scores.index.name = 'id'
director_movie_scores.columns = ['director_score', 'imdb_score']
print(len(director_movie_scores.index))
director_movie_scores[['imdb_score', 'director_score']].corr(method='pearson')


413


Unnamed: 0,imdb_score,director_score
imdb_score,1.0,0.93465
director_score,0.93465,1.0


In [31]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load your dataset (replace 'your_dataset.csv' with your dataset path)
df = pd.read_csv('titles_mod2.csv')

# Check for rows with empty data in all columns
empty_rows = df[df.apply(lambda row: all(pd.isna(row) | (row == '[]')), axis=1)]

# Remove rows with empty data
df_cleaned = df.drop(empty_rows.index)

features = ['runtime', 'imdb_score', 'tmdb_score']

# Extract the selected features
X = df[features]
X

Unnamed: 0,runtime,imdb_score,tmdb_score
0,51,6.6,6.900000
1,114,8.2,8.179000
2,109,7.7,7.300000
3,91,8.2,7.811000
4,150,7.7,7.600000
...,...,...,...
5845,100,6.8,6.982444
5846,134,7.7,7.498945
5847,90,3.8,6.300000
5848,37,10.0,10.000000


In [30]:
# Standardize the features (mean=0, variance=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Choose the number of clusters (K)
k = 3  # Replace with your desired number of clusters

# Apply K-Means clustering
kmeans = KMeans(n_clusters=k, n_init=10) 
df['cluster'] = kmeans.fit_predict(X_scaled)

# Analyze and interpret the clusters
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
df_cluster_centers = pd.DataFrame(cluster_centers, columns=features)
print(df_cluster_centers)

      runtime  imdb_score  tmdb_score
0  112.477332    6.861442    6.939382
1   86.855769    5.076723    5.556299
2   41.527204    7.188683    7.572387
