## Description

#### Purpose: Calculate a director score for each movie in the dataset, following a methodology similar to that of *The Numbers*.

#### Input: `3.3.3_Merged_Data_Prod_Scores.csv`

#### Outputs: `3.3.4_Merged_Data_Dir_Scores.csv`

This notebook performs the same functions as `3.3.3 Production Company Score.ipynb` but for director score.

In [None]:
from tmdbv3api import TMDb
from tmdbv3api import Movie
from tmdbv3api.exceptions import TMDbException
import random
import pandas as pd
import csv
import numpy as np
from math import exp
import ast
# Create the TMDb API client. The API key has been redacted from this
# notebook, so the placeholder below is a single blank character.
tmdb = TMDb()
tmdb.api_key = ' '

In [None]:
# Location of the merged dataset written by the production-company score
# step (3.3.3); it is the input of this notebook.
csv_file_path = (
    '../3.3.3 Calculate Production Company Score/Outputs/'
    '3.3.3_Merged_Data_Prod_Scores.csv'
)
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
# Points each movie contributes to director and production-company scores.
# Every movie starts at 0; only the top grossers of each year receive points.
df['Movie Contribution to Director and Production Scores'] = 0

# For each release year, rank that year's movies by 'Merged Revenue' and award
# 100 points to the top grosser, 99 to the second, ..., 1 to the 100th.
release_years = range(2010, 2024)
for year in release_years:
    df_year = df[df["Release Year"] == year]
    print(df_year.head())
    # Highest revenue first. NOTE(review): sort_values places NaN revenues
    # last, so a year with fewer than 100 known revenues can still award
    # points to movies without revenue data -- confirm this is intended.
    df_year = df_year.sort_values(by=['Merged Revenue'], ascending=False)
    # head(100) yields at most 100 ids, so a year with fewer than 100 movies
    # no longer raises IndexError the way positional .iloc[0..99] access did.
    for i, imdb_id_to_update in enumerate(df_year['IMDB ID'].head(100)):
        # Record the contribution on every row carrying this IMDB ID.
        df.loc[df['IMDB ID'] == imdb_id_to_update, 'Movie Contribution to Director and Production Scores'] = 100 - i

In [None]:
# Collect every director id that appears in the dataset. Each non-missing
# 'director_ids' cell holds the text form of a Python list, which
# ast.literal_eval parses without executing arbitrary code.
director_ids = []
for director_ids_str in df['director_ids']:
    if not pd.isna(director_ids_str):
        director_ids.extend(ast.literal_eval(director_ids_str))

# Drop repeated ids so each director gets exactly one row.
unique_director_ids = list(set(director_ids))

# Release years covered by the score table: 2010 through 2023 inclusive.
years = [str(year) for year in range(2010, 2024)]
star_column_names = [f'star_{year}' for year in years]

# One zero-initialised 'star_<year>' column per year, plus the id column.
data = {name: [0] * len(unique_director_ids) for name in star_column_names}
data['ids'] = unique_director_ids

# Build 'director_df' with 'ids' as the first column, followed by the
# yearly score columns in chronological order.
director_df = pd.DataFrame(data, columns=['ids'] + star_column_names)

# Show the first few rows of 'director_df' for debugging.
print(director_df.head())

In [None]:
def _star_column_for(year):
    """Return the 'star_<year>' column name for *year*.

    Returns None when *year* is missing or is not a whole number, so callers
    can skip the row instead of building a name such as 'star_2015.0'.
    """
    try:
        year_number = float(year)
    except (TypeError, ValueError):
        return None
    # is_integer() is False for NaN and infinities as well as fractions.
    if not year_number.is_integer():
        return None
    return f'star_{int(year_number)}'


# Yearly score columns, in the order they appear in director_df.
star_columns = [column for column in director_df.columns if column.startswith('star_')]

# Director id -> row label, so each lookup is O(1) instead of a full
# boolean-mask scan of director_df. Assumes 'ids' holds unique values.
row_label_by_id = dict(zip(director_df['ids'], director_df.index))

# Accumulate each movie's contribution into the release-year column of every
# director credited on that movie.
for _, row in df.iterrows():
    director_ids_str = row['director_ids']
    # Movies without director information contribute nothing.
    if pd.isna(director_ids_str):
        continue
    score_contribution = row['Movie Contribution to Director and Production Scores']
    # A missing contribution adds nothing to any director.
    if pd.isna(score_contribution):
        continue
    # Skip movies whose release year has no matching score column (missing,
    # fractional, or outside the covered years) instead of raising KeyError.
    star_column = _star_column_for(row['Release Year'])
    if star_column not in star_columns:
        continue
    # Safely evaluate the textual list of director ids.
    director_ids = ast.literal_eval(director_ids_str)
    for director_id in director_ids:
        row_label = row_label_by_id.get(director_id)
        if row_label is not None:
            director_df.at[row_label, star_column] += score_contribution

# director_df keeps the raw per-year totals; output_director receives the
# rolling sums computed from those raw values.
output_director = director_df.copy(deep=True)

# From 2012 onwards, each 'star_<year>' becomes the sum of that year and the
# two preceding years. Earlier columns keep their single-year totals.
for column in star_columns:
    release_year = int(column.split("_")[1])
    if release_year > 2011:
        output_director[column] = (
            director_df[column]
            + director_df[f'star_{release_year - 1}']
            + director_df[f'star_{release_year - 2}']
        )

In [None]:
# Write the per-director yearly score table to disk, omitting the row index.
output_director.to_csv(path_or_buf='director_df.csv', index=False)

In [None]:
# Keep only movies released after 2012. The explicit .copy() makes df_output
# an independent frame, so adding columns below does not trigger pandas'
# chained-assignment warning.
df_output = df[df['Release Year'] > 2012].copy()

# 'star_<year>' column -> {director id -> score}. Built once so each lookup
# is a dict access rather than a boolean-mask scan of output_director.
director_scores_by_column = {
    column: dict(zip(output_director['ids'], output_director[column]))
    for column in output_director.columns
    if column.startswith('star_')
}

total_director_scores = []  # sum of the movie's directors' scores
avg_director_scores = []    # that sum divided by the number of directors

# Scores are collected per row and assigned by position afterwards. Matching
# rows through 'IMDB ID' masks added a score several times when rows shared an
# id and never updated rows whose id is NaN.
for _, row in df_output.iterrows():
    row_dir_info = row['director_ids']  # textual list of director ids
    if not pd.isna(row_dir_info):
        row_dir_info = ast.literal_eval(row_dir_info)
    else:
        row_dir_info = []  # no director information for this movie

    # A movie is scored with its directors' values for the previous year.
    # int() normalises float years (2015.0) so the column name is
    # 'star_2014' rather than 'star_2014.0'.
    previous_year = int(row['Release Year']) - 1
    star_column_name = f'star_{previous_year}'
    # A year without a score column yields an empty lookup, so the movie
    # keeps a score of 0 instead of raising KeyError.
    scores_for_year = director_scores_by_column.get(star_column_name, {})

    total_score = 0
    for director_id in row_dir_info:
        star_score = scores_for_year.get(director_id)
        # Unknown directors and missing scores contribute nothing.
        if star_score is not None and not pd.isna(star_score):
            total_score += star_score

    total_director_scores.append(total_score)
    # Average over all listed directors; 0.0 when the movie has none.
    if len(row_dir_info) > 0:
        avg_director_scores.append(total_score / len(row_dir_info))
    else:
        avg_director_scores.append(0.0)

df_output['Total Director Score'] = total_director_scores
df_output['Avg Director Score'] = avg_director_scores

# Save to csv
print(df_output.head())
df_output.to_csv('3.3.4_Merged_Data_Dir_Scores.csv', index=False)