## Description

#### Purpose: Calculate distributor scores from the dataset, following a methodology similar to that used by *The Numbers*.

#### Input: `3.3.7_Merged_Data_Filtered.csv`

#### Outputs: `3.3.8_Merged_Data_Dist_Scores.csv`

This notebook performs the same steps as `3.3.3 Production Company Score.ipynb`, but for distributor scores.

In [None]:
from tmdbv3api import TMDb
from tmdbv3api import Movie
from tmdbv3api.exceptions import TMDbException
import random
import pandas as pd
import csv
import numpy as np
from math import exp
import ast
# Instantiate the TMDb client and set its API key.
tmdb = TMDb()
# The real API key has been redacted; only a blank placeholder is stored here.
tmdb.api_key = ' '

In [None]:
# Relative location of the filtered merged dataset (the notebook's input)
csv_file_path = '../3.3.7 Additional Filtering/Outputs/3.3.7_Merged_Data_Filtered.csv'
# Load the dataset into the working DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [None]:
# Per-movie points column. Every movie starts at 0; only the top grossers of
# each release year receive points below.
df['Movie Contribution to Director and Production Scores'] = 0

# For each release year, rank that year's movies by revenue and award points by
# rank: 100 to the top-grossing movie, 99 to the second, ..., 1 to the 100th.
release_years = range(2010, 2024)
for year in release_years:
    df_year = df[df["Release Year"] == year]
    # Highest revenue first. head(100) simply returns fewer rows when a year
    # has fewer than 100 movies, so short years no longer raise an IndexError
    # the way positional .iloc[i] over a fixed range(0, 100) did.
    df_year = df_year.sort_values(by=['Merged Revenue'], ascending=False).head(100)
    for i, tmdb_id_to_update in enumerate(df_year['IMDB ID']):
        # Record the contribution on every row of df carrying this IMDB ID
        df.loc[df['IMDB ID'] == tmdb_id_to_update, 'Movie Contribution to Director and Production Scores'] = 100 - i

In [None]:
# Collect every distinct distributor ID that occurs in the main data
unique_dist_ids = list(set(df['Domestic Distributor ID']))

# Year labels 2010 through 2023, as strings
years = list(map(str, range(2010, 2024)))

# One zero-filled 'star_<year>' column per year, followed by the 'ids' column
n_distributors = len(unique_dist_ids)
data = {}
for year_label in years:
    data[f'star_{year_label}'] = [0] * n_distributors
data['ids'] = unique_dist_ids

# Assemble the per-distributor score table
dist_df = pd.DataFrame(data)

# Show the first few rows of dist_df as a sanity check
print(dist_df.head())

In [None]:
import pandas as pd

# Total the movie contributions per (distributor, release year) pair in a
# single grouped pass, instead of re-scanning dist_df twice for every movie row.
# groupby(...).sum() skips NaN contributions, matching the pd.isna() guard the
# row-by-row version used; rows whose distributor ID is NaN are dropped by
# groupby, and such IDs could never match a dist_df row anyway.
contribution_totals = df.groupby(
    ['Domestic Distributor ID', 'Release Year']
)['Movie Contribution to Director and Production Scores'].sum()

for (dist_id, release_year), score_contribution in contribution_totals.items():
    # Add the pair's total to that distributor's 'star_<year>' cell. A release
    # year without a matching column still raises KeyError, as before.
    dist_df.loc[dist_df['ids'] == dist_id, f'star_{release_year}'] += score_contribution

# Build three-year totals: for every 'star_<year>' column from 2012 onward, the
# output value is that year's score plus the scores of the two preceding years.
# Columns for 2011 and earlier, and the 'ids' column, are copied through as-is.
output_dists = dist_df.copy(deep=True)

for column in dist_df.columns:
    if not column.startswith("star_"):
        continue
    release_year = int(column.split("_")[1])
    if release_year <= 2011:
        continue
    previous_year = release_year - 1
    year_before_previous = release_year - 2
    # Column-wise addition updates every distributor at once. All three terms
    # are read from dist_df (the un-summed table), never from output_dists, so
    # an already-summed window is never fed into the next year's sum.
    output_dists[column] = (
        dist_df[column]
        + dist_df[f'star_{previous_year}']
        + dist_df[f'star_{year_before_previous}']
    )

In [None]:
# Preview the first five rows of the summed distributor scores
print(output_dists.head(n=5))

In [None]:
# Write the raw distributor score table to disk, without the index column
output_dists.to_csv(path_or_buf='domestic_dist_df.csv', index=False)

In [None]:
# Keep only movies released after 2012. Copying after the filter yields an
# independent frame, so adding a column below does not operate on a slice of df
# (which triggers pandas' SettingWithCopyWarning).
df_output = df[df['Release Year'] > 2012].copy()
# Default score of 0 for movies without a usable distributor ID
df_output['Domestic Distributor Score'] = 0

for index, row in df_output.iterrows():
    dist_id = row['Domestic Distributor ID']
    # Accept any numeric ID. pandas stores an integer column as float as soon
    # as it contains a missing value, so a plain isinstance(..., int) check
    # would silently skip every distributor in that case.
    is_numeric_id = isinstance(dist_id, (int, float, np.integer, np.floating))
    if not is_numeric_id or pd.isna(dist_id) or dist_id == 0:
        continue  # missing, zero or non-numeric ID: leave the score at 0

    # A movie is scored with its distributor's total as of the previous year
    previous_year = row['Release Year'] - 1
    star_column_name = f'star_{previous_year}'
    dist_info_row = output_dists[output_dists['ids'] == dist_id]
    if dist_info_row.empty:
        continue  # distributor absent from the score table: leave the score at 0
    star_score = dist_info_row[star_column_name].values[0]

    # Write by row label rather than by matching IMDB ID: repeated IMDB IDs are
    # not double counted, and rows whose IMDB ID is missing still get a score.
    df_output.at[index, 'Domestic Distributor Score'] = star_score

# Preview the result, then save it to csv
print(df_output.head())
df_output.to_csv('Outputs/3.3.8_Merged_Data_Dist_Scores.csv', index=False)