In [47]:
import os
import sys
import logging
import boto3
import itertools as it
import io
from utils import *
import snowflake.connector

In [48]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import datetime
from datetime import timedelta
import scipy.stats as st

In [49]:
# Notebook display settings: print up to 1000 rows and never truncate columns.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = None
# Turn off the chained-assignment warning (pandas' default for this option is 'warn').
pd.set_option('mode.chained_assignment', None)

In [50]:
class SnowflakeConnector(BaseConnector):
    """Snowflake connection factory configured from a JSON secret.

    The secret is read once at construction; ``connect`` then opens a
    connection with the login, account and warehouse values it holds.
    """

    def __init__(self, credentials: Credentials):
        """Read the secret through ``credentials.get_keys()`` and parse it.

        The JSON text comes from the ``'SecretString'`` entry of the returned
        mapping; if that entry is missing, an empty dict is stored.
        """
        secret_payload = credentials.get_keys().get('SecretString', "{}")
        self._secrets = json.loads(secret_payload)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Open and return a Snowflake connection to ``dbname`` / ``schema``.

        Raises ``KeyError`` when the parsed secret lacks ``login_name``,
        ``login_password``, ``account`` or ``warehouse``.
        """
        secrets = self._secrets
        return snowflake.connector.connect(
            user=secrets['login_name'],
            password=secrets['login_password'],
            account=secrets['account'],
            warehouse=secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
    
## Credentials
# Identifier handed to SSMPSCredentials to look up the Snowflake login secret.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection
# `ctx` is the shared connection (database MAX_PROD, schema DATASCIENCE_STAGE)
# that run_query() executes against.
conn = SnowflakeConnector(credentials=SSMPSCredentials(SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="DATASCIENCE_STAGE")

In [51]:
def run_query(query):
    """Execute ``query`` on the shared Snowflake connection and return a DataFrame.

    Args:
        query: SQL text to run on the module-level connection ``ctx``.

    Returns:
        pandas.DataFrame with every fetched row; column names come from the
        cursor description and are converted to lower case.

    Any exception raised while executing or fetching propagates to the
    caller after the cursor has been closed.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
    finally:
        # Release the cursor on success and on failure; the original never closed it.
        cursor.close()
    df = pd.DataFrame(rows, columns=column_names)
    df.columns = df.columns.str.lower()
    return df

In [52]:
# Root logger used for progress messages.
logger = logging.getLogger()
logger.info('Loading inputs')
data_list = dict()

# S3 resource and a handle on the content bucket.
s3 = boto3.resource(service_name='s3')
bucket = s3.Bucket(name='hbo-ingest-datascience-content-dev')

In [53]:
# Local CSV inputs for the Sundance analysis (paths are relative to the working directory).
cast = pd.read_csv('Sundance Cast List.csv')
log_line = pd.read_csv('Sundance Loglines.csv')
# NOTE(review): 'Sundance Loglines Dups.csv' is written further down from log_line, and
# log_comps is re-read from 'Sundance Loglines Comps.csv' before it is first used, so this
# read looks like a leftover that fails on a fresh checkout — confirm before removing.
log_comps = pd.read_csv('Sundance Loglines Dups.csv')

# Cast Ranking

In [54]:
# Pull one row per (contributor, title, category) from the IMDb contributor table.
#   * imdb_contributor CTE: joins contributor billing rows to IMDB_TITLE, keeps only
#     'tvEpisode' and 'movie' titles, drops 'archive_footage' credits, and flags
#     is_hbo_titles = 'Yes' when the lower-cased title and the year match a row of
#     reporting_asset_dim.
#   * yearly_sum CTE: total number_of_votes over all IMDB_TITLE rows per year, keeping
#     only years whose total is > 0 (the final inner join therefore drops other years).
#   * standardized_votes = number_of_votes / yearly_sum * 10000
#   * standardized_score = standardized_votes * logged_order
# NOTE(review): the meaning of logged_order is defined by the upstream workspace table
# and is not visible here — confirm before interpreting the score.
imdb_cast = run_query(''' with
      imdb_contributor AS (
          SELECT distinct name as contributor_name,
                 t.title_id, c.category,
                 t.ORIGINAL_TITLE, t.TITLE_TYPE,
                 production_release_date,
                 YEAR as release_year,
                 min(c.logged_order) as logged_order,
                 min(c.billingorder) as billingorder,
                 max(number_of_votes) as number_of_votes,
                 max(imdb_rating) as imdb_rating,
                case when hbo_titles IS NULL THEN 'No' ELSE 'Yes' END AS is_hbo_titles
          FROM max_prod.workspace.imdb_title_contributor_billing_logged c
          JOIN "ENTERPRISE_DATA"."CATALOG"."IMDB_TITLE" t  ON c.titleid = t.title_id
          LEFT JOIN  (select distinct title_name , release_year, 1 as hbo_titles from max_prod.catalog.reporting_asset_dim) rad
                    ON LOWER(t.ORIGINAL_TITLE) = LOWER(rad.title_name )
                    and t.year = rad.release_year
          where 1=1
          and title_type IN ('tvEpisode', 'movie')
          and category NOT IN ('archive_footage')

          GROUP BY 1, 2, 3, 4, 5, 6, 7, 12
      ),
      yearly_sum as (
      select YEAR as release_year, sum(number_of_votes) as yearly_sum
      from
      "ENTERPRISE_DATA"."CATALOG"."IMDB_TITLE"
      GROUP BY YEAR
      having yearly_sum > 0
      )

      SELECT c.contributor_name, c.ORIGINAL_TITLE as title_name, c.category, c.title_type,
             number_of_votes, yearly_sum,
             number_of_votes/yearly_sum * 10000 as standardized_votes,
             imdb_rating,
             production_release_date,
             c.release_year,
             billingOrder,
             number_of_votes/yearly_sum *logged_order * 10000  AS standardized_score,
             c.is_hbo_titles
      FROM imdb_contributor c
      JOIN yearly_sum s ON c.release_year = s. release_year
      order by STANDARDIZED_SCORE desc
''')

In [55]:
# Per-contributor 2021 score: total standardized_score over all of their 2021 credits.
# Only the score column is aggregated. A blanket .sum() over the frame also tries to
# add the text columns (title_name, category, title_type, ...); pandas >= 2.0 no
# longer drops those silently, so it concatenates strings or raises instead.
imdb_cast_2021 = imdb_cast[imdb_cast['release_year'] == '2021']
imdb_cast_2021 = (
    imdb_cast_2021
    .groupby(['contributor_name', 'release_year'])['standardized_score']
    .sum()
    .reset_index()
)
# Dense rank within the year; rank 1 is the highest total score.
imdb_cast_2021['ranking'] = (
    imdb_cast_2021.groupby("release_year")["standardized_score"].rank(method="dense", ascending=False)
)

In [56]:
# Per-contributor 2020 score: total standardized_score over all of their 2020 credits.
# Only the score column is aggregated. A blanket .sum() over the frame also tries to
# add the text columns (title_name, category, title_type, ...); pandas >= 2.0 no
# longer drops those silently, so it concatenates strings or raises instead.
imdb_cast_2020 = imdb_cast[imdb_cast['release_year'] == '2020']
imdb_cast_2020 = (
    imdb_cast_2020
    .groupby(['contributor_name', 'release_year'])['standardized_score']
    .sum()
    .reset_index()
)
# Dense rank within the year; rank 1 is the highest total score.
imdb_cast_2020['ranking'] = (
    imdb_cast_2020.groupby("release_year")["standardized_score"].rank(method="dense", ascending=False)
)

In [57]:
# All-time score per contributor: total standardized_score over every credit.
# Aggregate the score column only; summing the whole frame would also "sum" the
# release_year and title text columns, which pandas >= 2.0 no longer drops silently.
imdb_cast_overall = (
    imdb_cast
    .groupby(['contributor_name'])['standardized_score']
    .sum()
    .reset_index()
)
# Rank 1 is the highest total score.
# NOTE(review): this uses pandas' default 'average' tie method, whereas the per-year
# rankings use 'dense' — confirm whether the difference is intended.
imdb_cast_overall['ranking'] = imdb_cast_overall["standardized_score"].rank(ascending=False)

In [58]:
# Step 1: total standardized_score per contributor per release year (score column only;
# a blanket .sum() would also try to add the text columns, which pandas >= 2.0 no
# longer drops silently).
imdb_cast_average = (
    imdb_cast
    .groupby(['release_year', 'contributor_name'])['standardized_score']
    .sum()
    .reset_index()
)
# Step 2: dense rank inside each release year; rank 1 is the highest score that year.
imdb_cast_average['ranking'] = (
    imdb_cast_average.groupby("release_year")["standardized_score"].rank(method="dense", ascending=False)
)
# Step 3: average the yearly score and yearly rank per contributor. The two columns
# are selected explicitly because .mean() over the string release_year column
# raises TypeError on pandas >= 2.0.
imdb_cast_average = (
    imdb_cast_average
    .groupby(['contributor_name'])[['standardized_score', 'ranking']]
    .mean()
    .reset_index()
)

In [59]:
# Trim each ranking table to name / score / rank and give the score and rank
# columns a table-specific prefix so the tables can be merged side by side.
_ranking_columns = ['contributor_name', 'standardized_score', 'ranking']

imdb_cast_2021 = imdb_cast_2021[_ranking_columns].rename(
    columns={'standardized_score': '2021_score', 'ranking': '2021_ranking'}
)
imdb_cast_2020 = imdb_cast_2020[_ranking_columns].rename(
    columns={'standardized_score': '2020_score', 'ranking': '2020_ranking'}
)
imdb_cast_overall = imdb_cast_overall[_ranking_columns].rename(
    columns={'standardized_score': 'overall_score', 'ranking': 'overall_ranking'}
)
imdb_cast_average = imdb_cast_average[_ranking_columns].rename(
    columns={'standardized_score': 'average_score', 'ranking': 'average_ranking'}
)

In [102]:
# Left-join every ranking table onto the Sundance cast list by contributor name;
# cast members without a match keep NaN in the corresponding score/rank columns.
for _ranking_table in (imdb_cast_2021, imdb_cast_2020, imdb_cast_overall, imdb_cast_average):
    cast = cast.merge(_ranking_table, how='left', on=['contributor_name'])

In [103]:
# Preview the first rows of the cast list with the merged score/ranking columns.
cast.head()

Unnamed: 0,Sundance Movie,Cast,contributor_name,2021_score,2021_ranking,2020_score,2020_ranking,overall_score,overall_ranking,average_score,average_ranking
0,892,Connie Britton,Connie Britton,,,4.588167,735.0,23.647116,20479.0,1.391007,8596.882353
1,892,John Boyega,John Boyega,0.552929,5457.0,,,72.675222,7069.0,8.075025,7667.888889
2,892,Michael K. Williams,Michael Kenneth Williams,0.076767,12109.0,0.652104,4825.0,53.598087,9478.0,3.34988,7855.9375
3,Alice,Common,Common,,,4.850961,665.0,84.285555,5967.0,5.267847,2438.875
4,Alice,Keke Palmer,Keke Palmer,,,0.051736,17333.0,14.26493,31540.0,1.188744,8703.333333


In [104]:
# Write the ranked cast list to CSV (the DataFrame index is written as the first column).
cast.to_csv('Sundance Cast List with Ranking.csv')

In [108]:
# Spot check: the five best-ranked contributors for 2021.
imdb_cast_2021.sort_values(by = ['2021_ranking']).head(5)

Unnamed: 0,contributor_name,2021_score,2021_ranking
59813,Timothée Chalamet,152.416027,1.0
60053,Tom Holland,143.182191,2.0
52745,Ryan Reynolds,139.06689,3.0
16801,Dwayne Johnson,127.409478,4.0
7274,Ben Affleck,120.017024,5.0


In [112]:
# Manual override: rewrite 'Juhan Ulfsak' to 'John David Washington' in the 2020 table.
# NOTE(review): presumably corrects a mis-credited entry in the source data — confirm.
# In file order this runs after `cast` has been merged and exported, so only merges
# further down (the holiday list) see the corrected name.
imdb_cast_2020['contributor_name'] = imdb_cast_2020['contributor_name'].str.replace('Juhan Ulfsak', 'John David Washington')

In [113]:
# Spot check: the five best-ranked contributors for 2020.
imdb_cast_2020.sort_values(by = ['2020_ranking']).head(5)

Unnamed: 0,contributor_name,2020_score,2020_ranking
34545,John David Washington,153.494216,1.0
29081,Jamie Foxx,129.123699,2.0
66482,Tom Holland,88.440886,3.0
23030,Gal Gadot,84.027742,4.0
30491,Jefferson Hall,76.735955,5.0


# Logline

In [127]:
# Keep only logline rows that name a Sundance movie.
log_line = log_line.dropna(subset=['Sundance Movie'])

In [128]:
# Snapshot of the filtered loglines; used as the unit that gets duplicated below.
log_line_2 = log_line.copy()

In [129]:
# Stack six copies of the filtered loglines (the original rows plus five duplicates).
# Built with a single concat from the untouched snapshot `log_line_2`, so the frame
# is not re-copied on every loop pass and re-running the cell does not grow it further.
log_line = pd.concat([log_line_2] * 6, axis = 0)

In [131]:
# Save the duplicated loglines to CSV (the DataFrame index is written as the first column).
log_line.to_csv('Sundance Loglines Dups.csv')

In [19]:
# Load the logline-to-comparable-title mapping and keep just the two columns used below.
log_comps = pd.read_csv('Sundance Loglines Comps.csv')[['Sundance Movie', 'Comps']]

In [18]:
# Title-level metrics for IMDb titles of type 'movie', one row per ORIGINAL_TITLE:
# max imdb_rating, max number_of_votes and max worldwide gross to date.
# The gross table is filtered to area_code 'XWW' inside the LEFT JOIN subquery, so
# movies without an XWW row are kept with a NULL box_office_ww.
# NOTE(review): grouping on the title text alone folds every row sharing an
# ORIGINAL_TITLE into one and takes the max of each metric — confirm this is acceptable.
contributor = run_query('''
SELECT t.ORIGINAL_TITLE,
       max(imdb_rating) as imdb_rating,
       max(NUMBER_OF_VOTES) as number_of_votes,
       max(box.AREA_GROSS_TO_DATE) as box_office_ww
FROM "ENTERPRISE_DATA"."CATALOG"."IMDB_TITLE" t  
LEFT JOIN 
        (select * from "ENTERPRISE_DATA"."CATALOG"."IMDB_BOXOFFICE_TITLE_GROSS"
        where area_code IN ('XWW')
        ) box 
ON t.title_id = box.title_id
where 1=1
--AND box.area_code IN ('XWW')
and title_type = 'movie'
group by 1
''')

In [20]:
# Attach the IMDb metrics to each comparable title (exact match on the title text);
# comps without a matching title keep NaN metrics because this is a left join.
log_comps = log_comps.merge(
    contributor,
    how='left',
    left_on=['Comps'],
    right_on=['original_title'],
)

In [26]:
from statistics import mean

In [28]:
# Inspect the column dtypes before converting the metric columns to float.
log_comps.dtypes

Sundance Movie      object
Comps               object
original_title      object
imdb_rating         object
number_of_votes     object
box_office_ww      float64
dtype: object

In [29]:
# Convert the three metric columns to float so they can be averaged numerically.
for _metric in ('imdb_rating', 'number_of_votes', 'box_office_ww'):
    log_comps[_metric] = log_comps[_metric].astype(float)


In [31]:
# Per-movie average rating and vote count across its comparable titles, broadcast
# back onto every row of that movie.
# The string alias 'mean' selects pandas' NaN-skipping mean. statistics.mean
# propagates NaN, so a single comp with no IMDb match (NaN after the left join)
# would turn the whole movie's average into NaN.
log_comps['average_rating'] = log_comps.groupby(['Sundance Movie'])['imdb_rating'].transform('mean')
log_comps['average_votes'] = log_comps.groupby(['Sundance Movie'])['number_of_votes'].transform('mean')


In [38]:
# Per-movie average worldwide box office across its comparable titles.
# pandas' 'mean' skips missing grosses, so every row of a movie receives the
# average of that movie's known values (NaN only when none of its comps has one).
# Filtering to non-null rows first would leave rows without a gross unassigned.
log_comps['average_boxoffice'] = log_comps.groupby(['Sundance Movie'])['box_office_ww'].transform('mean')

In [40]:
# Fill missing averages from the same movie's own value: 'first' returns the first
# non-null entry within each 'Sundance Movie' group and broadcasts it to all its rows.
# A frame-wide backfill would copy the next row's value regardless of which movie
# it belongs to, and fillna(method=...) is deprecated in recent pandas.
log_comps['average_boxoffice'] = (
    log_comps.groupby(['Sundance Movie'])['average_boxoffice'].transform('first')
)

In [43]:
# Write the comps with their per-movie averages to CSV (index included as first column).
log_comps.to_csv('logline Comps.csv')

# Holiday

In [66]:
# Load the holiday talent list and normalise one cast-name spelling
# ('Remi Hii' -> 'Remy Hii') before it is used as a join key.
holiday = pd.read_csv('Holiday Movie Talent List.csv')
_cast_names = holiday['Cast']
holiday['Cast'] = _cast_names.str.replace('Remi Hii', 'Remy Hii')

In [67]:
# Attach the 2021, 2020, overall and average rankings to the holiday talent list,
# matching holiday['Cast'] against contributor_name (left join, unmatched -> NaN).
#
# Each merge brings along its own copy of the right-hand key. The first two copies
# end up as contributor_name_x / contributor_name_y, which the drop cell below removes.
# The copies from the last two merges are dropped immediately: left in place, the
# fourth merge would generate contributor_name_x / _y a second time, and pandas >= 2.0
# rejects suffixes that create duplicate column labels with a MergeError.
holiday = pd.merge(holiday, imdb_cast_2021,
                   left_on = ['Cast'], right_on = ['contributor_name'], how = 'left')
holiday = pd.merge(holiday, imdb_cast_2020,
                   left_on = ['Cast'], right_on = ['contributor_name'], how = 'left')
holiday = pd.merge(holiday, imdb_cast_overall,
                   left_on = ['Cast'], right_on = ['contributor_name'], how = 'left')\
            .drop(columns = ['contributor_name'])
holiday = pd.merge(holiday, imdb_cast_average,
                   left_on = ['Cast'], right_on = ['contributor_name'], how = 'left')\
            .drop(columns = ['contributor_name'])

In [70]:
# Remove the redundant join-key copies left behind by the ranking merges.
holiday.drop(columns=['contributor_name_x', 'contributor_name_y'], inplace=True)

In [72]:
# Write the ranked holiday talent list to CSV (index included as the first column).
holiday.to_csv('Holiday Movie Talent List Ranking.csv')