In [0]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

In [0]:
# Let pandas render up to 1000 rows and 1000 columns before truncating output.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [0]:
# Show floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

## 1. "The Northman" does not have viewership data --- 26 titles in total are missing it

### 0.2 : Connection to Snowflake

In [0]:
from abc import ABCMeta, abstractmethod

class Credentials(metaclass=ABCMeta):
    """Abstract source of secrets for the connector classes.

    ``SnowflakeConnector`` calls ``get_keys()`` on the credentials object it is
    given, so that method is declared here as the required interface.
    """

    @abstractmethod
    def get_keys(self):
        """Return a mapping holding the secret payload.

        ``SnowflakeConnector`` reads the ``'SecretString'`` entry of the
        returned mapping and parses it as JSON.
        """
        raise NotImplementedError
    
    
class SSMPSCredentials(Credentials):
    """Credentials read from an AWS Secrets Manager secret."""

    def __init__(self, secretid: str, region_name: str = 'us-east-1'):
        """
        Args:
            secretid: Identifier of the secret, passed to ``get_secret_value`` as ``SecretId``.
            region_name: AWS region of the Secrets Manager endpoint.
        """
        self._secretid = secretid
        self._region_name = region_name
        self._secrets = {}

    def get_keys(self):
        """Fetch the secret and return the raw ``get_secret_value`` response.

        Returns:
            The response object from the boto3 Secrets Manager client; callers
            read its ``'SecretString'`` entry.
        """
        secrets_client = boto3.client(
            service_name='secretsmanager',
            region_name=self._region_name,
        )
        return secrets_client.get_secret_value(SecretId=self._secretid)
    
    
class BaseConnector(metaclass=ABCMeta):
    """Abstract base class for database connectors."""

    @abstractmethod
    def connect(self, *args, **kwargs):
        """Open and return a connection.

        Subclasses define the parameters they need (``SnowflakeConnector.connect``
        takes a database name and schema), so the abstract signature accepts
        arbitrary arguments instead of none.
        """
        raise NotImplementedError
    

class SnowflakeConnector(BaseConnector):
    """Opens Snowflake connections using login details read from a credentials object."""

    # Secret fields that connect() reads.
    _REQUIRED_SECRET_FIELDS = ('login_name', 'login_password', 'account', 'warehouse')

    def __init__(self, credentials: Credentials):
        """Load and parse the JSON secret exposed by ``credentials.get_keys()``.

        Args:
            credentials: Object whose ``get_keys()`` returns a mapping with a
                JSON-encoded ``'SecretString'`` entry.
        """
        keys = credentials.get_keys()
        # A missing, None or empty SecretString is treated as an empty secret
        # instead of failing inside json.loads.
        self._secrets = json.loads(keys.get('SecretString') or "{}")

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Open a Snowflake connection to ``dbname``/``schema``.

        Args:
            dbname: Database to connect to.
            schema: Schema to use within the database.

        Returns:
            The connection object returned by ``snowflake.connector.connect``.

        Raises:
            KeyError: If the secret lacks any of the required login fields.
        """
        missing = [field for field in self._REQUIRED_SECRET_FIELDS if field not in self._secrets]
        if missing:
            # Same exception type as the plain dict lookup, but names every missing field.
            raise KeyError(f"Secret is missing required field(s): {', '.join(missing)}")

        return snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
    
# Name of the secret that holds the Snowflake login details.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

# Open the module-level Snowflake connection (`ctx`) that run_query executes against.
conn = SnowflakeConnector(credentials=SSMPSCredentials(secretid=SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="WORKSPACE")

def run_query(query, params=None):
    """Execute ``query`` on the module-level ``ctx`` connection and return the rows as a DataFrame.

    Args:
        query: SQL text to execute.
        params: Optional bind parameters forwarded to ``cursor.execute``.

    Returns:
        pandas.DataFrame with one column per result column; column names are
        lower-cased.
    """
    cursor = ctx.cursor()
    try:
        if params is None:
            cursor.execute(query)
        else:
            cursor.execute(query, params)
        column_names = [desc[0] for desc in cursor.description]
        df = pd.DataFrame(cursor.fetchall(), columns=column_names)
    finally:
        # Release the cursor even when the query fails.
        cursor.close()
    df.columns = df.columns.str.lower()
    return df

In [0]:
original_movies = pd.read_csv('Pay-1 Titles Theatrical Window_v0.2.csv')

In [0]:
original_movies.head()

In [0]:
len(original_movies)

In [0]:
# Title metadata joined to IMDb opening-weekend box office (start_date becomes
# theatrical_date, plus area_gross) and to the IMDb budget table, restricted to the
# listed LATAM geo values and rows with ever_pay_1_title = 1.
# Both box-office joins are inner joins, so titles with no opening-weekend row whose
# area name matches the metadata GEO_VALUE are not returned.
# NOTE(review): the result has one row per matching title/area combination; if the
# budget table can hold several rows per title_id the left join would repeat rows — confirm.
title_info = run_query('''
SELECT m.TITLE_SERIES, m.LEGACY_SERIES_ID, m.CKG_SERIES_ID, m.FIRST_RELEASE_DATE, o.start_date AS THEATRICAL_DATE
, PROGRAM_TYPE, PRIMARY_GENRE, LATAM_HOME_MEDAL
, O.TITLE_ID as imdb_id, c.AREA_NAME, m.GEO_VALUE, o.area_gross ,bdg.AMOUNT as budget
FROM max_prod.content_datascience.all_titles_season_metadata m
JOIN max_prod.ckg.imdb_boxoffice_opening_weekends o 
    ON m.imdb_series_id = o.title_id
JOIN MAX_PROD.CKG.IMDB_AREA_CODES c 
    ON c.AREA_CODE= o.AREA_CODE and UPPER(c.AREA_NAME) = m.GEO_VALUE
left join "MAX_PROD"."CKG".IMDB_BOXOFFICE_TITLE_BUDGET bdg -- budget title_id to imdb_id
    on m.imdb_series_id=bdg.title_id
WHERE m.GEO_VALUE IN ('LATAM', 'BRAZIL', 'ARGENTINA', 'COLOMBIA', 'MEXICO', 'CARIBBEAN', 'REST OF LATAM')
and m.ever_pay_1_title = 1
order by title_series, area_name
''')

In [0]:
title_info.head()

In [0]:
len(title_info.legacy_series_id.unique())

In [0]:
len(title_info)

In [0]:
# title_info['latam_home_medal'] = title_info['latam_home_medal'].replace('No Medal', np.NaN)

# title_info['latam_home_medal'] = title_info.groupby(['ckg_series_id'])['latam_home_medal'].bfill()
# title_info['latam_home_medal'] = title_info.groupby(['ckg_series_id'])['latam_home_medal'].ffill()

# title_info['latam_home_medal'] = title_info['latam_home_medal'].fillna('None')

In [0]:
# Per-title aggregates over the area-level rows: gross is summed, budget takes the
# first non-null value in each group.
area_gross = title_info.groupby('ckg_series_id', as_index=False)['area_gross'].sum()
budget = title_info.groupby('ckg_series_id', as_index=False)['budget'].first()

In [0]:
# Collapse the area-level rows to one row per title key combination, keeping the
# first non-null theatrical_date found in each group.
title_info = title_info.groupby(
    by=[
        'title_series',
        'legacy_series_id',
        'ckg_series_id',
        'imdb_id',
        'program_type',
        'primary_genre',
        'first_release_date',
    ],
    as_index=False,
)['theatrical_date'].first()

In [0]:
# Attach the per-title gross and budget aggregates to the title rows.
title_info = (
    title_info
    .merge(area_gross, how='left', on='ckg_series_id')
    .merge(budget, how='left', on='ckg_series_id')
)

In [0]:
title_info.head()

In [0]:
len(title_info)

In [0]:
# title_info.to_csv('latam_title_release_info.csv')

In [0]:
# Observed medal per title for LATAM, first offering window, measured at day 28 on
# the platform. The CTE expresses each title's cumulative viewing subscribers and
# first views as a percentage of the platform totals for the same start date and day
# count; the outer SELECT buckets those percentages into Platinum / Gold / Silver,
# with everything else labelled Bronze.
# NOTE(review): the percentages divide by platform totals with no zero guard — confirm
# the platform table never holds zeros for these rows.
medal= run_query('''
WITH TEMP AS (
select
WBD_MAX_SERIES_ID_OR_HBOMAX_TITLE_ID AS ckg_series_id, DAYS_ON_HBO_MAX_AND_WBD_MAX as day_on_max,
OFFERING_START_DATE_PST, REQUEST_DATE_PST,  DAYS_ON_HBO_MAX_AND_WBD_MAX, 
c.CUMULATIVE_VIEWING_SUBSCRIBERS/plt.viewing_subscribers*100 as percent_cumulative_viewing_subscribers,
       c.CUMULATIVE_SUBSCRIPTION_FIRST_VIEWS/plt.subscription_first_views*100 as percent_cumulative_first_views
from max_prod.content_analytics.cumulative_content_viewership_pst  c
JOIN max_prod.content_analytics.cumulative_platform_viewership_pst plt
ON to_char(c.OFFERING_START_DATE_PST, 'YYYY-MM-DD') = plt.start_date
AND c.DAYS_ON_HBO_MAX_AND_WBD_MAX = plt.DAYS_SINCE_RELEASE
AND c.GEO_VALUE = plt.GEO_VALUE
where c.geo_value = 'LATAM' 
and day_on_max = 28
and title_level = 'Seasons and Movies'
and OFFERING_WINDOW_NUM = 1
order by DAYS_ON_HBO_MAX_AND_WBD_MAX
)

SELECT ckg_series_id,
    case when percent_cumulative_viewing_subscribers > 20 or percent_cumulative_first_views > 10 then 'Platinum'
		 when percent_cumulative_viewing_subscribers > 10 or percent_cumulative_first_views > 5 then 'Gold'
		 when percent_cumulative_viewing_subscribers > 5 or percent_cumulative_first_views > 2.5 then 'Silver'
		 else 'Bronze'
		 end as observed_medal
FROM TEMP
''')

In [0]:
# Left-join the observed medal onto the title rows; titles without one get NaN.
title_info = pd.merge(title_info, medal, how='left', on='ckg_series_id')

In [0]:
# Earlier approach (disabled): only fill medals that were 'None' from observed_medal.
# title_info.loc[title_info['latam_home_medal'] == 'None', 'latam_home_medal'] = title_info['observed_medal']
# Current approach: use the observed medal for every title.
title_info['latam_home_medal'] = title_info['observed_medal']

In [0]:
# final_title_info = title_info[title_info['observed_medal'].notnull()]

In [0]:
len(title_info)

In [0]:
# LATAM viewership per title for the first offering window at day 90 on the platform:
# raw cumulative viewing subscribers and first views, plus both expressed as a
# percentage of the platform totals for the same start date and day count.
# Note that this is measured at day 90, whereas the medal query uses day 28.
viewership = run_query('''
select
WBD_MAX_SERIES_ID_OR_HBOMAX_TITLE_ID AS ckg_series_id, DAYS_ON_HBO_MAX_AND_WBD_MAX as day_on_max,
c.CUMULATIVE_VIEWING_SUBSCRIBERS, c.CUMULATIVE_SUBSCRIPTION_FIRST_VIEWS as CUMULATIVE_FIRST_VIEWS,
c.CUMULATIVE_VIEWING_SUBSCRIBERS/plt.viewing_subscribers*100 as percent_cumulative_viewing_subscribers,
c.CUMULATIVE_SUBSCRIPTION_FIRST_VIEWS/plt.subscription_first_views*100 as percent_cumulative_first_views
from max_prod.content_analytics.cumulative_content_viewership_pst  c
JOIN max_prod.content_analytics.cumulative_platform_viewership_pst plt
ON to_char(c.OFFERING_START_DATE_PST, 'YYYY-MM-DD') = plt.start_date
AND c.DAYS_ON_HBO_MAX_AND_WBD_MAX = plt.DAYS_SINCE_RELEASE
AND c.GEO_VALUE = plt.GEO_VALUE
where c.geo_value = 'LATAM' 
and title_level = 'Seasons and Movies'
and OFFERING_WINDOW_NUM = 1
and DAYS_ON_HBO_MAX_AND_WBD_MAX = 90
order by DAYS_ON_HBO_MAX_AND_WBD_MAX
''')

In [0]:
# Left-join the viewership columns onto the title rows; titles without viewership keep NaN.
title_info = pd.merge(title_info, viewership, how='left', on='ckg_series_id')

In [0]:
len(title_info.legacy_series_id.unique())

In [0]:
title_info.head()

In [0]:
# Keep only titles that have a positive cumulative viewing-subscriber count.
# .copy() makes the result an independent frame, so the column assignments made on
# final_title_info in later cells do not write into a slice of title_info
# (pandas SettingWithCopyWarning).
final_title_info = title_info[(title_info['cumulative_viewing_subscribers'].notnull())
                             &(title_info['cumulative_viewing_subscribers']>0)].copy()

In [0]:
len(final_title_info.legacy_series_id.unique())

In [0]:
len(final_title_info)

In [0]:
# Titles without an observed medal default to 'Bronze'. assign() builds a new frame
# rather than writing into what may be a filtered slice (avoids SettingWithCopyWarning).
final_title_info = final_title_info.assign(
    latam_home_medal=final_title_info['latam_home_medal'].fillna('Bronze')
)

In [0]:
final_title_info[['latam_home_medal', 'imdb_id', 'area_gross', 'budget', 'theatrical_date']].isnull().sum()/len(final_title_info)

In [0]:
# Whole days between the theatrical date and the first release date on the platform.
# pd.to_datetime is applied to each column as a whole (vectorised) rather than per
# element, and assign() avoids writing into what may be a filtered slice.
final_title_info = final_title_info.assign(
    days_from_theatrical=(
        pd.to_datetime(final_title_info['first_release_date'])
        - pd.to_datetime(final_title_info['theatrical_date'])
    ).dt.days
)

In [0]:
final_title_info.to_csv('latam_title_release_info.csv')

In [0]:
movie_hbo = final_title_info.copy()

In [0]:
movie_hbo.head()

In [0]:
# Categorical feature: one-hot encode primary_genre into genre_<value> columns and
# append them to movie_hbo. Re-running this cell appends the same columns again,
# because concat does not de-duplicate column names.
category_onehot = pd.get_dummies(movie_hbo['primary_genre'], prefix='genre')
movie_hbo=pd.concat([movie_hbo, category_onehot], axis = 1)

In [0]:
# One-hot encode latam_home_medal into medal_<value> columns and append them to
# movie_hbo. Re-running this cell appends the same columns again.
medal_onehot = pd.get_dummies(movie_hbo['latam_home_medal'], prefix='medal')
movie_hbo=pd.concat([movie_hbo, medal_onehot], axis = 1)

In [0]:
# Medal data: ordinal encoding of the medal, 0 = Platinum through 3 = Bronze.
medal_dict = {'Silver':2, 'Bronze':3, 'Gold':1 , 'Platinum':0,}
# .map always yields a numeric column (values missing from medal_dict become NaN),
# whereas .replace can leave an object-dtype column that mixes ints with unmapped
# strings, which the correlation cells cannot use.
movie_hbo['medal_number'] = movie_hbo['latam_home_medal'].map(medal_dict)

In [0]:
movie_hbo.medal_number.unique()

In [0]:
# log2 of the cumulative counts. The counts are cast to float first (np.log2 cannot
# operate on object-dtype values) and non-positive counts are masked to NaN, so they
# do not turn into -inf and poison the correlations computed later.
movie_hbo['log_cumulative_first_views'] = np.log2(
    movie_hbo['cumulative_first_views'].astype(float).where(lambda counts: counts > 0)
)
movie_hbo['log_cumulative_viewing_subs'] = np.log2(
    movie_hbo['cumulative_viewing_subscribers'].astype(float).where(lambda counts: counts > 0)
)

In [0]:
movie_hbo.to_csv('latam_training_data.csv')

# Correlation

In [0]:
import matplotlib.pyplot as plt 
import seaborn as sns

In [0]:
data = movie_hbo.copy()

In [0]:
data['percent_cumulative_viewing_subscribers'] = data['percent_cumulative_viewing_subscribers'].astype(float)

In [0]:
# Feature columns used by the correlation cells.
# NOTE(review): the genre_* and medal_* names are one-hot columns produced by
# pd.get_dummies, so each listed value must actually occur in the data or selecting
# these columns raises KeyError — confirm against the current title set.
FEATURE_COLS = ['days_from_theatrical', 'area_gross', 'budget', 'genre_Action', 'genre_Comedy', 
                'genre_Drama', 'genre_Fantasy & Sci-Fi', 'genre_Horror', 'genre_Kids & Family',
                'medal_number', 'medal_Bronze', 'medal_Gold', 'medal_Platinum', 'medal_Silver',
               ]

In [0]:
TARGET_COL = ['cumulative_viewing_subscribers'] # cumulative_first_views

In [0]:
movie_hbo.head()

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [0]:
TARGET_COL = ['log_cumulative_viewing_subs'] # cumulative_first_views

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [0]:
TARGET_COL = ['log_cumulative_first_views']

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [0]:
TARGET_COL = ['cumulative_first_views'] # cumulative_first_views

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

## Only movies after launch

In [0]:
# NOTE(review): no cell visible in this notebook adds a 'days_from_max_launch' column
# to movie_hbo, so this filter presumably raises KeyError on a fresh run — confirm
# where the column is meant to come from.
# NOTE(review): the section heading says "after launch" but the filter keeps negative
# values — confirm the sign convention of the column.
data = movie_hbo[movie_hbo['days_from_max_launch'] < 0].copy()

In [0]:
# No visible cell creates a 'pct_actives' column; the percentage column produced by
# the viewership query is 'percent_cumulative_viewing_subscribers', which is what the
# matching cell in the first correlation section casts to float.
# NOTE(review): confirm this is the column 'pct_actives' was meant to refer to.
data['percent_cumulative_viewing_subscribers'] = data['percent_cumulative_viewing_subscribers'].astype(float)

In [0]:
# Number of titles in the filtered frame. The frame has no 'title_id' column in the
# visible code; the title key carried through the notebook is 'ckg_series_id'.
# NOTE(review): confirm 'ckg_series_id' is the intended key.
data['ckg_series_id'].count()

In [0]:
# The column produced by the viewership query is 'cumulative_viewing_subscribers';
# 'cumulative_viewing_subs' does not exist and would raise KeyError in the next cell.
TARGET_COL = ['cumulative_viewing_subscribers']

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [0]:
TARGET_COL = ['log_cumulative_first_views']

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
# corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [0]:
TARGET_COL = ['cumulative_first_views'] # cumulative_first_views

In [0]:
# Correlation of every feature (and the target itself) with the current target column.
plot_data=data[FEATURE_COLS+TARGET_COL]
# .copy() so the sentinel row added below is not written into a slice of the full matrix.
corr = plot_data.corr()[TARGET_COL].copy()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)
# Sentinel row fixed at -1 (presumably to anchor the low end of the heatmap colour scale — TODO confirm).
corr.loc['dummy_value'] = -1
corr.sort_values(by = TARGET_COL)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
# NOTE(review): this palette (220, 8, as_cmap=False) differs from the other heatmaps in
# this notebook, which use (220, 10, as_cmap=True) — confirm whether that is intended.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 8, as_cmap=False),
            square=True, ax=ax)

### correlation between features

In [0]:
# FEATURE_COLS = ['days_from_theatrical','area_gross','budget',
#  'genre_action',
#  'genre_comedy',
#  'genre_crime',
#  'genre_drama',
#  'genre_fantasy & sci-fi',
#  'genre_horror',
#  'medal_number',
#  'is_direct_to_max_title']

In [0]:
# Pairwise correlation matrix of the feature columns.
plot_data=data[FEATURE_COLS]
corr = plot_data.corr()
# np.bool was removed in NumPy 1.24; the builtin bool is the supported dtype.
mask=np.zeros_like(corr, dtype=bool)

In [0]:
f, ax = plt.subplots(figsize=(10, 8))
# All-False mask (no cell hidden); dtype=bool because np.bool was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [0]:
movie_hbo.to_csv('latam_training_data.csv')