In [2]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

In [3]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [4]:
pd.options.display.float_format = '{:,.2f}'.format

### 0.2 : Connection to Snowflake

In [5]:
from abc import ABCMeta, abstractmethod

class Credentials(metaclass=ABCMeta):
    pass
    
    
class SSMPSCredentials(Credentials):
    def __init__(self, secretid: str):
        self._secretid = secretid
        self._secrets = {}
        
    def get_keys(self):
        """
        credential fetching 
        """
        _aws_sm_args = {'service_name': 'secretsmanager', 'region_name': 'us-east-1'}
        secrets_client = boto3.client(**_aws_sm_args)
        get_secret_value_response = secrets_client.get_secret_value(SecretId=self._secretid)
        return get_secret_value_response
    
    
class BaseConnector(metaclass=ABCMeta):
    @abstractmethod
    def connect(self):
        raise NotImplementedError
    

class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx
    
## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_DEV","WORKSPACE")

def run_query(query):
    cursor = ctx.cursor()
    cursor.execute(query)
    df = pd.DataFrame(cursor.fetchall(), columns = [desc[0] for desc in cursor.description])
    df.columns= df.columns.str.lower()
    return df



In [6]:
genre_medal = run_query('''
select max_prod.content_analytics.asset_medals.TITLE_ID, TITLE, OFFERING_START_DATE, OFFERING_END_DATE, PROGRAMMING_PROVIDED_GENRE, HOME_TERRITORY_OBSERVED_MEDAL, PILLAR_GENRES
from max_prod.content_analytics.asset_medals
inner join max_prod.catalog.vw_title_genres on max_prod.content_analytics.asset_medals.TITLE_ID=max_prod.catalog.vw_title_genres.TITLE_ID
where PROGRAMMING_PROVIDED_GENRE = 'Pay1'
''')

In [18]:
launch_date = run_query('''
--*************************************************** USE THIS
-- building new table off of TAD
with tentpole_ids as (
select stg.asset_title_long
  , offering_start_date as offering_start_date --limiting returning ID,
  , stg.viewable_id
  , rank() over (partition by stg.asset_title_long order by offering_start_date) as rank
from max_prod.catalog.tentpole_asset_offering_dim tad
inner join max_prod.staging.tentpole_titles_stg stg
  on stg.viewable_id=tad.viewable_id
  and stg.territory = tad.territory
where stg.territory = 'HBO MAX DOMESTIC'
  
group by 1,2,3
)
select   
    tid.asset_title_long
    , tid.viewable_id as title_id
    , map.imdb_id as imdb_id
    , stg.theatrical_release_date
    ,to_date(coalesce(pop.theatrical_release_date, tid.offering_start_date)) as offering_start_date --applying popcorn release dates overridding Pay-1 offering dates for popcorn titles
    ,(case when pop.theatrical_release_date is not null
        then datediff(day,pop.theatrical_release_date,stg.theatrical_release_date) --calculate window with new applied date (should be 0 for popcorn)
      else days_from_theatrical end) as days_from_theatrical1
    ,datediff(days,stg.theatrical_release_date,'2020-05-20') as days_from_max_launch --recalculating windows from titles that premiered before MAX launch
    ,(case when days_from_theatrical1 = 0 then 0
     when days_from_max_launch<=0 then days_from_theatrical1
     else days_from_max_launch end) as days_from_theatrical2
    ,days_on_hbo_max
    ,cumulative_viewing_subs
    ,cumulative_first_views
    ,area_gross
from tentpole_ids tid
inner join max_prod.staging.tentpole_titles_stg stg
    on tid.viewable_id=stg.viewable_id
    and stg.territory = 'HBO MAX DOMESTIC'
    and stg.theatrical_release_date is not null
left join max_prod.catalog.popcorn_titles pop 
    on pop.viewable_id=stg.viewable_id
left join enterprise_data.catalog.wm_wb_imdb_mapping map --vid to imdb_id
    on map.viewable_id=stg.viewable_id
    and studio = 'HBO'
left join enterprise_data.catalog.imdb_boxoffice_opening_weekends bo --title_id to imdb_id
    on bo.title_id=map.imdb_id
    and (area_code='XDOM' or area_code is null)
left join max_prod.content_analytics.new_title_release_days_on_platform dop 
    on tid.viewable_id = dop.title_id
where area_gross is not null
    and rank = 1
    and title_level = 'Series and Movies'
    and stg.is_pay_1 =1
    and days_on_hbo_max = 90
group by 1,2,3,4,5,6,7,8,9,10,11, 12
order by 2 desc
''')

In [19]:
launch_date.tail(3)

Unnamed: 0,asset_title_long,title_id,imdb_id,theatrical_release_date,offering_start_date,days_from_theatrical1,days_from_max_launch,days_from_theatrical2,days_on_hbo_max,cumulative_viewing_subs,cumulative_first_views,area_gross
148,The Call of the Wild,GX1ozeARCl6vDcwEAAAF9,tt7504726,2019-12-25,2020-11-29,312,147,147,90,920407,26197,24791624
149,Underwater,GX1oymw295cLCTQEAAAFt,tt5774060,2020-01-10,2020-11-22,296,131,131,90,1371037,34513,7008297
150,Dolittle,GX1oycwoNEsMslAEAAAAI,tt6673612,2020-01-17,2020-11-15,289,124,124,90,1501877,37595,21844045


In [14]:
genre_medal.head()

Unnamed: 0,title_id,title,offering_start_date,offering_end_date,programming_provided_genre,home_territory_observed_medal,pillar_genres
0,GYPXEhQg3DFbCwwEAAADm,News of the World,2021-09-05,2022-12-01,Pay1,Silver,Action/Adventure
1,GXpTQtgV6ax-auwEAABc3,The Good Liar,2020-06-14,2021-03-01,Pay1,Bronze,Drama
2,GYK0lqgy8KZiWvwEAAAAH,Shiva Baby,2021-07-08,2024-07-01,Pay1,Bronze,Comedy
3,GYbzvWABDj8LDwgEAAAAp,The Last Duel,2022-01-14,2022-09-01,Pay1,Gold,Drama
4,GYm0w5wSRXaTDYQEAAAD4,Last Night in Soho,2022-07-01,2023-10-01,Pay1,Silver,Suspense/Thriller


In [24]:
movie_hbo = pd.merge(genre_medal[['title_id', 'title', 'home_territory_observed_medal', 'pillar_genres']],
                    launch_date[['title_id','imdb_id', 'theatrical_release_date', 'offering_start_date', 'days_from_theatrical2', 
                                 'days_from_max_launch', 'cumulative_viewing_subs', 'cumulative_first_views', 'area_gross']],
                    on = 'title_id', how = 'inner')

In [26]:
# Catigorical feature 
category_onehot = pd.get_dummies(movie_hbo['pillar_genres'], prefix='genre')
movie_hbo=pd.concat([movie_hbo, category_onehot], axis = 1)

In [30]:
# Medal Data
medal_dict = {'Silver':2, 'Bronze':3, 'Gold':1 , 'Platinum':0}
movie_hbo['medal_number'] = movie_hbo['home_territory_observed_medal'].replace(medal_dict)

# Correlation

In [None]:
import matplotlib.pyplot as plt 

In [33]:
data = movie_hbo.copy()

In [34]:
data.head()

Unnamed: 0,title_id,title,home_territory_observed_medal,pillar_genres,imdb_id,theatrical_release_date,offering_start_date,days_from_theatrical2,days_from_max_launch,cumulative_viewing_subs,cumulative_first_views,area_gross,genre_Action/Adventure,genre_Comedy,genre_Drama,genre_Horror,genre_Kids/Family,genre_Romance,genre_Sci-Fi,genre_Suspense/Thriller,medal_number
0,GYPXEhQg3DFbCwwEAAADm,News of the World,Silver,Action/Adventure,tt6878306,2020-12-25,2021-09-05,250,-219,2557041,25362,2250430,1,0,0,0,0,0,0,0,2
1,GXpTQtgV6ax-auwEAABc3,The Good Liar,Bronze,Drama,tt5563334,2019-11-15,2020-06-14,187,187,186115,10519,5605051,0,0,1,0,0,0,0,0,3
2,GYK0lqgy8KZiWvwEAAAAH,Shiva Baby,Bronze,Comedy,tt11317142,2020-03-15,2021-07-08,66,66,307494,1415,7160,0,1,0,0,0,0,0,0,3
3,GYbzvWABDj8LDwgEAAAAp,The Last Duel,Gold,Drama,tt4244994,2021-10-15,2022-01-14,91,-513,5012315,33314,4759151,0,0,1,0,0,0,0,0,1
4,GYm0w5wSRXaTDYQEAAAD4,Last Night in Soho,Silver,Suspense/Thriller,tt9639470,2021-10-29,2022-07-01,245,-527,2005376,8975,4178460,0,0,0,0,0,0,0,1,2


In [41]:
FEATURE_COLS = ['days_from_theatrical2', 'area_gross', 'genre_Action/Adventure ', 'genre_Comedy', 'genre_Drama', 'genre_Horror ', 
                'genre_Kids/Family', 'genre_Romance', 'genre_Sci-Fi', 'genre_Suspense/Thriller', 'medal_number']

In [38]:
TARGET_COL = ['cumulative_viewing_subs'] # cumulative_first_views

In [40]:
data.columns

Index(['title_id', 'title', 'home_territory_observed_medal', 'pillar_genres',
       'imdb_id', 'theatrical_release_date', 'offering_start_date',
       'days_from_theatrical2', 'days_from_max_launch',
       'cumulative_viewing_subs', 'cumulative_first_views', 'area_gross',
       'genre_Action/Adventure ', 'genre_Comedy', 'genre_Drama',
       'genre_Horror ', 'genre_Kids/Family', 'genre_Romance', 'genre_Sci-Fi',
       'genre_Suspense/Thriller', 'medal_number'],
      dtype='object')

In [42]:
plot_data=data[FEATURE_COLS+TARGET_COL]
corr = plot_data.corr()[TARGET_COL]
mask=np.zeros_like(corr, dtype=np.bool)
corr.sort_values(by = TARGET_COL)

Unnamed: 0,cumulative_viewing_subs
medal_number,-0.61
days_from_theatrical2,-0.45
genre_Drama,-0.29
genre_Romance,-0.15
genre_Sci-Fi,-0.1
genre_Horror,-0.1
genre_Comedy,-0.1
genre_Suspense/Thriller,0.02
genre_Kids/Family,0.1
genre_Action/Adventure,0.5


In [43]:
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

NameError: name 'plt' is not defined