In [1]:
import os
import sys
import logging
import boto3
import itertools as it
import io
from utils import *
import snowflake.connector

In [2]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import datetime
from datetime import timedelta
import scipy.stats as st

In [3]:
# Notebook display settings: render up to 1000 rows and never truncate columns.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = None
# Switch off pandas' chained-assignment check (its default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [4]:
class SnowflakeConnector(BaseConnector):
    """Opens Snowflake connections using secrets taken from a credentials object."""

    def __init__(self, credentials: Credentials):
        """Decode the JSON secret payload provided by ``credentials``.

        ``credentials.get_keys()`` is expected to return a mapping; its
        ``'SecretString'`` entry (``"{}"`` when absent) is parsed with
        ``json.loads`` and stored on ``self._secrets``.
        """
        secret_payload = credentials.get_keys().get('SecretString', "{}")
        self._secrets = json.loads(secret_payload)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Create and return a ``snowflake.connector`` connection.

        Args:
            dbname: Database passed to the connector as ``database``.
            schema: Schema passed to the connector; defaults to ``'DEFAULT'``.

        Raises:
            KeyError: if the stored secrets lack ``login_name``,
                ``login_password``, ``account`` or ``warehouse``.
        """
        secrets = self._secrets
        connection_kwargs = {
            'user': secrets['login_name'],
            'password': secrets['login_password'],
            'account': secrets['account'],
            'warehouse': secrets['warehouse'],
            'database': dbname,
            'schema': schema,
        }
        return snowflake.connector.connect(**connection_kwargs)
    
## Credentials
# Identifier handed to SSMPSCredentials to locate the Snowflake secrets.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection
# `ctx` is the module-level connection that run_query() reads.
conn = SnowflakeConnector(credentials=SSMPSCredentials(SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="DATASCIENCE_STAGE")

In [5]:
def run_query(query):
    """Run ``query`` on the module-level connection ``ctx`` and return a DataFrame.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        pandas.DataFrame holding every fetched row, with column names taken
        from ``cursor.description`` and lower-cased.

    The cursor is always closed, even when executing or fetching raises, so
    repeated calls from the notebook do not leave open cursors behind.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        df = pd.DataFrame(cursor.fetchall(), columns=[desc[0] for desc in cursor.description])
        df.columns = df.columns.str.lower()
        return df
    finally:
        cursor.close()

In [8]:
# S3 resource and the bucket that the loading loop lists.
s3 = boto3.resource('s3')
bucket = s3.Bucket('hbo-ingest-datascience-content-dev')

# Root logger used for progress messages while inputs are loaded.
logger = logging.getLogger()
logger.info('Loading inputs')

# Collects one DataFrame per input file, in listing order.
data_list = list()

In [9]:
# match_id_platform values removed from every feature table:
#   * the old Mortal Kombat movie, because the trailer percent-view matching
#     matches the trailer of the new movie to the old movie
#   * Tom & Jerry, due to an unresolvable data issue
EXCLUDED_MATCH_IDS = [
    '1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7', '0-GYEb9QwLgFF9_ZwEAAAA7',
]


def _clean_feature_frame(frame):
    """Normalise one freshly read feature table.

    Lower-cases the column names, drops columns that are entirely null,
    removes the rows listed in EXCLUDED_MATCH_IDS and resets the index.
    Raises KeyError if the table has no ``match_id_platform`` column.
    """
    frame.columns = frame.columns.str.lower()
    frame = frame.dropna(axis='columns', how='all')
    keep_rows = ~frame['match_id_platform'].isin(EXCLUDED_MATCH_IDS)
    return frame.loc[keep_rows, :].reset_index(drop=True)


for obj in bucket.objects.filter(Prefix='input_percent_view'):
    key = obj.key

    # "<folder>/<name>.<ext>" -> "<name>"; the name becomes a notebook variable.
    path_parts = key.split('.')[0].split('/')
    if len(path_parts) < 2 or not path_parts[1].isidentifier():
        # E.g. a folder placeholder or a key without a folder: nothing to load.
        logger.warning('Skipping S3 key without a usable table name: {}'.format(key))
        continue
    var_name = path_parts[1]

    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    print('Reading {0} features'.format(var_name))

    # The NA marker is the literal text backslash-backslash-N, exactly what the
    # previous exec-based code passed to read_csv.
    # NOTE(review): confirm the export really writes two backslashes, not "\N".
    feature_frame = _clean_feature_frame(pd.read_csv(body, na_values=[r'\\N']))

    # Bind the table under its file name (as the exec-based version did)
    # without executing generated source code.
    globals()[var_name] = feature_frame

    # append the feature df
    data_list.append(feature_frame)

Reading funnel_metric_feature features
Reading media_cost_postlaunch_feature features
Reading media_cost_prelaunch_feature features
Reading metadata_feature features
Reading prelaunch_trailer_feature features
Reading prelaunch_trailer_feature_before28 features
Reading sub_total_feature features
Reading trailer_feature features
Reading vtp_feature features
Reading wiki_view_feature_before28 features
Reading wiki_view_post_feature features
Reading wiki_view_pre_feature features


In [46]:
# Take the funnel and metadata tables by their position in data_list
# (first and fourth file in the "Reading ... features" listing).
# NOTE(review): this relies on the listing order staying the same.
funnel_metric_feature, metadata_feature = data_list[0], data_list[3]

In [13]:
# Keep only the title key and the day 1 / 4 / 7 / 28 percent-viewed columns.
funnel_metric_feature = funnel_metric_feature.loc[
    :,
    [
        'match_id_platform',
        'day001_percent_viewed',
        'day004_percent_viewed',
        'day007_percent_viewed',
        'day028_percent_viewed',
    ],
]

In [15]:
# Keep titles whose day-28 percent viewed is above 0.01.
funnel_metric_feature = funnel_metric_feature.loc[
    funnel_metric_feature['day028_percent_viewed'].gt(0.01)
]

In [17]:
# Number of distinct match_id_platform values (a missing value counts as one).
funnel_metric_feature['match_id_platform'].nunique(dropna=False)

1219

In [18]:
# Keep only the rows whose platform_name equals 1.
metadata_feature = metadata_feature.loc[metadata_feature['platform_name'].eq(1)]

In [20]:
# Seed every row with the placeholder label 'None' (the string, not Python's
# None); the following cells overwrite it for rows that match their rules.
metadata_feature['category'] = 'None'

In [22]:
# Preview the first two metadata rows.
metadata_feature.head(n=2)

Unnamed: 0,title_name,match_id,match_id_platform,season_number_adj,earliest_offered_timestamp,platform_name,program_type,content_category,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,licensor_agg,descriptive_genre_desc_agg,wm_enterprise_genres_agg,navigation_genre_desc_agg,category
0,VICE Special Report: Killing Cancer,GV-54_wnG7qbCwwEAAAAY,1-GV-54_wnG7qbCwwEAAAAY,-1,2020-05-27 07:00:00.000,1,original,movies,1,0,0,3,0.643611,2015,5.249315,hbo original,documentary,originals | news/talk | documentary,documentaries| news/talk| originals,
1,VICE Special Report: Countdown to Zero,GV-55HghKlqbCwwEAAAAb,1-GV-55HghKlqbCwwEAAAAb,-1,2020-05-27 07:00:00.000,1,original,movies,1,0,0,3,0.815972,2015,4.490411,hbo original,documentary,originals | news/talk | documentary,documentaries| news/talk| originals,


In [24]:
# Label original series offered after '2020-05-27' whose
# dayofweek_earliest_date is 4 as 'SERIES - THURS'.
# NOTE(review): the date test compares against the text '2020-05-27' —
# confirm earliest_offered_timestamp is stored in a sortable format.
metadata_feature.loc[
    metadata_feature['program_type'].eq('original')
    & metadata_feature['earliest_offered_timestamp'].gt('2020-05-27')
    & metadata_feature['dayofweek_earliest_date'].eq(4)
    & metadata_feature['content_category'].eq('series'),
    'category',
] = 'SERIES - THURS'

In [25]:
# Label original series offered after '2020-05-27' whose
# dayofweek_earliest_date is anything other than 4 as 'SERIES - NON-THURSDAY'.
metadata_feature.loc[
    metadata_feature['program_type'].eq('original')
    & metadata_feature['earliest_offered_timestamp'].gt('2020-05-27')
    & metadata_feature['dayofweek_earliest_date'].ne(4)
    & metadata_feature['content_category'].eq('series'),
    'category',
] = 'SERIES - NON-THURSDAY'

In [26]:
# Label every movie offered after '2020-05-27' as 'MOVIE', whatever its
# program_type.
metadata_feature.loc[
    metadata_feature['earliest_offered_timestamp'].gt('2020-05-27')
    & metadata_feature['content_category'].eq('movies'),
    'category',
] = 'MOVIE'

In [27]:
# Preview the first five metadata rows, now including the category label.
metadata_feature.head(n=5)

Unnamed: 0,title_name,match_id,match_id_platform,season_number_adj,earliest_offered_timestamp,platform_name,program_type,content_category,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,licensor_agg,descriptive_genre_desc_agg,wm_enterprise_genres_agg,navigation_genre_desc_agg,category
0,VICE Special Report: Killing Cancer,GV-54_wnG7qbCwwEAAAAY,1-GV-54_wnG7qbCwwEAAAAY,-1,2020-05-27 07:00:00.000,1,original,movies,1,0,0,3,0.643611,2015,5.249315,hbo original,documentary,originals | news/talk | documentary,documentaries| news/talk| originals,MOVIE
1,VICE Special Report: Countdown to Zero,GV-55HghKlqbCwwEAAAAb,1-GV-55HghKlqbCwwEAAAAb,-1,2020-05-27 07:00:00.000,1,original,movies,1,0,0,3,0.815972,2015,4.490411,hbo original,documentary,originals | news/talk | documentary,documentaries| news/talk| originals,MOVIE
2,VICE Special Report: Fixing the System,GV-55Hwvj4IaDwwEAAAAY,1-GV-55Hwvj4IaDwwEAAAAY,-1,2020-05-27 07:00:00.000,1,original,movies,1,0,0,3,1.120278,2015,4.668493,hbo original,documentary,originals | news/talk | documentary,documentaries| news/talk| originals,MOVIE
3,VICE Special Report: Fighting ISIS,GV-55JASHjruwgwEAAAAU,1-GV-55JASHjruwgwEAAAAU,-1,2020-05-27 07:00:00.000,1,original,movies,1,0,0,3,0.776667,2016,4.323288,hbo original,documentary,originals | news/talk | documentary,documentaries| news/talk| originals,MOVIE
4,Heartbreakers,GV-BGJA11hsLDwwEAAAAZ,1-GV-BGJA11hsLDwwEAAAAZ,-1,2020-09-01 10:00:00.000,1,acquired,movies,1,0,0,2,1.993056,2001,19.457534,mgm domestic television distribution llc,comedy|crime,crime | comedy | romance,comedy | crime | romance,MOVIE


In [30]:
# Attach category and title to each funnel row; the inner join keeps only the
# match_id_platform values present in both tables.
feature_metric = funnel_metric_feature.merge(
    metadata_feature[['match_id_platform', 'category', 'title_name']],
    how='inner',
    on='match_id_platform',
)

In [33]:
# Preview the first five rows of the merged table.
feature_metric.head(n=5)

Unnamed: 0,match_id_platform,day001_percent_viewed,day004_percent_viewed,day007_percent_viewed,day028_percent_viewed,category,title_name,day1,day4,day7
0,1-GV-BKigrJWcJMwwEAAABi,0.000846,0.00276,0.004772,0.013082,MOVIE,Matchstick Men,0.064669,0.210977,0.364776
1,1-GV1sAaA2CKMLDwwEAAAD7,0.000861,0.003405,0.005744,0.015485,MOVIE,The Aviator,0.055602,0.21989,0.37094
2,1-GV4z_-gbJc7fDNQEAAAAj,0.00108,0.003797,0.006132,0.014282,MOVIE,The Adventures of Sharkboy and Lavagirl,0.07562,0.265859,0.429352
3,1-GV7tzLgeWEoORngEAAACT,0.001394,0.004518,0.007646,0.019765,MOVIE,28 Days Later,0.070529,0.228586,0.386845
4,1-GV7tzQwWPW8PCHAEAAABT,0.002688,0.008292,0.012439,0.02877,MOVIE,Drumline,0.093431,0.288217,0.43236


In [32]:
# Express the day 1 / 4 / 7 percent viewed as a fraction of the day-28 value.
for ratio_col, source_col in (
    ('day1', 'day001_percent_viewed'),
    ('day4', 'day004_percent_viewed'),
    ('day7', 'day007_percent_viewed'),
):
    feature_metric[ratio_col] = feature_metric[source_col].div(
        feature_metric['day028_percent_viewed']
    )

In [34]:
# Per-category averages of the numeric columns.
# numeric_only=True is required because the frame also holds text columns
# (match_id_platform, title_name): pandas >= 2.0 raises a TypeError instead of
# silently dropping them, which is what the displayed result relied on.
feature_metric.groupby(['category']).mean(numeric_only=True)

Unnamed: 0_level_0,day001_percent_viewed,day004_percent_viewed,day007_percent_viewed,day028_percent_viewed,day1,day4,day7
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MOVIE,0.01092,0.024462,0.030555,0.04671,0.171553,0.420489,0.566722
,0.012581,0.021209,0.024587,0.028874,0.344182,0.620565,0.751307
SERIES - NON-THURSDAY,0.013823,0.023421,0.030134,0.040059,0.383938,0.656958,0.796205
SERIES - THURS,0.008898,0.023393,0.030205,0.04832,0.161023,0.463082,0.612811


In [35]:
# Averages of the numeric columns over every row whose category contains
# 'SERIES' (both series labels combined).
# numeric_only=True: the frame holds text columns, and pandas >= 2.0 raises a
# TypeError rather than skipping them.
feature_metric[feature_metric['category'].str.contains('SERIES')].mean(numeric_only=True)

day001_percent_viewed    0.011860
day004_percent_viewed    0.023410
day007_percent_viewed    0.030162
day028_percent_viewed    0.043351
day1                     0.295095
day4                     0.579689
day7                     0.723113
dtype: float64

## Actuals

In [45]:
# Find metadata rows whose title contains the literal text 'Gilded'.
# na=False treats a missing title as "no match" so the boolean mask never
# contains NaN (which would make the row selection raise); regex=False
# matches the text literally.
metadata_feature[metadata_feature['title_name'].str.contains('Gilded', na=False, regex=False)]

Unnamed: 0,title_name,match_id,match_id_platform,season_number_adj,earliest_offered_timestamp,platform_name,program_type,content_category,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,licensor_agg,descriptive_genre_desc_agg,wm_enterprise_genres_agg,navigation_genre_desc_agg,category
10453,The Gilded Age S1,GYZLijQsvoJKWWQEAAAB6-1,1-GYZLijQsvoJKWWQEAAAB6-1,1,2022-01-25 02:00:00.000,1,original,series,0,0,1,2,4.613287,2022,1.665753,hbo original,drama,drama | originals,drama | original,SERIES - NON-THURSDAY


In [47]:
# Show the funnel row for the match_id_platform listed for
# 'The Gilded Age S1' in the metadata lookup.
funnel_metric_feature.loc[
    funnel_metric_feature['match_id_platform'].eq('1-GYZLijQsvoJKWWQEAAAB6-1')
]

Unnamed: 0,match_id_platform,max_days_since_first_offered,day001_percent_viewed,day002_percent_viewed,day003_percent_viewed,day004_percent_viewed,day005_percent_viewed,day006_percent_viewed,day007_percent_viewed,day008_percent_viewed,day009_percent_viewed,day010_percent_viewed,day011_percent_viewed,day012_percent_viewed,day013_percent_viewed,day014_percent_viewed,day015_percent_viewed,day016_percent_viewed,day017_percent_viewed,day018_percent_viewed,day019_percent_viewed,day020_percent_viewed,day021_percent_viewed,day022_percent_viewed,day023_percent_viewed,day024_percent_viewed,day025_percent_viewed,day026_percent_viewed,day027_percent_viewed,day028_percent_viewed
10406,1-GYZLijQsvoJKWWQEAAAB6-1,23,0.022603,0.03346,0.040894,0.046569,0.053385,0.058523,0.060779,0.065142,0.068482,0.071219,0.073328,0.075713,0.078085,0.079556,0.082303,0.084734,0.086333,0.087371,0.088611,0.089665,0.090415,0.09149,0.092781,,,,,


In [55]:
# Load the day 28 / 31 / 140 percent-view sheet; the path is relative, so the
# file must sit in the notebook's working directory.
day140_metric = pd.read_csv(
    filepath_or_buffer='Day28, 31, 140 %view of Max bigger titles - 2022-02-23.csv'
)

In [58]:
# Preview the first five rows of the day-140 sheet.
day140_metric.head(n=5)

Unnamed: 0,title_name,content_category,day028_pct_view,day031_pct_view,day140_pct_view,growth_ratio,long_term_growth_ind
0,Tenet,movies,15.5%,16.1%,19.4%,1.21,1
1,Greenland,movies,14.5%,15.0%,18.8%,1.26,1
2,The Flight Attendant S1,series,18.5%,17.3%,17.8%,1.03,1
3,Kong: Skull Island,movies,6.0%,6.8%,17.6%,2.58,1
4,Mare of Easttown S1,series,9.7%,10.4%,17.0%,1.64,1


In [57]:
# Lower-case the column names, matching how the other tables are normalised.
day140_metric.columns = day140_metric.columns.str.lower()

In [61]:
# Join the day-140 sheet onto the funnel metrics by title name (inner join).
# NOTE(review): title_name is the only shared key here; if two titles share a
# name the join will produce extra rows — confirm names are unique.
feature_metric_140 = feature_metric.merge(day140_metric, how='inner', on='title_name')

In [75]:
# Keep rows whose day-140 percent view is a number above -100; rows where it
# is missing are dropped.
# The column arrives from the CSV as text such as '19.4%' and is converted to
# float in a different cell, so the comparison is made on a numeric view of
# the column. That way this cell works both before and after that conversion
# (comparing the raw text with -100 raises a TypeError).
feature_metric_140 = feature_metric_140[
    pd.to_numeric(
        feature_metric_140['day140_pct_view'].astype(str).str.strip('%'),
        errors='coerce',
    ) > -100
]

In [72]:
# Turn percent text such as '16.1%' into the float 16.1 for the day-31 and
# day-140 columns. A column that is already numeric is left untouched, so the
# cell can be re-run safely (the .str accessor fails on a float column).
for pct_col in ('day031_pct_view', 'day140_pct_view'):
    if not pd.api.types.is_numeric_dtype(feature_metric_140[pct_col]):
        feature_metric_140[pct_col] = feature_metric_140[pct_col].str.strip('%').astype(float)

In [76]:
# Preview the first five rows of the combined table.
feature_metric_140.head(n=5)

Unnamed: 0,match_id_platform,day001_percent_viewed,day004_percent_viewed,day007_percent_viewed,day028_percent_viewed,category,title_name,day1,day4,day7,content_category,day028_pct_view,day031_pct_view,day140_pct_view,growth_ratio,long_term_growth_ind
0,1-GV-BKigrJWcJMwwEAAABi,0.000846,0.00276,0.004772,0.013082,MOVIE,Matchstick Men,0.064669,0.210977,0.364776,movies,1.3%,1.4,1.3,0.96,0
1,1-GV1sAaA2CKMLDwwEAAAD7,0.000861,0.003405,0.005744,0.015485,MOVIE,The Aviator,0.055602,0.21989,0.37094,movies,1.5%,1.6,1.6,1.0,1
2,1-GV4z_-gbJc7fDNQEAAAAj,0.00108,0.003797,0.006132,0.014282,MOVIE,The Adventures of Sharkboy and Lavagirl,0.07562,0.265859,0.429352,movies,1.4%,1.5,2.4,1.57,1
3,1-GV7tzLgeWEoORngEAAACT,0.001394,0.004518,0.007646,0.019765,MOVIE,28 Days Later,0.070529,0.228586,0.386845,movies,2.0%,2.1,3.0,1.44,1
4,1-GV7tzQwWPW8PCHAEAAABT,0.002688,0.008292,0.012439,0.02877,MOVIE,Drumline,0.093431,0.288217,0.43236,movies,2.9%,3.0,3.5,1.18,1


In [81]:
# Day-31 and day-140 values relative to day 28. The sheet's columns hold
# percent numbers (e.g. 16.1) while day028_percent_viewed is a fraction,
# hence the extra division by 100.
feature_metric_140['day31'] = (
    feature_metric_140['day031_pct_view']
    .div(feature_metric_140['day028_percent_viewed'])
    .div(100)
)
feature_metric_140['day140'] = (
    feature_metric_140['day140_pct_view']
    .div(feature_metric_140['day028_percent_viewed'])
    .div(100)
)

In [82]:
# Per-category averages of the numeric columns, including the day-31 and
# day-140 ratios.
# numeric_only=True is required because the frame still holds text columns
# (e.g. title_name, content_category, day028_pct_view): pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
feature_metric_140.groupby(['category']).mean(numeric_only=True)

Unnamed: 0_level_0,day001_percent_viewed,day004_percent_viewed,day007_percent_viewed,day028_percent_viewed,day1,day4,day7,day031_pct_view,day140_pct_view,growth_ratio,long_term_growth_ind,day31,day140
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
MOVIE,0.012663,0.027394,0.03382,0.050434,0.176313,0.430339,0.577991,5.105143,5.58,1.271057,0.711429,1.059706,1.351275
,0.012878,0.021548,0.024965,0.028702,0.351888,0.627198,0.760504,2.883158,3.498947,1.411053,0.8,1.016699,1.440319
SERIES - NON-THURSDAY,0.013661,0.022737,0.029758,0.037368,0.40913,0.677426,0.821231,3.839706,4.482353,1.334559,0.602941,1.024325,1.374459
SERIES - THURS,0.00536,0.015346,0.02054,0.034488,0.154673,0.460032,0.614424,3.490244,3.487805,1.000976,0.439024,1.019715,1.026042


In [83]:
# Averages of the numeric columns over every row whose category contains
# 'SERIES' (both series labels combined).
# numeric_only=True: the frame holds text columns, and pandas >= 2.0 raises a
# TypeError rather than skipping them.
feature_metric_140[feature_metric_140['category'].str.contains('SERIES')].mean(numeric_only=True)

day001_percent_viewed    0.010539
day004_percent_viewed    0.019957
day007_percent_viewed    0.026291
day028_percent_viewed    0.036284
day1                     0.313417
day4                     0.595654
day7                     0.743441
day031_pct_view          3.708257
day140_pct_view          4.108257
growth_ratio             1.209083
long_term_growth_ind     0.541284
day31                    1.022591
day140                   1.243403
dtype: float64