In [1]:
import os
import sys
import logging
import boto3
import itertools as it
import io
from utils import *
import snowflake.connector

In [2]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import datetime
from datetime import timedelta
import scipy.stats as st

In [3]:
# Notebook-wide pandas settings: render up to 1000 rows and every column
# whenever a DataFrame is displayed.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = None
# Switch off the chained-assignment check (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [4]:
class SnowflakeConnector(BaseConnector):
    """Connector that opens Snowflake sessions from secrets held by a credentials object."""

    def __init__(self, credentials: Credentials):
        """Load the connection secrets.

        ``credentials.get_keys()`` must return a mapping; the JSON text stored
        under its ``'SecretString'`` entry is parsed into ``self._secrets``.
        A missing entry results in an empty secrets dict.
        """
        secret_payload = credentials.get_keys()
        raw_secret = secret_payload.get('SecretString', "{}")
        self._secrets = json.loads(raw_secret)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Open and return a Snowflake connection for ``dbname`` / ``schema``.

        User, password, account and warehouse are read from the parsed secrets
        (keys ``login_name``, ``login_password``, ``account``, ``warehouse``);
        a missing key raises ``KeyError``.
        """
        connection_args = dict(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
        return snowflake.connector.connect(**connection_args)
    
## Credentials
# Identifier handed to SSMPSCredentials to locate the Snowflake secret.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection
# `ctx` is the open connection that run_query() executes against.
conn = SnowflakeConnector(credentials=SSMPSCredentials(SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="DATASCIENCE_STAGE")

In [5]:
def run_query(query):
    """Run ``query`` on the notebook's Snowflake connection and return the rows.

    Parameters
    ----------
    query : str
        SQL text, passed unchanged to ``cursor.execute``.

    Returns
    -------
    pandas.DataFrame
        Every fetched row. Column labels are taken from ``cursor.description``
        and converted to lower case.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        column_names = [desc[0].lower() for desc in cursor.description]
    finally:
        # Always release the cursor, including when execute/fetch raises,
        # so repeated calls do not accumulate open cursors on the connection.
        cursor.close()
    return pd.DataFrame(rows, columns=column_names)

In [6]:
# Root logger, used for progress messages while the inputs are loaded.
logger = logging.getLogger()
logger.info('Loading inputs')

# Each feature DataFrame read from S3 is appended here, in load order.
data_list = list()

# Handle on the S3 bucket that holds the input files.
s3 = boto3.resource('s3')
bucket = s3.Bucket('hbo-ingest-datascience-content-dev')

In [7]:
# Titles removed from every feature table:
# - the old Mortal Kombat movie, because the trailer percent view matching
#   matches the trailer of the new movie to the old movie
# - Tom & Jerry, due to an unresolvable data issue
excluded_ids = [
    '1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7', '0-GYEb9QwLgFF9_ZwEAAAA7',
]

for obj in bucket.objects.filter(Prefix='input_percent_view'):
    key = obj.key
    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    # Table name = second path segment of the key, without its extension.
    var_name = key.split('.')[0].split('/')[1]
    print('Reading {0} features'.format(var_name))
    if not var_name.isidentifier():
        # The name becomes a notebook variable below, so it must be a valid
        # Python identifier (placeholder or oddly named keys fail loudly here).
        raise ValueError('Cannot derive a variable name from S3 key {!r}'.format(key))

    # The null marker is the literal text backslash-backslash-N.
    # NOTE(review): confirm this matches how the export writes NULLs.
    feature_df = pd.read_csv(body, na_values=[r'\\N'])
    feature_df.columns = feature_df.columns.str.lower()

    # exclude the full null columns
    feature_df = feature_df.loc[:, feature_df.notnull().any()]

    # drop the excluded titles and renumber the rows
    feature_df = (
        feature_df
        .loc[~feature_df['match_id_platform'].isin(excluded_ids), :]
        .reset_index(drop=True)
    )

    # Publish the table under its own name in the notebook namespace (as the
    # previous exec-based code did) without executing generated source text.
    globals()[var_name] = feature_df

    # append the feature df
    data_list.append(feature_df)

Reading funnel_metric_feature features
Reading media_cost_postlaunch_feature features
Reading media_cost_prelaunch_feature features
Reading metadata_feature features
Reading prelaunch_trailer_feature features
Reading prelaunch_trailer_feature_before28 features
Reading sub_total_feature features
Reading trailer_feature features
Reading vtp_feature features
Reading wiki_view_feature_before28 features
Reading wiki_view_post_feature features
Reading wiki_view_pre_feature features


In [8]:
# Titles removed from the table:
# - the old Mortal Kombat movie, because the trailer percent view matching
#   matches the trailer of the new movie to the old movie
# - Tom & Jerry, due to an unresolvable data issue
excluded_platform_ids = [
    '1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7', '0-GYEb9QwLgFF9_ZwEAAAA7',
]
# This table is filtered on `match_id`. Elsewhere in this notebook
# match_id_platform is shown as '<platform>-<match_id>', so the prefixed ids
# alone would never match a bare match_id; also exclude the un-prefixed form.
excluded_match_ids = excluded_platform_ids + sorted(
    {platform_id.split('-', 1)[1] for platform_id in excluded_platform_ids}
)

for obj in bucket.objects.filter(Prefix='pct_actives_prediction/pct_actives_metric_values'):
    key = obj.key
    logger.info('Loading csv file {}'.format(key))
    body = obj.get()['Body']
    # Table name = second path segment of the key, without its extension.
    var_name = key.split('.')[0].split('/')[1]
    print('Reading {0} features'.format(var_name))
    if not var_name.isidentifier():
        # The name becomes a notebook variable below, so it must be a valid
        # Python identifier (placeholder or oddly named keys fail loudly here).
        raise ValueError('Cannot derive a variable name from S3 key {!r}'.format(key))

    # The null marker is the literal text backslash-backslash-N.
    # NOTE(review): confirm this matches how the export writes NULLs.
    metric_df = pd.read_csv(body, na_values=[r'\\N'])
    metric_df.columns = metric_df.columns.str.lower()

    # exclude the full null columns
    metric_df = metric_df.loc[:, metric_df.notnull().any()]

    # drop the excluded titles and renumber the rows
    metric_df = (
        metric_df
        .loc[~metric_df['match_id'].isin(excluded_match_ids), :]
        .reset_index(drop=True)
    )

    # Publish the table under its own name in the notebook namespace (as the
    # previous exec-based code did) without executing generated source text.
    globals()[var_name] = metric_df

    # append the feature df
    data_list.append(metric_df)

Reading pct_actives_metric_values features


In [9]:
# Take the metadata table from the loaded feature list by position.
# NOTE(review): index 3 depends on the order in which the S3 objects were
# listed and appended to data_list; confirm it still points at the metadata
# table if input files are added or renamed.
metadata_feature = data_list[3]

In [10]:
# Start from the last table in data_list, attach match_id_platform through an
# inner join with the metadata on match_id, then discard match_id itself.
active_data = (
    data_list[-1][['match_id', 'days_on_hbo_max', 'pct_actives']]
    .merge(metadata_feature[['match_id', 'match_id_platform']], on='match_id')
    .drop(columns=['match_id'])
)

In [11]:
# Mean pct_actives for every (match_id_platform, day) pair, broadcast back to
# the rows of that pair.
group_keys = ['match_id_platform', 'days_on_hbo_max']
daily_mean = active_data.groupby(group_keys)['pct_actives'].transform('mean')
active_data = active_data.assign(pct_actives_values=daily_mean)[group_keys + ['pct_actives_values']]

# Keep rows with both keys present, then collapse repeated rows so each pair
# appears once.
has_both_keys = active_data['match_id_platform'].notnull() & active_data['days_on_hbo_max'].notnull()
active_data = active_data[has_both_keys].drop_duplicates()

In [12]:
# Reshape to one row per match_id_platform with one column per day.
active_wide = active_data.pivot(index='match_id_platform', columns='days_on_hbo_max', values='pct_actives_values')

# Name every column after the day value it actually holds (zero-padded to
# three digits, e.g. day001_percent_actives). Deriving the labels from the
# pivoted columns keeps label and data aligned even if a day is missing or
# extra days are present, instead of assuming exactly days 1..28 by position.
columns = ['day{:03d}_percent_actives'.format(int(day)) for day in active_wide.columns]
active_wide.columns = columns
active_data = active_wide.reset_index()

In [13]:
# Replace the last table in data_list with the reshaped active_data frame.
data_list[-1] = active_data

In [14]:
# Fetch every column of the POPCORN_TITLES catalog table through run_query.
popcorn_titles = run_query('''
SELECT * FROM MAX_PROD.CATALOG.POPCORN_TITLES
''')

In [15]:
# Collapse the metadata to one row per match_id; for each column the first
# non-null value within the group is kept.
# NOTE(review): if a match_id has rows for more than one match_id_platform,
# only one of them survives this step - confirm that is intended.
metadata_feature = metadata_feature.groupby('match_id', as_index=False).first()

In [17]:
# Work on the last table in data_list from here on and preview its first rows.
pct_actives=data_list[-1]
pct_actives.head()

Unnamed: 0,match_id_platform,day001_percent_actives,day002_percent_actives,day003_percent_actives,day004_percent_actives,day005_percent_actives,day006_percent_actives,day007_percent_actives,day008_percent_actives,day009_percent_actives,day010_percent_actives,day011_percent_actives,day012_percent_actives,day013_percent_actives,day014_percent_actives,day015_percent_actives,day016_percent_actives,day017_percent_actives,day018_percent_actives,day019_percent_actives,day020_percent_actives,day021_percent_actives,day022_percent_actives,day023_percent_actives,day024_percent_actives,day025_percent_actives,day026_percent_actives,day027_percent_actives,day028_percent_actives
0,0-GV-BK7wNv6MLCwgEAAABw,0.002995,0.004647,0.006102,0.007873,0.009281,0.011066,0.01276,0.01386,0.015079,0.015978,0.01686,0.018175,0.019889,0.021216,0.022407,0.023237,0.02416,0.024076,0.024444,0.025271,0.026081,0.026793,0.027351,0.028217,0.028885,0.029772,0.030714,0.031708
1,0-GV-BKigrJWcJMwwEAAABi,0.245508,0.33937,0.408132,0.450645,0.518481,0.584256,0.627948,0.670518,0.71542,0.751392,0.78987,0.838085,0.890808,0.925785,0.961196,0.991271,1.019204,1.048776,1.085916,1.122019,1.150228,1.176638,1.201948,1.226225,1.250756,1.281501,1.315118,1.336501
2,0-GV-BKvAt0FsJMwwEAAABv,0.002174,0.00291,0.003826,0.004417,0.004875,0.005303,0.005921,0.006501,0.007094,0.007974,0.008875,0.009515,0.01013,0.010748,0.011313,0.011648,0.012082,0.012627,0.013197,0.013563,0.014008,0.014466,0.01492,0.015499,0.015955,0.01633,0.016768,0.017302
3,0-GV-BPaQSKT8JMwwEAAACP,0.128517,0.19525,0.252511,0.289867,0.341674,0.40149,0.452797,0.50507,0.530118,0.551629,0.572394,0.597531,0.626846,0.649267,0.668678,0.683831,0.696199,0.709961,0.725991,0.741742,0.756903,0.77626,0.812862,0.841368,0.872993,0.907615,0.942694,0.975593
4,0-GV-LJSA5eFZqOiQEAAAAq,0.001461,0.001946,0.002342,0.002567,0.002825,0.003136,0.003377,0.003545,0.003762,0.003944,0.004115,0.004259,0.004542,0.00474,0.004954,0.005067,0.005175,0.005249,0.005337,0.005461,0.005563,0.005655,0.005795,0.00584,0.005899,0.006056,0.006221,0.00642


In [18]:
# Keep only the title key and the day 1 / 7 / 4 / 28 percent-actives columns.
pct_actives = pct_actives.loc[:, [
    'match_id_platform',
    'day001_percent_actives',
    'day007_percent_actives',
    'day004_percent_actives',
    'day028_percent_actives',
]]
pct_actives.head()

Unnamed: 0,match_id_platform,day001_percent_actives,day007_percent_actives,day004_percent_actives,day028_percent_actives
0,0-GV-BK7wNv6MLCwgEAAABw,0.002995,0.01276,0.007873,0.031708
1,0-GV-BKigrJWcJMwwEAAABi,0.245508,0.627948,0.450645,1.336501
2,0-GV-BKvAt0FsJMwwEAAABv,0.002174,0.005921,0.004417,0.017302
3,0-GV-BPaQSKT8JMwwEAAACP,0.128517,0.452797,0.289867,0.975593
4,0-GV-LJSA5eFZqOiQEAAAAq,0.001461,0.003377,0.002567,0.00642


In [20]:
# Cast the join key to str in both frames so the merge compares like with like.
for frame in (pct_actives, metadata_feature):
    frame['match_id_platform'] = frame['match_id_platform'].astype(str)

In [21]:
# Attach the metadata columns (title_name exposed as `id`) via an inner join
# on match_id_platform.
metadata_renamed = metadata_feature.rename(columns={'title_name': 'id'})
pct_actives = pct_actives.merge(metadata_renamed, on=['match_id_platform'])

In [22]:
# Originals with a production release year of 2020 or later and
# platform_name equal to 1.
is_original = pct_actives['program_type'] == 'original'
released_2020_on = pct_actives['prod_release_year'] >= 2020
on_platform_1 = pct_actives['platform_name'] == 1
recent_originals = pct_actives[is_original & released_2020_on & on_platform_1].copy()

# Rows of pct_actives whose match_id appears as a viewable_id in the popcorn
# catalog (inner join); the name popcorn_titles is rebound to this result.
popcorn_titles = pct_actives.merge(
    popcorn_titles[['viewable_id']],
    left_on=['match_id'],
    right_on=['viewable_id'],
).copy()

In [23]:
# Add a constant indicator column to each subset; the left joins that follow
# use these to mark membership.
recent_originals = recent_originals.assign(originals_after_launch=1)
popcorn_titles = popcorn_titles.assign(popcorn_titles=1)

In [24]:
# Remove fully identical rows.
recent_originals = recent_originals.drop_duplicates()

In [25]:
# Remove fully identical rows.
popcorn_titles = popcorn_titles.drop_duplicates()

In [26]:
# Left join: rows found in recent_originals get originals_after_launch = 1,
# all other rows get NaN.
originals_flag = recent_originals[['match_id', 'originals_after_launch']]
pct_actives = pct_actives.merge(originals_flag, on=['match_id'], how='left')


In [28]:
# Left join: rows found in popcorn_titles get popcorn_titles = 1, all other
# rows get NaN.
popcorn_flag = popcorn_titles[['match_id', 'popcorn_titles']]
pct_actives = pct_actives.merge(popcorn_flag, on=['match_id'], how='left')

In [29]:
# Label each row with the name of the flag that is set. Order matters: the
# popcorn flag is applied second, so it wins when both flags are set.
flag_columns = ['originals_after_launch', 'popcorn_titles']
for flag_column in flag_columns:
    pct_actives.loc[pct_actives[flag_column] == 1, 'originals_type'] = flag_column

# Unflagged rows fall back to their program_type; the helper flags are then
# no longer needed.
pct_actives['originals_type'] = pct_actives['originals_type'].fillna(pct_actives['program_type'])
pct_actives = pct_actives.drop(columns=flag_columns)

In [30]:
# Preview the first rows of pct_actives, including the new originals_type column.
pct_actives.head()

Unnamed: 0,match_id_platform,day001_percent_actives,day007_percent_actives,day004_percent_actives,day028_percent_actives,match_id,id,season_number_adj,earliest_offered_timestamp,platform_name,program_type,content_category,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,licensor_agg,descriptive_genre_desc_agg,wm_enterprise_genres_agg,navigation_genre_desc_agg,originals_type
0,0-GV-BK7wNv6MLCwgEAAABw,0.002995,0.01276,0.007873,0.031708,GV-BK7wNv6MLCwgEAAABw,The Sand Pebbles,-1,2016-11-01 10:00:00.000,0,acquired,movies,1,0,0,2,2.973611,1966,50.868493,twentieth century fox film corporation,drama,drama | action | romance,drama | romance,acquired
1,0-GV-BPaQSKT8JMwwEAAACP,0.128517,0.452797,0.289867,0.975593,GV-BPaQSKT8JMwwEAAACP,How to Be Single,-1,2017-05-02 10:00:00.000,0,acquired,movies,1,0,0,2,1.735,2016,1.219178,warner bros. inc.,comedy|romance,drama | comedy | romance,comedy | romance,acquired
2,0-GV-LJSA5eFZqOiQEAAAAq,0.001461,0.003377,0.002567,0.00642,GV-LJSA5eFZqOiQEAAAAq,"Lucia, Lucia",-1,2017-06-01 10:00:00.000,0,acquired,movies,1,0,0,4,1.725625,2003,13.863014,twentieth century fox film corporation,foreign|crime|drama,drama | crime | action | romance | internation...,crime | drama | international | latino,acquired
3,0-GV-P5WwNChivDZAEAAAAn,0.002867,0.008171,0.006572,0.018349,GV-P5WwNChivDZAEAAAAn,Marathon: The Patriots Day Bombing,-1,2016-11-22 01:00:00.000,0,original,movies,1,0,1,2,1.729861,2016,0.00274,hbo original,documentary,originals | documentary,documentaries| originals,original
4,0-GV-P9gwHsio25wgEAAAA9,0.000499,0.001445,0.0008,0.004384,GV-P9gwHsio25wgEAAAA9,Mariela Castro's March: Cuba's LGBT Revolution,-1,2016-11-29 01:00:00.000,0,original,movies,1,0,1,2,0.622222,2016,0.00274,"downtown community television center, inc.",documentary,originals | shorts | documentary,documentaries| originals,original


In [38]:
# multiplier_<d> = day-<d> percent actives divided by day-28 percent actives,
# for d in 1, 4 and 7.
for horizon in (1, 4, 7):
    day_column = 'day{:03d}_percent_actives'.format(horizon)
    pct_actives['multiplier_{}'.format(horizon)] = pct_actives[day_column] / pct_actives['day028_percent_actives']

In [39]:
# Average of every numeric column per originals_type. numeric_only=True is
# passed explicitly because the frame also holds text columns (ids, titles,
# genres); on pandas >= 2.0 a bare .mean() raises TypeError on those instead
# of silently skipping them.
pct_actives.groupby(['originals_type']).mean(numeric_only=True)

Unnamed: 0_level_0,day001_percent_actives,day007_percent_actives,day004_percent_actives,day028_percent_actives,season_number_adj,platform_name,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,multiplier_4,multiplier_7,multiplier_1
originals_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
acquired,0.146952,0.306747,0.248861,0.489824,0.326643,0.905542,0.74443,0.015283,0.072915,3.028724,2.839746,1999.216535,21.211401,0.371582,0.489222,0.191643
original,0.071,0.132871,0.108065,0.218512,1.746493,0.56513,0.526052,0.108216,0.190381,2.733467,3.826522,2010.870741,7.677939,0.370496,0.475396,0.203343
originals_after_launch,1.869182,2.548127,2.405999,3.024305,1.219697,1.0,0.393939,0.181818,0.929293,3.464646,3.490717,2020.656566,1.205964,0.747821,0.825072,0.533587
popcorn_titles,32.179766,39.337321,38.936555,40.237103,-1.0,1.0,1.0,0.0,1.0,4.6875,1.931545,2020.9375,0.540582,0.943515,0.958748,0.754506


In [34]:
# Show the metadata rows whose title contains the literal text 'Winning'.
# na=False treats a missing title as "no match" so the mask stays boolean
# (a NaN in the mask makes boolean indexing raise); regex=False matches the
# text literally.
metadata_feature[metadata_feature['title_name'].str.contains('Winning', na=False, regex=False)]

Unnamed: 0,match_id,title_name,match_id_platform,season_number_adj,earliest_offered_timestamp,platform_name,program_type,content_category,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,licensor_agg,descriptive_genre_desc_agg,wm_enterprise_genres_agg,navigation_genre_desc_agg
2255,GX5leFQdo8Sx_wwEAAAcr-1,The Cost of Winning S1,1-GX5leFQdo8Sx_wwEAAAcr-1,1,2020-11-11 02:00:00.000,1,original,series,0,0,1,3,1.860556,2020,0.460274,story lab inc.,documentary,originals | documentary | sports,documentary | original | sport | teen
4372,GXkV5UQmSVcPDwwEAABOb,Worth Winning,1-GXkV5UQmSVcPDwwEAABOb,-1,2021-01-01 11:00:00.000,1,acquired,movies,1,0,0,5,1.650556,1989,31.20274,twentieth century fox film corporation,comedy|romance,comedy | romance,comedy | romance
7621,GYbDO6QFHOsPCwgEAAAMR-1,Winning Time: The Rise of the Lakers Dynasty S1,1-GYbDO6QFHOsPCwgEAAAMR-1,1,2022-03-07 02:00:00.000,1,original,series,0,0,1,1,-1.0,2022,1.778082,hbo original,drama,drama | originals | sports,drama | original | sport
