In [12]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io
import logging

import boto3
import sys
import json

In [13]:
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold
import datetime
from datetime import timedelta
import scipy.stats as st

In [14]:
# Notebook-wide pandas settings: show up to 1000 rows and every column when
# displaying frames, and turn off the chained-assignment check.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)  # default='warn'

# 1. Read Data

In [15]:
# input_bucket = 'hbo-ingest-datascience-content-dev'

In [16]:
# Root logger. No logging configuration is visible in this notebook, so whether
# INFO records are shown depends on the environment's logging defaults.
logger = logging.getLogger()
logger.info(f'Loading inputs')

# NOTE(review): `input_bucket` is not assigned anywhere visible in this notebook
# (the only assignment, in the previous cell, is commented out) - presumably it
# is injected by whatever runs the notebook; confirm, otherwise the line below
# raises NameError on a fresh kernel.
s3 = boto3.resource('s3')
bucket = s3.Bucket(input_bucket)

In [17]:
# match_ids removed from the loaded data: the "0-" and "1-" prefixed forms of two ids.
excluded_match_ids = [
    '1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL',
    '1-GYEb9QwLgFF9_ZwEAAAA7', '0-GYEb9QwLgFF9_ZwEAAAA7',
]

# Every object under the prefix is read in turn; `pct_actives` is rebound on
# each pass, so after the loop it holds the frame built from the last object.
for obj in bucket.objects.filter(Prefix='pct_actives_prediction_emea/pct_actives_metric_values_emea'):
    key = obj.key
    logger.info(f'Loading csv file {key}')
    body = obj.get()['Body']

    # Second path component of the key with everything after the first '.' removed.
    key_without_extension = key.split('.')[0]
    var_name = key_without_extension.split('/')[1]
    print(f'Reading {var_name} features')

    # Column names are lower-cased so the rest of the notebook can rely on them.
    pct_actives = pd.read_csv(body, na_values=[r'\\\\N']).rename(columns=str.lower)
    is_excluded = pct_actives['match_id'].isin(excluded_match_ids)
    pct_actives = pct_actives.loc[~is_excluded, :].reset_index(drop=True)

Reading pct_actives_metric_values_emea features


# 2. Model

In [20]:
class DecayModel:
    """Median-ratio decay model for one tracked KPI column.

    ``fit`` pairs every observation of a title with every observation of the
    same title (same originals_type, content_category, match_id and territory)
    and stores the median ratio ``value_to / value_from`` for each
    (originals_type, content_category, territory, prediction_start_day,
    days_after_launch) combination.

    ``predict`` multiplies an observed KPI value by those stored medians to
    produce one prediction per ``days_after_launch``.
    """

    def __init__(self, kpi):
        """Select the dataframe column to track.

        Args:
            kpi: key of the module-level ``TRACKING_COLUMN`` mapping.

        Raises:
            KeyError: if ``kpi`` is not a key of ``TRACKING_COLUMN``.
        """
        self.tracking_col = TRACKING_COLUMN[kpi]
        # Filled by fit(); None means the model has not been fitted yet.
        self.multiplier_df = None

    def fit(self, train_df):
        """Compute the median multipliers from ``train_df``.

        Args:
            train_df: frame containing the columns originals_type,
                content_category, match_id, prediction_start_day, territory
                and the tracked KPI column. It is not modified.
        """
        from_col = self.tracking_col + '_from'
        to_col = self.tracking_col + '_to'

        observations = train_df[['originals_type', 'content_category',
                                 'match_id', 'prediction_start_day', 'territory',
                                 self.tracking_col]]

        # rename() returns new frames, so no in-place mutation of a column
        # subset is needed (the original relied on the chained-assignment
        # warning being globally disabled).
        pct_actives_from = observations.rename(columns={self.tracking_col: from_col})
        pct_actives_to = observations.rename(columns={self.tracking_col: to_col,
                                                      'prediction_start_day': 'days_after_launch'})

        # Self-join within a title: every "from" day is paired with every "to" day.
        multipliers = pd.merge(pct_actives_from, pct_actives_to,
                               on=['originals_type', 'content_category',
                                   'match_id', 'territory'])
        multipliers['multiplier'] = multipliers[to_col] / multipliers[from_col]

        self.multiplier_df = multipliers.groupby(
            ['originals_type', 'content_category', 'territory',
             'prediction_start_day', 'days_after_launch'],
            as_index=False).agg({'multiplier': 'median'})

    def predict(self, pred_df):
        """Apply the fitted multipliers to ``pred_df``.

        Args:
            pred_df: frame containing ``META_COLUMNS`` and the tracked KPI column.

        Returns:
            Frame with ``META_COLUMNS`` plus ``days_after_launch`` and
            ``prediction``. Rows of ``pred_df`` with no matching multiplier are
            kept with NaN in both added columns (left join). The prediction is
            NaN where ``prediction_start_day`` is greater than
            ``days_after_launch``.

        Raises:
            RuntimeError: if called before ``fit``.
            KeyError: if ``pred_df`` lacks one of the required columns.
        """
        if self.multiplier_df is None:
            raise RuntimeError('DecayModel.predict() called before fit()')

        # Column selection raises KeyError on a missing column, which makes the
        # original always-true assert on the selected frame unnecessary.
        postlaunch_df = pred_df[META_COLUMNS + [self.tracking_col]]

        postlaunch_df = pd.merge(postlaunch_df, self.multiplier_df,
                                 on=['originals_type', 'content_category', 'prediction_start_day', 'territory'],
                                 how='left')

        # Only project forwards: targets earlier than the observation day get NaN.
        postlaunch_df['prediction'] = np.where(
            postlaunch_df['prediction_start_day'] > postlaunch_df['days_after_launch'],
            np.nan,
            postlaunch_df[self.tracking_col] * postlaunch_df['multiplier'])

        return postlaunch_df[META_COLUMNS + ['days_after_launch', 'prediction']]


# 3. Cross Validation

In [21]:
# Columns that DecayModel.predict selects from its input and returns alongside
# the prediction.
META_COLUMNS = [
    'match_id', 'title', 'available_date', 'originals_type',
    'content_category', 'real_date', 'prediction_start_day', 'territory',
]

# KPI name -> dataframe column holding that KPI's values.
TRACKING_COLUMN = dict(
    pct_actives='pct_actives',
    total_actives='total_viewing_accounts',
    title_actives='title_viewing_accounts',
)

In [22]:
# KPI to model; DecayModel looks this key up in TRACKING_COLUMN.
kpi = 'pct_actives'

In [23]:
# "Bigger" titles: those with a day-28 row whose pct_actives is above 1.
has_day_28_row = pct_actives['days_on_hbo_max'] == 28
above_one = pct_actives['pct_actives'] > 1
bigger_titles = pct_actives[has_day_28_row & above_one]

# Training pool: bigger titles with prod_release_year of 2021 or later.
released_2021_onwards = pct_actives['prod_release_year'] >= 2021
is_bigger_title = pct_actives['match_id'].isin(bigger_titles['match_id'])
data_train_all = (
    pct_actives[released_2021_onwards & is_bigger_title]
    .copy()
    .rename(columns={"days_on_hbo_max": "prediction_start_day"})
)

# Keep only the first 10 characters of the stringified dates.
for date_col in ('real_date', 'available_date'):
    data_train_all[date_col] = data_train_all[date_col].map(lambda value: str(value)[:10])

# One cross-validation group per (match_id, territory) pair.
data_train_all['match_id_territory'] = data_train_all['match_id'] + '_' + data_train_all['territory']

print(f"nrow(features): {len(data_train_all.index)}")

validation_set = pd.DataFrame()

# As many folds as groups, so each fold holds out exactly one group.
num_folds = len(data_train_all['match_id_territory'].unique())
group_kfold = GroupKFold(n_splits=num_folds)
print(group_kfold)

nrow(features): 6685
GroupKFold(n_splits=242)


In [24]:
# Cross-validation: each fold holds out the rows selected by group_kfold, fits
# the decay model on rows from the same territory whose available_date is not
# later than the held-out title's, and predicts the held-out rows.
#
# Fold results are collected in a list and concatenated once. This keeps the
# cell idempotent (re-running it no longer appends to an existing
# validation_set) and avoids re-copying the accumulated frame on every fold.
fold_predictions = []
group_labels = data_train_all['match_id_territory'].values

for train_index, test_index in group_kfold.split(data_train_all, groups=group_labels):
    train_df, test_df = data_train_all.iloc[train_index], data_train_all.iloc[test_index]

    held_out_title = str(test_df['title'].values[0])
    held_out_territory = str(test_df['territory'].values[0])
    # Dates are 'YYYY-MM-DD'-style strings here, so string comparison orders them.
    avail_date = test_df['available_date'].values[0]

    train_df = train_df[(train_df['territory'] == held_out_territory)
                        & (train_df['available_date'] <= avail_date)]

    print("Validation Title: " + held_out_title + ' at ' + held_out_territory)

    # fit_predict decay model
    decay_model = DecayModel(kpi=kpi)
    decay_model.fit(train_df)
    fold_predictions.append(decay_model.predict(test_df))

validation_set = pd.concat(fold_predictions)

# Rows without a matching multiplier have no days_after_launch; drop them.
validation_set = validation_set[validation_set['days_after_launch'].notnull()].reset_index(drop=True)

# post-process
# The incoming real_date is the date the prediction was made from; the new
# real_date is the date being predicted (available_date + days_after_launch).
validation_set = validation_set.rename(columns={'real_date': 'prediction_start_date'})
validation_set['real_date'] = (
    pd.to_datetime(validation_set['available_date'])
    + pd.to_timedelta(validation_set['days_after_launch'], unit='D')
).map(lambda ts: str(ts)[:10])

# Attach the observed KPI value for the predicted date.
validation_set = pd.merge(validation_set,
                          data_train_all[
                              ['match_id', 'real_date', 'season_number', 'territory'] + [TRACKING_COLUMN[kpi]]],
                          on=['match_id', 'real_date', 'territory'],
                          how='left')

validation_set = validation_set.rename(columns={TRACKING_COLUMN[kpi]: 'actuals'})

validation_set = validation_set[['match_id',
                                 'title',
                                 'territory',
                                 'available_date',
                                 'originals_type',
                                 'content_category',
                                 'prediction_start_date',
                                 'real_date',
                                 'prediction_start_day',
                                 'days_after_launch',
                                 'actuals',
                                 'prediction']]

Validation Title: And Just Like That... The Documentary at SPAIN
Validation Title: Tom & Jerry at EMEA
Validation Title: Friends: The Reunion at SPAIN
Validation Title: Friends: The Reunion at NORDICS
Validation Title: Friends: The Reunion at EMEA
Validation Title: Starstruck S1 at SPAIN
Validation Title: Starstruck S1 at NORDICS
Validation Title: Starstruck S1 at EMEA
Validation Title: Kung Fu S1 at SPAIN
Validation Title: Kung Fu S1 at NORDICS
Validation Title: Kung Fu S1 at EMEA
Validation Title: Superman & Lois S2 at SPAIN
Validation Title: Superman & Lois S2 at NORDICS
Validation Title: Superman & Lois S2 at EMEA
Validation Title: Superman & Lois S1 at SPAIN
Validation Title: Superman & Lois S1 at NORDICS
Validation Title: Superman & Lois S1 at EMEA
Validation Title: Hacks S1 at SPAIN
Validation Title: Hacks S1 at NORDICS
Validation Title: Hacks S1 at EMEA
Validation Title: Those Who Wish Me Dead at NORDICS
Validation Title: Those Who Wish Me Dead at EMEA
Validation Title: Mortal 

# 4. Scoring

In [None]:
# Scoring frame: same preparation as the training data, applied to all titles.
scoring_df = pct_actives.copy()
# 'max' (string) selects the groupby max directly instead of passing the builtin.
scoring_df['max_days'] = scoring_df.groupby(['match_id'])['days_on_hbo_max'].transform('max')
scoring_df = scoring_df.rename(columns={"days_on_hbo_max": "prediction_start_day"})
for date_col in ('real_date', 'available_date'):
    # Keep only the first 10 characters of the stringified dates.
    scoring_df[date_col] = scoring_df[date_col].map(lambda value: str(value)[:10])
# Built from scoring_df's own columns (the original mixed in pct_actives['territory']).
scoring_df['match_id_territory'] = scoring_df['match_id'] + '_' + scoring_df['territory']

# Evaluate "now" once so the train and test splits share exactly the same cut-off.
cutoff_date = (pd.to_datetime("now") - timedelta(days=28)).strftime('%Y-%m-%d')

# Train on bigger titles available before the cut-off; score everything
# available on or after it.
train_df = scoring_df[(scoring_df['available_date'] < cutoff_date)
                      & (scoring_df['match_id'].isin(bigger_titles['match_id']))]
test_df = scoring_df[scoring_df['available_date'] >= cutoff_date]

kpi = 'pct_actives'

print("nrow(features): " + str(len(test_df.index)))

pred_df = pd.DataFrame()

In [None]:
# Score each (match_id, territory) group with a model fitted on titles from the
# same territory whose available_date is not later than the group's.
#
# Results are collected in a list and concatenated once, so re-running this
# cell does not append duplicates to an existing pred_df. The loop variable is
# no longer called `id`, which shadowed the builtin.
scored_frames = []
for group_key, feature in test_df.groupby('match_id_territory'):

    territory = str(feature['territory'].values[0])
    print("Scoring Title: " + str(feature['title'].values[0]) + ' at ' + territory)
    avail_date = feature['available_date'].values[0]
    train_features = train_df[(train_df['available_date'] <= avail_date)
                              & (train_df['territory'] == territory)]

    # fit_predict decay model
    decay_model = DecayModel(kpi=kpi)
    decay_model.fit(train_features)
    scored_frames.append(decay_model.predict(feature))

# With nothing to score, keep the empty-frame result the original loop produced.
pred_df = pd.concat(scored_frames) if scored_frames else pd.DataFrame()

In [None]:
# Keep only rows that received a days_after_launch from the model, then
# renumber the index.
has_target_day = pred_df['days_after_launch'].notnull()
pred_df = pred_df[has_target_day].reset_index(drop=True)

# post-process
# The incoming real_date becomes prediction_start_date; the new real_date is
# available_date shifted by days_after_launch, truncated to 10 characters.
pred_df = pred_df.rename(columns={'real_date': 'prediction_start_date'})
launch_offsets = pred_df['days_after_launch'].map(lambda days: datetime.timedelta(days))
predicted_dates = pd.to_datetime(pred_df['available_date']).add(launch_offsets)
pred_df['real_date'] = predicted_dates.map(lambda stamp: str(stamp)[:10])

output_columns = [
    'match_id', 'title', 'territory', 'available_date',
    'originals_type', 'content_category',
    'prediction_start_date', 'real_date',
    'prediction_start_day', 'days_after_launch',
    'prediction',
]
pred_df = pred_df[output_columns]

# 5. Output Results

In [None]:
# Serialise the cross-validation frame to CSV text (no index column) and upload
# it as a single S3 object.
table_name = 'pct_actives_emea_cv_results'
filename = f'pct_actives_prediction_emea/{table_name}.csv'
content = validation_set.to_csv(index=False)  # no target given -> returns the CSV as a string
client = boto3.client('s3')
client.put_object(Bucket=output_bucket, Key=filename, Body=content)
print('Write cv to S3 finished')

In [None]:
# Serialise the scoring frame to CSV text (no index column) and upload it as a
# single S3 object.
table_name = 'pct_actives_emea_scoring_results'
filename = f'pct_actives_prediction_emea/{table_name}.csv'
content = pred_df.to_csv(index=False)  # no target given -> returns the CSV as a string
client = boto3.client('s3')
client.put_object(Bucket=output_bucket, Key=filename, Body=content)
print('Write scoring to S3 finished')