In [1]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
pd.options.display.float_format = '{:,.4f}'.format

In [4]:
# !pip install pygam

# 0. Read Data

In [5]:
import xgboost  
from xgboost import XGBRegressor
from sklearn.model_selection import GroupKFold

In [6]:
## WRITE OR READ FROM S3 ####
import boto3
import io

s3 = boto3.resource('s3')
output_bucket = "hbo-outbound-datascience-content-dev"
input_bucket = "hbo-ingest-datascience-content-dev"
bucket = s3.Bucket(input_bucket)

def write_to_sf(df, file_name):
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index = False)
    content = csv_buffer.getvalue()
    filename = 'title_hours_viewed_retention/{}.csv'.format(file_name)
    client = boto3.client('s3')
    client.put_object(Bucket=output_bucket, Key=filename, Body=content)
    
def write_to_input(df, file_name):
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index = False)
    content = csv_buffer.getvalue()
    filename = 'title_hours_viewed_retention/{}.csv'.format(file_name)
    client = boto3.client('s3')
    client.put_object(Bucket=input_bucket, Key=filename, Body=content)
    
def read_from_s3(filename, input_bucket = input_bucket):
    for obj in bucket.objects.filter(Prefix='title_hours_viewed_retention/'+filename): #churn_metric_0811
            key = obj.key 
            body = obj.get()['Body']
            print('Reading {0} features'.format(key))
            df = pd.read_csv(body, na_values = [r'\\\\N'])
    return df



## 0.1 Training Data

In [55]:
validation_data = read_from_s3('validation_data.csv')

Reading title_hours_viewed_retention/validation_data.csv features


In [56]:
manual_pop_title = ['Euphoria', 'House of the Dragon', 'Peacemaker', 'The Last of Us']

In [57]:
validation_data.loc[validation_data['title_name'].isin(manual_pop_title), 'medal_number'] = 0
validation_data['hit_series'] = 0
validation_data.loc[validation_data['title_name'].isin(manual_pop_title), 'hit_series'] = 1
validation_data.loc[(validation_data['title_name'] == 'Euphoria')&(validation_data['season_number'] == 1),
                   'hit_series'] = 0
validation_data.loc[(validation_data['title_name'] == 'Euphoria')&(validation_data['season_number'] == 1),
                   'medal_number'] = 3

In [60]:
validation_data.rename(columns = {'genre_Action/Adventure ':'genre_action', 'genre_Drama':'genre_drama',
                               'genre_Comedy':'genre_comedy', 'pillar_genre':'genre'}, inplace = True)

In [61]:
validation_data.loc[validation_data['genre'] == 'Action/Adventure ', 'genre'] = 'action'
validation_data.loc[validation_data['genre'] == 'Comedy', 'genre'] = 'comedy'
validation_data.loc[validation_data['genre'] == 'Drama', 'genre'] = 'drama'
validation_data.loc[~validation_data['genre'].isin(['action', 'comedy', 'drama']), 'genre'] = 'other'

validation_data.loc[validation_data['medal'] == 'None', 'medal'] = 'Bronze'
validation_data['medal_number'] = validation_data['medal_number'].fillna(3)

validation_data.loc[validation_data['content_category'] == 'special', 'content_category'] = 'movies'
validation_data = validation_data[validation_data['content_category'].isin(['movies', 'series'])]

validation_data = validation_data[validation_data['program_type'].notnull()]

In [100]:
validation_data[(validation_data['medal_number'] == 0)
               &()]

Unnamed: 0,title_name,title_id,season_number,days_on_hbo_max,cumulative_hours_viewed,average_hours_viewed,offering_start_date,asset_run_time_hours,content_category,program_type,air_date,medal,episode_number_in_season,genre,is_pay_1,is_popcorn,budget,content_category_livesports,content_category_movies,content_category_series,content_category_special,program_type_acquired,program_type_original,genre_action,genre_Adult Animation,genre_Classics,genre_comedy,genre_Documentary,genre_drama,genre_Horror,genre_Kids/Family,genre_Music,genre_Other,genre_Romance,genre_Sci-Fi,genre_Suspense/Thriller,genre_Unscripted,medal_number,is_new_content,age_of_content,hvr,ahvr,content_cost,hit_series
1257,Raised by Wolves,GX0WFcAlf5r5cuAEAAADu,1,60,3669705.4175,1.1723,2020-09-03,8.0031,series,original,2020-09-10,Platinum,5.0,other,0,0,,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1,-0.0192,458538.3021,0.1465,80250000.0,0
1338,Charm City Kings,GX3Oo8gHLjKHCwwEAAAEk,0,60,513869.4014,0.1219,2020-10-08,2.0033,movies,original,2020-10-08,Platinum,,other,0,0,,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1,0.0,256507.2314,0.0608,8800000.0,0
1454,Roald Dahl’s The Witches,GX5A8JQnSPQ2QFgEAAAAC,0,60,1318359.3806,0.2779,2020-10-22,1.7131,movies,original,2020-10-22,Platinum,,other,0,0,,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1,0.0,769595.4774,0.1622,90000000.0,0
1467,The Flight Attendant,GX5MHsQzwwIuLwgEAAACp,1,60,6516263.467,0.917,2020-11-26,5.7369,series,original,2020-11-26,Platinum,1.0,drama,0,0,,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,1,0.0,1135842.8589,0.1598,54055437.0,0
1577,The Fresh Prince of Bel-Air Reunion,GX7QU5AGnhiLDwwEAAAAj,0,60,732831.7744,0.106,2020-11-19,1.2092,movies,original,2020-11-19,Platinum,,other,0,0,,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.0,1,0.0,606063.8278,0.0876,6715324.0,0
1683,Wonder Woman 1984,GX9KHPw1OIMPCJgEAAAAD,0,60,12249348.0638,1.52,2020-12-25,2.4669,movies,acquired,2020-01-01,Platinum,,action,1,1,200000000.0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,1,0.9829,4965393.6465,0.6162,39454083.0,0
1794,Euphoria,GXKN_xQX5csPDwwEAAABj,2,60,25592855.6164,1.8582,2022-01-10,7.8194,series,original,2022-01-16,Platinum,2.0,drama,0,0,,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,1,-0.0164,3272978.067,0.2376,115400000.0,1
4748,The Menu,GY6HeoQwl0MPCLwEAAAgW,0,60,4870811.9097,0.4141,2023-01-03,1.7853,movies,acquired,2022-11-18,Platinum,,comedy,1,0,35000000.0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.0,1,0.1259,2728322.7811,0.232,3134781.0,0
4889,Judas and the Black Messiah,GYBmsKA4FaUnDdQEAAAAj,0,60,3160390.5814,0.2897,2021-02-12,2.0614,movies,acquired,2021-02-12,Platinum,,drama,1,1,26000000.0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,1,0.0,1533137.1782,0.1405,12508720.0,0
4944,Zack Snyder's Justice League,GYDAnZgCFQ8IJpQEAAAAN,0,60,23402847.4237,1.9459,2021-03-18,3.9381,movies,original,2021-03-18,Platinum,,drama,0,0,300000000.0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.0,1,0.0,5942742.6543,0.4941,25000000.0,0


## 0.1 Testing Data

In [65]:
FEATURE_COLS = ['is_pay_1', 
                'hit_series',
                'medal_number', 
                'content_category_movies', 
                'content_category_series', 
                'program_type_acquired', 
                'program_type_original', 
                'genre_action', 
                'genre_comedy',
                'genre_drama'
#                 'age_of_content', 
#                 'budget', 
               ]
### New/Library content not able to be identified
### Budget data not able to be identified

In [66]:
# FEATURE_DICT = {
# 'is_pay_1': [0, 1] , 
# 'hit_series': [0, 1] ,
# 'medal_number': [0, 1, 2, 3]  , 
# 'content_category_movies': [0, 1] , 
# 'content_category_series': [0, 1] , 
# 'program_type_acquired': [0, 1] , 
# 'program_type_original': [0, 1] , 
# 'genre_action': [0, 1] , 
# 'genre_comedy': [0, 1] ,
# 'genre_drama' : [0, 1]    
# }

In [67]:
test_data = validation_data[FEATURE_COLS+['content_category', 'program_type', 'medal', 'genre']].drop_duplicates()

In [68]:
run_time = validation_data.groupby(['content_category'])['asset_run_time_hours'].mean().reset_index()

In [69]:
test_data = test_data.merge(run_time, on = ['content_category'])

# 1. Modeling - Scoring

In [70]:
META_COLS = ['asset_run_time_hours','content_category', 'program_type', 'medal', 'genre']

In [71]:
FEATURE_COLS = ['is_pay_1', 
                'hit_series',
                'medal_number', 
                'content_category_movies', 
                'content_category_series', 
                'program_type_acquired', 
                'program_type_original', 
                'genre_action', 
                'genre_comedy',
                'genre_drama'
#                 'age_of_content', 
#                 'budget', 
               ]
### New/Library content not able to be identified
### Budget data not able to be identified

In [72]:
def cal_error(validation_set):
    error_col = ((validation_set['pred']-validation_set[TARGET_COL[0]]).abs()/validation_set[TARGET_COL[0]]).abs()
    return error_col.mean()

## 1.3 Log AVHR

In [73]:
# !pip install pygam

#### 1.3.1 Significant title classifier

In [74]:
training_data = validation_data.copy()

In [75]:
training_data['is_significant_title'] = 1
training_data.loc[training_data['ahvr'] < 0.01, 'is_significant_title'] = 0

In [76]:
training_data[FEATURE_COLS].isnull().sum()/len(training_data)
## Checking if there are any nulls in the feature columns

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.0000
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_action              0.0000
genre_comedy              0.0000
genre_drama               0.0000
dtype: float64

In [77]:
training_data['medal_number'] = training_data['medal_number'].fillna(3)

In [78]:
TARGET_COL = ['is_significant_title']

In [79]:
from sklearn.linear_model import LogisticRegression

In [80]:
## fit_predict prelaunch model|
X_train, X_test, y_train = training_data[FEATURE_COLS], test_data[FEATURE_COLS], training_data[TARGET_COL]

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print (clf.score(X_train, y_train))

pred = clf.predict(X_test)
pred = pd.DataFrame(pred)
pred.columns = ['sig_pred']
test_data.reset_index(inplace = True, drop = True)
test_data = pd.concat([test_data, pred], axis = 1)

  return f(*args, **kwargs)


0.9083175803402647


In [81]:
# test_data[test_data['sig_pred'] == 0]

#### 1.3.2 Modeling

In [82]:
insig_title = test_data[test_data['sig_pred'] == 0]

In [83]:
len(insig_title)

21

In [84]:
training_data = validation_data[(validation_data['ahvr']>0.01)]
# test_data = test_data[test_data['sig_pred'] == 1]

In [85]:
len(training_data)

855

In [86]:
len(test_data)

59

In [87]:
from pygam import LinearGAM, s, PoissonGAM

In [88]:
training_data['log_ahvr'] = np.log2(training_data['ahvr'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [89]:
TARGET_COL = ['log_ahvr']

In [90]:
training_data[FEATURE_COLS].isnull().sum()/len(training_data)

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.0000
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_action              0.0000
genre_comedy              0.0000
genre_drama               0.0000
dtype: float64

In [91]:
training_data['medal_number'] = training_data['medal_number'].fillna(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [92]:
test_data[FEATURE_COLS].isnull().sum()/len(test_data)

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.0000
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_action              0.0000
genre_comedy              0.0000
genre_drama               0.0000
dtype: float64

In [93]:
X_train, X_test, y_train = training_data[FEATURE_COLS], test_data[FEATURE_COLS], training_data[TARGET_COL]
gam = LinearGAM(n_splines=20).fit(X_train, y_train)
pred = gam.predict(X_test)
pred = pd.DataFrame(pred)
pred.columns = ['logged_pred']
test_data.reset_index(inplace = True, drop = True)
test_data = pd.concat([test_data, pred], axis = 1)

In [94]:
test_data = test_data[META_COLS+FEATURE_COLS+['sig_pred']+['logged_pred']]

In [95]:
test_data['prediction'] = (2**(test_data['logged_pred']))

In [96]:
# insig_title = insig_title[META_COLS+FEATURE_COLS+['sig_pred']]

# insig_title['logged_pred']=0
# insig_title['prediction'] = np.NaN

# final_output =pd.concat([test_data, insig_title], axis = 0)

In [97]:
test_data[test_data['medal_number'] == 0]

Unnamed: 0,asset_run_time_hours,content_category,program_type,medal,genre,is_pay_1,hit_series,medal_number,content_category_movies,content_category_series,program_type_acquired,program_type_original,genre_action,genre_comedy,genre_drama,sig_pred,logged_pred,prediction
17,1.6303,movies,original,Platinum,other,0,0,0.0,1,0,0,1,0,0,0,1,-2.7833,0.1453
22,1.6303,movies,original,Platinum,other,0,0,0.0,0,0,0,1,0,0,0,1,-2.6322,0.1613
25,1.6303,movies,acquired,Platinum,action,1,0,0.0,1,0,1,0,1,0,0,1,-1.7812,0.2909
35,1.6303,movies,acquired,Platinum,comedy,1,0,0.0,1,0,1,0,0,1,0,1,-1.8642,0.2747
36,1.6303,movies,acquired,Platinum,drama,1,0,0.0,1,0,1,0,0,0,1,1,-2.1459,0.226
37,1.6303,movies,original,Platinum,drama,0,0,0.0,1,0,0,1,0,0,1,1,-2.9073,0.1333
38,1.6303,movies,acquired,Platinum,other,1,0,0.0,1,0,1,0,0,0,0,1,-2.0219,0.2462
51,6.7765,series,original,Platinum,other,0,0,0.0,0,1,0,1,0,0,0,1,-2.4968,0.1772
55,6.7765,series,original,Platinum,drama,0,0,0.0,0,1,0,1,0,0,1,1,-2.6208,0.1626
56,6.7765,series,original,Platinum,drama,0,1,0.0,0,1,0,1,0,0,1,1,-1.9112,0.2659


# Write to S3/SF

In [98]:
len(test_data)

59

# 

In [99]:
write_to_input(test_data,'content_slate_metadata_level')

