In [1]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
pd.options.display.float_format = '{:,.4f}'.format

In [4]:
# !pip install pygam

# 0. Read Data

In [5]:
import xgboost  
from xgboost import XGBRegressor
from sklearn.model_selection import GroupKFold

## 0.1 Training Data

In [42]:
validation_data = pd.read_csv('validation_data.csv')

In [43]:
manual_pop_title = ['Euphoria', 'House of the Dragon', 'Peacemaker', 'The Last of Us']

In [44]:
validation_data.loc[validation_data['title_name'].isin(manual_pop_title), 'medal_number'] = 0
validation_data['hit_series'] = 0
validation_data.loc[validation_data['title_name'].isin(manual_pop_title), 'hit_series'] = 1

In [45]:
validation_data.rename(columns = {'genre_Action/Adventure ':'genre_action', 'genre_Drama':'genre_drama',
                               'genre_Comedy':'genre_comedy'}, inplace = True)

## 0.1 Testing Data

In [82]:
test_data = pd.read_csv('Content Slate Titles vSend.csv')
test_data = test_data[test_data['Market'] == 'US']

In [83]:
len(test_data)

746

In [84]:
test_data.rename(columns = {'Title (if known)':'title_name', 
                            'Volume (hours)':'asset_run_time_hours',
                            'Source':'program_type',
                            'Season':'season_number',
                            'Genre':'genre'
                           }, inplace=True)

In [85]:
test_data['is_pay_1'] = 0
test_data.loc[test_data['PSI Content genre'].str.contains('Pay 1'), 'is_pay_1'] = 1

In [86]:
test_data['hit_series'] = 0
test_data['title_name_upper'] = test_data['title_name'].str.upper()
test_data.loc[test_data['title_name_upper'].str.contains('|'.join([i.upper() for i in manual_pop_title])), 
              'hit_series'] = 1

In [87]:
test_data['medal'] = test_data['Performance Tier'].combine_first(test_data['Budget Tier'])
test_data.loc[test_data['medal'] == 'SIlver', 'medal'] = 'Silver'
medal_dict = {'Silver':2, 'Bronze':3, 'Gold':1 , 'Platinum':0, 'ACQ':np.NaN, 'TBD':np.NaN}
#### What is ACQ??
test_data['medal_number'] = test_data['medal'].replace(medal_dict)
test_data['medal_number'] = test_data['medal_number'].fillna(3)

In [88]:
test_data.loc[test_data['genre'] == 'Comedy', 'genre'] = 'comedy'
test_data.loc[test_data['genre'] == 'Drama', 'genre'] = 'drama'
test_data.loc[(test_data['Content type'].str.contains('Action')) &(test_data['genre']!='drama'), 
                 'genre'] = 'action'
test_data.loc[~test_data['genre'].isin(['action', 'comedy', 'drama']), 'genre'] = 'other'

genre_onehot = pd.get_dummies(test_data['genre'], prefix='genre')
test_data=pd.concat([test_data, genre_onehot], axis = 1)

In [89]:
test_data.loc[test_data['PSI Content format'].str.contains('Series'), 'content_category'] = 'series'
test_data.loc[test_data['PSI Content format'].str.contains('series'), 'content_category'] = 'series'
test_data.loc[test_data['content_category']!='series', 'content_category'] = 'movies'

content_category_onehot = pd.get_dummies(test_data['content_category'], prefix='content_category')
test_data=pd.concat([test_data, content_category_onehot], axis = 1)

In [90]:
test_data.loc[test_data['program_type'].str.contains('OP', na=False), 'program_type'] = 'original'
test_data.loc[test_data['program_type']!='original', 'program_type'] = 'acquired'

program_type_onehot = pd.get_dummies(test_data['program_type'], prefix='program_type')
test_data=pd.concat([test_data, program_type_onehot], axis = 1)

In [91]:
len(test_data)

746

In [92]:
#### FILL IN asset_run_time_hours
avg_run_time = test_data.groupby(['genre','medal','content_category'])['asset_run_time_hours'].mean().reset_index()
avg_run_time['asset_run_time_hours'] = avg_run_time['asset_run_time_hours'].fillna(2)
test_data = test_data.merge(avg_run_time.rename(columns ={'asset_run_time_hours':'category_run_time_hours'}),
                                 on = ['genre','medal','content_category'])

In [93]:
# test_data[(test_data['genre'].isnull())|(test_data['medal'].isnull())|(test_data['content_category'].isnull())]

In [94]:
test_data['asset_run_time_hours'] = test_data['asset_run_time_hours'].combine_first(test_data['category_run_time_hours'])

In [95]:
test_data['offering_start_date'] = pd.to_datetime(test_data['Month Year'],format= '%m/%d/%y')

In [96]:
len(test_data)

726

# 1. Modeling - Scoring

In [64]:
META_COLS = ['title_name','season_number', 'offering_start_date', 'asset_run_time_hours','content_category', 'program_type', 'medal', 'genre']

In [65]:
FEATURE_COLS = ['is_pay_1', 
                'hit_series',
                'medal_number', 
                'content_category_movies', 
                'content_category_series', 
                'program_type_acquired', 
                'program_type_original', 
                'genre_action', 
                'genre_comedy',
                'genre_drama'
#                 'age_of_content', 
#                 'budget', 
               ]
### New/Library content not able to be identified
### Budget data not able to be identified

In [66]:
def cal_error(validation_set):
    error_col = ((validation_set['pred']-validation_set[TARGET_COL[0]]).abs()/validation_set[TARGET_COL[0]]).abs()
    return error_col.mean()

## 1.3 Log AVHR

In [67]:
# !pip install pygam

#### 1.3.1 Significant title classifier

In [68]:
training_data = validation_data.copy()

In [69]:
training_data['is_significant_title'] = 1
training_data.loc[training_data['ahvr'] < 0.01, 'is_significant_title'] = 0

In [70]:
training_data[FEATURE_COLS].isnull().sum()/len(training_data)
## Checking if there are any nulls in the feature columns

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.9225
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_action              0.0000
genre_comedy              0.0000
genre_drama               0.0000
dtype: float64

In [71]:
training_data['medal_number'] = training_data['medal_number'].fillna(3)

In [72]:
TARGET_COL = ['is_significant_title']

In [73]:
from sklearn.linear_model import LogisticRegression

In [99]:
## fit_predict prelaunch model|
X_train, X_test, y_train = training_data[FEATURE_COLS], test_data[FEATURE_COLS], training_data[TARGET_COL]

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print (clf.score(X_train, y_train))

pred = clf.predict(X_test)
pred = pd.DataFrame(pred)
pred.columns = ['sig_pred']
test_data.reset_index(inplace = True, drop = True)
test_data = pd.concat([test_data, pred], axis = 1)

0.9085883780504247


  return f(*args, **kwargs)


#### 1.3.2 Modeling

In [100]:
insig_title = test_data[test_data['sig_pred'] == 0]

In [101]:
len(insig_title)

213

In [102]:
training_data = validation_data[(validation_data['ahvr']>0.01)]
test_data = test_data[test_data['sig_pred'] == 1]

In [103]:
len(training_data)

855

In [104]:
len(test_data)

513

In [105]:
from pygam import LinearGAM, s, PoissonGAM

In [106]:
training_data['log_ahvr'] = np.log2(training_data['ahvr'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [107]:
TARGET_COL = ['log_ahvr']

In [108]:
training_data[FEATURE_COLS].isnull().sum()/len(training_data)

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.6772
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_action              0.0000
genre_comedy              0.0000
genre_drama               0.0000
dtype: float64

In [109]:
training_data['medal_number'] = training_data['medal_number'].fillna(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [110]:
test_data[FEATURE_COLS].isnull().sum()/len(test_data)

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.0000
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_action              0.0000
genre_comedy              0.0000
genre_drama               0.0000
dtype: float64

In [111]:
X_train, X_test, y_train = training_data[FEATURE_COLS], test_data[FEATURE_COLS], training_data[TARGET_COL]
gam = LinearGAM(n_splines=20).fit(X_train, y_train)
pred = gam.predict(X_test)
pred = pd.DataFrame(pred)
pred.columns = ['logged_pred']
test_data.reset_index(inplace = True, drop = True)
test_data = pd.concat([test_data, pred], axis = 1)

In [112]:
test_data = test_data[META_COLS+FEATURE_COLS+['sig_pred']+['logged_pred']]

In [113]:
test_data['prediction'] = (2**(test_data['logged_pred']))

In [114]:
test_data[test_data['title_name'].str.contains('EUPHORIA')]

Unnamed: 0,title_name,season_number,offering_start_date,asset_run_time_hours,content_category,program_type,medal,genre,is_pay_1,hit_series,medal_number,content_category_movies,content_category_series,program_type_acquired,program_type_original,genre_action,genre_comedy,genre_drama,sig_pred,logged_pred,prediction
152,EUPHORIA S3,3.0,2025-01-01,8.0,series,original,Platinum,drama,0,1,0.0,0,1,0,1,0,0,1,1,-2.5303,0.1731
157,EUPHORIA S4 / TBD,4.0,2026-10-01,8.0,series,original,Platinum,drama,0,1,0.0,0,1,0,1,0,0,1,1,-2.5303,0.1731


In [115]:
insig_title = insig_title[META_COLS+FEATURE_COLS+['sig_pred']]

In [116]:
insig_title['logged_pred']=0
insig_title['prediction'] = np.NaN

In [117]:
final_output =pd.concat([test_data, insig_title], axis = 0)

In [118]:
final_output.tail()

Unnamed: 0,title_name,season_number,offering_start_date,asset_run_time_hours,content_category,program_type,medal,genre,is_pay_1,hit_series,medal_number,content_category_movies,content_category_series,program_type_acquired,program_type_original,genre_action,genre_comedy,genre_drama,sig_pred,logged_pred,prediction
659,ADVENTURE TIME: FIONNA AND CAKE S4 / TBD,4.0,2026-08-01,5.0,series,original,Bronze,comedy,0,0,3.0,0,1,0,1,0,1,0,0,0.0,
660,UNTITLED HOLIDAY MOVIE 2 ('24),,2024-11-01,2.0,movies,original,Bronze,drama,0,0,3.0,1,0,0,1,0,0,1,0,0.0,
661,UNTITLED HOLIDAY MOVIE 2 ('25),,2024-11-01,2.0,movies,original,Bronze,drama,0,0,3.0,1,0,0,1,0,0,1,0,0.0,
663,TBD MULTICAM S2,2.0,2025-08-01,5.0,series,original,TBD,action,0,0,3.0,0,1,0,1,1,0,0,0,0.0,
664,TBD MULTICAM S3,3.0,2026-08-01,5.0,series,original,TBD,action,0,0,3.0,0,1,0,1,1,0,0,0,0.0,


In [119]:
len(insig_title)

213

In [120]:
len(final_output)

726

# Write to S3/SF

In [121]:
import boto3
import io

def to_s3(filename, output_bucket, content):
    client = boto3.client('s3')
    client.put_object(Bucket=output_bucket, Key=filename, Body=content)
    
output_bucket = 'hbo-outbound-datascience-content-dev'
s3 = boto3.resource('s3')
bucket = s3.Bucket(output_bucket)

def write_to_sf(df, file_name):
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index = False)
    content = csv_buffer.getvalue()
    filename = 'title_hours_viewed_retention/{}.csv'.format(file_name)
    to_s3(filename, output_bucket, content)



In [122]:
test_data.columns

Index(['title_name', 'season_number', 'offering_start_date',
       'asset_run_time_hours', 'content_category', 'program_type', 'medal',
       'genre', 'is_pay_1', 'hit_series', 'medal_number',
       'content_category_movies', 'content_category_series',
       'program_type_acquired', 'program_type_original', 'genre_action',
       'genre_comedy', 'genre_drama', 'sig_pred', 'logged_pred', 'prediction'],
      dtype='object')

In [123]:
final_output.to_csv('content_slate_prediction_perf_tier.csv')

# 