In [1]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

In [2]:
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

In [3]:
pd.options.display.float_format = '{:,.4f}'.format

In [4]:
# !pip install pygam

# 0. Read Data

In [5]:
import xgboost  
from xgboost import XGBRegressor
from sklearn.model_selection import GroupKFold

## 0.1 Training Data

In [6]:
validation_data = pd.read_csv('validation_data.csv')

In [7]:
manual_pop_title = ['Euphoria', 'House of the Dragon', 'Peacemaker', 'The Last of Us']

In [8]:
validation_data.loc[validation_data['title_name'].isin(manual_pop_title), 'medal_number'] = 0
validation_data['hit_series'] = 0
validation_data.loc[validation_data['title_name'].isin(manual_pop_title), 'hit_series'] = 1

## 0.1 Testing Data

In [9]:
test_data = pd.read_csv('2023.09 - Content Slate Titles vSend.csv')
test_data = test_data[test_data['Market'] == 'US']

In [41]:
test_data.rename(columns = {'Title (if known)':'title_name', 
                            'Volume (hours)':'asset_run_time_hours',
                            'Content type':'content_category',
                            'Source':'program_type',
                            'Season':'season_number'
                           }, inplace=True)

In [11]:
test_data['is_pay_1'] = 0
test_data.loc[test_data['PSI Content genre'] == 'Pay 1', 'is_pay_1'] = 1

In [12]:
test_data['hit_series'] = 0
test_data['title_name_upper'] = test_data['title_name'].str.upper()
test_data.loc[test_data['title_name_upper'].str.contains('|'.join([i.upper() for i in manual_pop_title])), 
              'hit_series'] = 1

In [13]:
test_data['medal'] = test_data['Budget Tier'].combine_first(test_data['Performance Tier'])
medal_dict = {'Silver':2, 'Bronze':3, 'Gold':1 , 'Platinum':0, 'ACQ':np.NaN}
test_data['medal_number'] = test_data['medal'].replace(medal_dict)
test_data['medal_number'] = test_data['medal_number'].fillna(3)

In [14]:
test_data['content_category_movies'] = 0
test_data['content_category_series'] = 0

test_data.loc[test_data['content_category'].str.contains('Movie'), 'content_category_movies'] = 1
test_data.loc[test_data['content_category'].str.contains('Series|series'), 'content_category_series'] = 1

In [15]:
test_data['program_type_acquired'] = 1
test_data['program_type_original'] = 0

test_data.loc[test_data['program_type'].str.contains('OP', na=False), 'program_type_original'] = 1
test_data.loc[test_data['program_type'].str.contains('OP', na=False), 'program_type_acquired'] = 0

In [16]:
test_data['genre_Action/Adventure '] = 0
test_data['genre_Comedy'] = 0
test_data['genre_Drama'] = 0

test_data.loc[test_data['content_category'].str.contains('Action'), 'genre_Action/Adventure '] = 1
test_data.loc[test_data['Genre']=='Comedy', 'genre_Comedy'] = 1
test_data.loc[test_data['Genre']=='Drama', 'genre_Drama'] = 1

In [17]:
test_data['offering_start_date'] = pd.to_datetime(test_data['Month Year'],format= '%m/%d/%y')

# 1. Modeling - Cross Validation

In [18]:
META_COLS = ['title_name','season_number', 'offering_start_date', 'asset_run_time_hours','content_category', 'program_type', 'medal', 'Genre']

In [19]:
FEATURE_COLS = ['is_pay_1', 
                'hit_series',
                'medal_number', 
                'content_category_movies', 
                'content_category_series', 
                'program_type_acquired', 
                'program_type_original', 
                'genre_Action/Adventure ', 
                'genre_Comedy',
                'genre_Drama'
#                 'age_of_content', 
#                 'budget', 
               ]
### New/Library content not able to be identified
### Budget data not able to be identified

In [20]:
def cal_error(validation_set):
    error_col = ((validation_set['pred']-validation_set[TARGET_COL[0]]).abs()/validation_set[TARGET_COL[0]]).abs()
    return error_col.mean()

## 1.3 Log AVHR

In [21]:
# !pip install pygam

#### 1.3.1 Significant title classifier

In [22]:
training_data = validation_data.copy()

In [23]:
training_data['is_significant_title'] = 1
training_data.loc[training_data['ahvr'] < 0.005, 'is_significant_title'] = 0

In [24]:
training_data[FEATURE_COLS].isnull().sum()/len(training_data)
## Checking if there are any nulls in the feature columns

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.9224
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_Action/Adventure    0.0000
genre_Comedy              0.0000
genre_Drama               0.0000
dtype: float64

In [25]:
training_data['medal_number'] = training_data['medal_number'].fillna(3)

In [26]:
TARGET_COL = ['is_significant_title']

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
## fit_predict prelaunch model|
X_train, X_test, y_train = training_data[FEATURE_COLS], test_data[FEATURE_COLS], training_data[TARGET_COL]

clf = LogisticRegression(random_state=0).fit(X_train, y_train)
print (clf.score(X_train, y_train))

pred = clf.predict(X_test)
pred = pd.DataFrame(pred)
pred.columns = ['sig_pred']
test_data.reset_index(inplace = True, drop = True)
test_data = pd.concat([test_data, pred], axis = 1)

0.9022546240043202


  return f(*args, **kwargs)


#### 1.3.2 Modeling

In [29]:
training_data = validation_data[(validation_data['ahvr']>0.005)]
test_data = test_data[test_data['sig_pred'] == 1]

In [30]:
len(training_data.title_id.unique())

704

In [31]:
len(test_data.title_name.unique())

308

In [32]:
from pygam import LinearGAM, s, PoissonGAM

In [33]:
training_data['log_ahvr'] = np.log2(training_data['ahvr'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [34]:
TARGET_COL = ['log_ahvr']

In [35]:
training_data[FEATURE_COLS].isnull().sum()/len(training_data)

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.7329
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_Action/Adventure    0.0000
genre_Comedy              0.0000
genre_Drama               0.0000
dtype: float64

In [36]:
training_data['medal_number'] = training_data['medal_number'].fillna(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [37]:
test_data[FEATURE_COLS].isnull().sum()/len(test_data)

is_pay_1                  0.0000
hit_series                0.0000
medal_number              0.0000
content_category_movies   0.0000
content_category_series   0.0000
program_type_acquired     0.0000
program_type_original     0.0000
genre_Action/Adventure    0.0000
genre_Comedy              0.0000
genre_Drama               0.0000
dtype: float64

In [38]:
X_train, X_test, y_train = training_data[FEATURE_COLS], test_data[FEATURE_COLS], training_data[TARGET_COL]
gam = LinearGAM(n_splines=20).fit(X_train, y_train)
pred = gam.predict(X_test)
pred = pd.DataFrame(pred)
pred.columns = ['logged_pred']
test_data.reset_index(inplace = True, drop = True)
test_data = pd.concat([test_data, pred], axis = 1)

In [42]:
test_data = test_data[META_COLS+FEATURE_COLS+['sig_pred']+['logged_pred']]

In [43]:
test_data['prediction'] = (2**(test_data['logged_pred']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [44]:
test_data[test_data['title_name'].str.contains('THE LAST OF US')]

Unnamed: 0,title_name,season_number,offering_start_date,asset_run_time_hours,content_category,program_type,medal,Genre,is_pay_1,hit_series,medal_number,content_category_movies,content_category_series,program_type_acquired,program_type_original,genre_Action/Adventure,genre_Comedy,genre_Drama,sig_pred,logged_pred,prediction
7,THE LAST OF US S1,1.0,2023-01-01,9.5,Scripted Series - Live Action Drama,HBO OP,Platinum,Drama,0,1,0.0,0,1,0,1,1,0,1,1,-2.9661,0.128
57,THE LAST OF US S2,2.0,2025-03-01,7.0,Scripted Series - Live Action Drama,HBO OP,Platinum,Drama,0,1,0.0,0,1,0,1,1,0,1,1,-2.9661,0.128


In [None]:
asset_run_time_hours

# Write to S3/SF

In [45]:
import boto3
import io

def to_s3(filename, output_bucket, content):
    client = boto3.client('s3')
    client.put_object(Bucket=output_bucket, Key=filename, Body=content)
    
output_bucket = 'hbo-outbound-datascience-content-dev'
s3 = boto3.resource('s3')
bucket = s3.Bucket(output_bucket)

def write_to_sf(df, file_name):
    csv_buffer = io.StringIO()
    df.to_csv(csv_buffer, index = False)
    content = csv_buffer.getvalue()
    filename = 'title_hours_viewed_retention/{}.csv'.format(file_name)
    to_s3(filename, output_bucket, content)



In [46]:
test_data.columns

Index(['title_name', 'season_number', 'offering_start_date',
       'asset_run_time_hours', 'content_category', 'program_type', 'medal',
       'Genre', 'is_pay_1', 'hit_series', 'medal_number',
       'content_category_movies', 'content_category_series',
       'program_type_acquired', 'program_type_original',
       'genre_Action/Adventure ', 'genre_Comedy', 'genre_Drama', 'sig_pred',
       'logged_pred', 'prediction'],
      dtype='object')

In [47]:
test_data.to_csv('content_slate_prediction.csv')

# 