In [7]:
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector
# Display settings: allow up to 1000 rows and 1000 columns in rendered frames,
# and format floats with thousands separators and two decimals.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 1000)
pd.set_option('display.float_format', '{:,.2f}'.format)

In [8]:
from abc import ABCMeta, abstractmethod

class Credentials(metaclass=ABCMeta):
    """Abstract source of connection secrets.

    Subclasses must implement ``get_keys``; SnowflakeConnector calls it and
    reads the ``'SecretString'`` entry of the returned mapping.
    """

    @abstractmethod
    def get_keys(self):
        """Return the raw secret payload as a mapping.

        The consumer in this file looks up ``'SecretString'`` with ``.get``
        and parses its value as JSON.
        """
        raise NotImplementedError
    
    
class SSMPSCredentials(Credentials):
    """Credentials backed by a secret stored in AWS Secrets Manager."""

    def __init__(self, secretid: str, region_name: str = 'us-east-1'):
        """
        Parameters
        ----------
        secretid : str
            Value passed as ``SecretId`` to ``get_secret_value``.
        region_name : str, optional
            AWS region used for the Secrets Manager client. Defaults to
            ``'us-east-1'``, the value that was previously hard-coded.
        """
        self._secretid = secretid
        self._region_name = region_name
        # Retained for backward compatibility; no code visible in this
        # notebook reads it.
        self._secrets = {}

    def get_keys(self):
        """Fetch the secret and return the ``get_secret_value`` response as-is.

        A new Secrets Manager client is created on every call.
        """
        secrets_client = boto3.client(
            service_name='secretsmanager',
            region_name=self._region_name,
        )
        return secrets_client.get_secret_value(SecretId=self._secretid)
    
    
class BaseConnector(metaclass=ABCMeta):
    """Abstract base for objects that can open a database connection."""

    @abstractmethod
    def connect(self, *args, **kwargs):
        """Open and return a connection.

        Accepts arbitrary arguments so that concrete connectors (for example
        SnowflakeConnector.connect(dbname, schema)) stay signature-compatible
        with this declaration.
        """
        raise NotImplementedError
    

class SnowflakeConnector(BaseConnector):
    """Opens Snowflake connections using secrets supplied by a Credentials object."""

    def __init__(self, credentials: Credentials):
        # The secrets are a JSON document stored under 'SecretString';
        # when that entry is absent an empty dict is used instead.
        payload = credentials.get_keys()
        secret_json = payload.get('SecretString', "{}")
        self._secrets = json.loads(secret_json)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Return a new Snowflake connection for ``dbname`` / ``schema``.

        User, password, account and warehouse are read from the parsed
        secrets; a missing entry raises ``KeyError``.
        """
        connect_kwargs = dict(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
        return snowflake.connector.connect(**connect_kwargs)
    
# Secrets Manager id of the secret holding the Snowflake login details
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

# Shared Snowflake connection (`ctx`) used by run_query below
conn = SnowflakeConnector(credentials=SSMPSCredentials(secretid=SF_CREDS))
ctx = conn.connect(dbname="MAX_DEV", schema="WORKSPACE")

def run_query(query, connection=None):
    """Execute a SQL statement and return the whole result set as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text, passed unchanged to ``cursor.execute``.
    connection : optional
        Any object exposing ``cursor()``. Defaults to the module-level
        ``ctx`` connection, which keeps existing ``run_query(sql)`` calls
        working.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record; column names come from
        ``cursor.description`` and are lower-cased.
    """
    active_ctx = ctx if connection is None else connection
    cursor = active_ctx.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
        column_names = [desc[0] for desc in cursor.description]
    finally:
        # Always release the cursor, including when execute/fetch raises.
        cursor.close()

    df = pd.DataFrame(rows, columns=column_names)
    df.columns = df.columns.str.lower()
    return df



In [9]:
# run_query('''
# create or replace table max_dev.workspace.user_title_hours_watched_by_cohort_test 
# (
# request_date	VARCHAR, 
# viewable_id VARCHAR,
# segment_name VARCHAR,  
# hours_viewed	float
# );
# ''')

In [10]:
# run_query('''
# create or replace table max_dev.workspace.USER_TITLE_HOURS_WATCHED_BY_COHORT_SUBS 
# (
# segment_name VARCHAR,  
# start_date VARCHAR,
# end_date VARCHAR,
# cohort_subs 	bigint
# );
# ''')

In [None]:
# Load title metadata from Snowflake into `title_info`.
# - `offerings`: ASSET_OFFERING_DIM left-joined to REPORTING_ASSET_DIM, filtered to
#   territory 'HBO MAX DOMESTIC', channel 'HBO MAX SUBSCRIPTION', offering type 'FEATURE'.
# - `airtable`: one PILLAR_GENRE per title_id (the MIN of the assigned values).
# - `offerings_by_date`: adds the medal (joined on title_id and season, nulls treated as 0),
#   the run time converted to hours, and reports offering start dates earlier than
#   2020-05-01 as '2020-05-01'.
title_info = run_query('''
with offerings as (select DISTINCT
            case when season_number is not null then concat(title_name, ' S', season_number, ('E'), EPISODE_NUMBER_IN_SEASON)
            else title_name end as title_episode_name
    , title_name
    , rad.VIEWABLE_ID
    , concat(coalesce(rad.series_id, rad.viewable_id)) as title_id
    , rad.SERIES_ID
    , rad.SEASON_NUMBER
    , EPISODE_NUMBER_IN_SERIES
    , content_category
    , PRIMARY_GENRE_DESC
    , program_type
    , aod.release_year
    , ASSET_RUN_TIME
    , offering_start_date
    , offering_end_date
     FROM "MAX_PROD"."CATALOG"."ASSET_OFFERING_DIM" aod
LEFT JOIN "MAX_PROD"."CATALOG"."REPORTING_ASSET_DIM" rad ON aod.VIEWABLE_ID = rad.VIEWABLE_ID
where territory='HBO MAX DOMESTIC'
    and channel = 'HBO MAX SUBSCRIPTION'
    and OFFERING_TYPE = 'FEATURE'),

airtable as (select DISTINCT title_id 
    , MIN(PILLAR_GENRE) as PILLAR_GENRE -- multiple PILLAR GENRE assignments for a single VIEWABLE_ID, taking at random.
    from max_prod.catalog.airtable_content_strategy group by title_id),

offerings_by_date as (
select
     TITLE_NAME, title_episode_name
    , offerings.VIEWABLE_ID
    , offerings.title_id, offerings.SEASON_NUMBER
    , CONTENT_CATEGORY, program_type, release_year
    , am.home_territory_observed_medal as medal
    , CAST(FLOOR(COALESCE( (ASSET_RUN_TIME)/60/60 ,0)*(1000000*1.0)) AS DECIMAL(38,0))
          / CAST((1000000*1.0) AS DOUBLE PRECISION) as ASSET_RUN_TIME_HOURS -- to prevent floating point errors
    , case when to_date(offerings.offering_start_date) < '2020-05-01' THEN '2020-05-01' ELSE offerings.offering_start_date
           end as offering_start_date
    , to_date(offerings.offering_end_date) as offering_end_date
    , PRIMARY_GENRE_DESC
    , aos.PILLAR_GENRE as PILLAR_GENRE
FROM offerings
LEFT JOIN airtable aos ON (offerings.title_id = aos.title_id)
LEFT JOIN max_prod.content_analytics.asset_medals am 
        on offerings.title_id = am.title_id
        and coalesce(offerings.season_number,0) = coalesce(am.season,0)
WHERE 1=1
-- offerings.OFFERING_START_DATE > '2020-05-01'
-- and CURRENT_DATE() between (OFFERING_START_DATE) and (OFFERING_END_DATE)
ORDER BY VIEWABLE_ID)

SELECT * FROM offerings_by_date
''')

In [None]:
# Spot-check: all title_info rows for a single viewable_id
title_info.loc[title_info['viewable_id'] == 'GWSL97g2lpwjDwgEAAAAl']

In [50]:
### De-dupe the titles

# Per viewable_id, keep only the rows carrying the earliest offering start date,
# then only those carrying the latest offering end date.
for date_col, reducer in (('offering_start_date', 'min'), ('offering_end_date', 'max')):
    title_test_dedup = title_info.groupby(['viewable_id'])[date_col].agg(reducer).reset_index()
    title_info = title_info.merge(title_test_dedup, on=['viewable_id', date_col])

# Missing medals become the literal label 'None' so they take part in the max() below.
# NOTE(review): max() here compares medal labels as strings, not by medal rank - confirm intended.
title_info['medal'] = title_info['medal'].fillna('None')
title_test_dedup = title_info.groupby(['viewable_id'])['medal'].max().reset_index()
title_info = title_info.merge(title_test_dedup, on=['viewable_id', 'medal'])

# Drop exact duplicate rows; rows without a season number get season 0.
title_info = title_info.drop_duplicates().assign(
    season_number=lambda frame: frame['season_number'].fillna(0)
)

In [42]:
# title_test = title_info.groupby(['viewable_id']).count()
# title_test[title_test['title_name'] == 2]
# title_info[title_info['viewable_id'] == 'GXJ0vugNPwIG6gAEAAAK1']

In [51]:
# Preview the first five de-duplicated rows
title_info.head(n=5)

Unnamed: 0,title_name,title_episode_name,viewable_id,title_id,season_number,content_category,program_type,release_year,medal,asset_run_time_hours,offering_start_date,offering_end_date,primary_genre_desc,pillar_genre
0,Magnífica 70,Magnífica 70 S2E2,GV--ozQ3TzQoxJgEAAAAL,GVfvUMQlxeLKLw_QIAAH9,2.0,series,original,2016,,0.84,2020-12-08,2025-02-28,latino,Drama
1,Magnífica 70,Magnífica 70 S2E3,GV--pKgtaj2ISVAEAAAAP,GVfvUMQlxeLKLw_QIAAH9,2.0,series,original,2016,,0.84,2020-12-08,2025-02-28,latino,Drama
2,Magnífica 70,Magnífica 70 S2E4,GV--paAWYF2ISVAEAAAAQ,GVfvUMQlxeLKLw_QIAAH9,2.0,series,original,2016,,0.85,2020-12-08,2025-02-28,latino,Drama
3,VICE Special Report: Killing Cancer,VICE Special Report: Killing Cancer,GV-54_wnG7qbCwwEAAAAY,GV-54_wnG7qbCwwEAAAAY,0.0,movies,acquired,2015,,0.64,2020-10-08,2020-12-28,documentary,Documentary
4,VICE Special Report: Countdown to Zero,VICE Special Report: Countdown to Zero,GV-55HghKlqbCwwEAAAAb,GV-55HghKlqbCwwEAAAAb,0.0,movies,acquired,2015,,0.81,2020-10-08,2020-12-28,documentary,Documentary


In [52]:
# Season-level frame: earliest offering start date per (title_id, season_number)
title_series_info = (
    title_info
    .groupby(['title_id', 'season_number'])['offering_start_date']
    .min()
    .reset_index()
)

In [56]:
# Total run time in hours per (title_id, season_number), attached to the season-level frame
title_series_test = (
    title_info
    .groupby(['title_id', 'season_number'])['asset_run_time_hours']
    .sum()
    .reset_index()
)
title_series_info = title_series_info.merge(
    title_series_test,
    on=['title_id', 'season_number'],
)



In [63]:
# Descriptive attributes per (title_id, season_number): the max() of each column within the group
season_attribute_cols = ['content_category', 'program_type', 'release_year', 'medal', 'pillar_genre']
title_series_test = (
    title_info
    .groupby(['title_id', 'season_number'])[season_attribute_cols]
    .max()
    .reset_index()
)

In [65]:
# Attach the season-level attributes computed in title_series_test
title_series_info = pd.merge(
    title_series_info,
    title_series_test,
    on=['title_id', 'season_number'],
)

In [85]:
# Spot-check one title_id.
# NOTE(review): the saved output includes title_name / viewable_id, which are only added by
# the later merge with title_info - this cell appears to have been run after that merge.
title_series_info.loc[title_series_info['title_id'] == 'GVU2cggagzYNJjhsJATwo']

Unnamed: 0,title_name,viewable_id,title_id,season_number,offering_start_date,asset_run_time_hours,content_category,program_type,release_year,medal,pillar_genre
313,Game of Thrones,GVU2-JAdktlFvjSoJAWPH,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
314,Game of Thrones,GVU26GAPPTINJjhsJAV66,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
315,Game of Thrones,GVU2icAz_X4NJjhsJAUPO,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
316,Game of Thrones,GVU2qbgaPhYNJjhsJAU0h,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
317,Game of Thrones,GVU3B9wO-xY7DwvwIAWqU,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
318,Game of Thrones,GVU3G4Qx3eoNJjhsJAWyZ,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
319,Game of Thrones,GVU3PsQzLXVFvjSoJAXbl,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
320,Game of Thrones,GVU3n0wQ4zY7DwvwIAZPa,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
321,Game of Thrones,GVU45FQ3XQFFvjSoJAeMT,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama
322,Game of Thrones,GVU4lpQPMbVFvjSoJAdDk,GVU2cggagzYNJjhsJATwo,5.0,2020-05-04,9.17,series,original,2015,,Drama


In [71]:
# Preview title_info again before joining it onto the season-level frame
title_info.head(n=5)

Unnamed: 0,title_name,title_episode_name,viewable_id,title_id,season_number,content_category,program_type,release_year,medal,asset_run_time_hours,offering_start_date,offering_end_date,primary_genre_desc,pillar_genre
0,Magnífica 70,Magnífica 70 S2E2,GV--ozQ3TzQoxJgEAAAAL,GVfvUMQlxeLKLw_QIAAH9,2.0,series,original,2016,,0.84,2020-12-08,2025-02-28,latino,Drama
1,Magnífica 70,Magnífica 70 S2E3,GV--pKgtaj2ISVAEAAAAP,GVfvUMQlxeLKLw_QIAAH9,2.0,series,original,2016,,0.84,2020-12-08,2025-02-28,latino,Drama
2,Magnífica 70,Magnífica 70 S2E4,GV--paAWYF2ISVAEAAAAQ,GVfvUMQlxeLKLw_QIAAH9,2.0,series,original,2016,,0.85,2020-12-08,2025-02-28,latino,Drama
3,VICE Special Report: Killing Cancer,VICE Special Report: Killing Cancer,GV-54_wnG7qbCwwEAAAAY,GV-54_wnG7qbCwwEAAAAY,0.0,movies,acquired,2015,,0.64,2020-10-08,2020-12-28,documentary,Documentary
4,VICE Special Report: Countdown to Zero,VICE Special Report: Countdown to Zero,GV-55HghKlqbCwwEAAAAb,GV-55HghKlqbCwwEAAAAb,0.0,movies,acquired,2015,,0.81,2020-10-08,2020-12-28,documentary,Documentary


In [72]:
# Expand the season-level frame so every title_info row (viewable_id) carries its season's values.
# NOTE(review): this reassigns title_series_info from itself, so re-running the cell merges
# onto the already-expanded frame.
episode_keys = title_info[['title_name', 'viewable_id', 'title_id', 'season_number']]
title_series_info = episode_keys.merge(title_series_info, on=['title_id', 'season_number'])

In [74]:
# Spot-check the expanded frame for one title_id
title_series_info.loc[title_series_info['title_id'] == 'GYsYeoAxKH8LCwgEAAAOR']

Unnamed: 0,title_name,viewable_id,title_id,season_number,offering_start_date,asset_run_time_hours,content_category,program_type,release_year,medal,pillar_genre
29101,House of the Dragon,GYs1v_AAmgsJdiQEAAAAC,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29102,House of the Dragon,GYs1wDw4FcMM_wwEAAAAF,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29103,House of the Dragon,GYs1wfgSiqkqupQEAAAVB,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29104,House of the Dragon,GYs1wiAU7aLa6vwEAAAAG,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29105,House of the Dragon,GYs1wiAp3eLa6vwEAAAAI,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29106,House of the Dragon,GYsYeyA2EnHmangEAAAPc,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29107,House of the Dragon,GYtm1iQD9UsNewgEAAAsi,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29108,House of the Dragon,GYtp0KQgCF73DYAEAAAAJ,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29109,House of the Dragon,GYtp0bwdoYMI5wwEAAAAE,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama
29110,House of the Dragon,GYtpz2QXO2qzCAwEAAAAI,GYsYeoAxKH8LCwgEAAAOR,1.0,2022-08-22,10.26,series,original,2022,Platinum,Drama


In [75]:
import boto3
import io

In [76]:
def to_s3(filename, output_bucket, content):
    """Upload ``content`` to S3 as object ``filename`` in ``output_bucket``.

    A new S3 client is created on each call; nothing is returned.
    """
    s3_client = boto3.client('s3')
    s3_client.put_object(
        Bucket=output_bucket,
        Key=filename,
        Body=content,
    )

In [77]:
# Destination bucket for exported CSVs; write_to_sf reads `output_bucket` as a global.
output_bucket = 'hbo-outbound-datascience-content-dev'
# Resource-style handles to the same bucket.
s3 = boto3.resource(service_name='s3')
bucket = s3.Bucket(output_bucket)



In [80]:
def write_to_sf(df, file_name):
    """Serialize ``df`` to CSV (without the index) and upload it to S3.

    Despite the name, this function only writes to S3: the object key is
    ``title_hours_viewed_retention/<file_name>.csv`` in the module-level
    ``output_bucket``. Nothing is returned.
    """
    key = 'title_hours_viewed_retention/{}.csv'.format(file_name)
    with io.StringIO() as buffer:
        df.to_csv(buffer, index=False)
        csv_text = buffer.getvalue()
    to_s3(key, output_bucket, csv_text)

In [81]:
# Upload the expanded season-level frame as title_hours_viewed_retention/title_series_info.csv
write_to_sf(df=title_series_info, file_name='title_series_info')

In [82]:
# Preview the frame that was uploaded
title_series_info.head(n=5)

Unnamed: 0,title_name,viewable_id,title_id,season_number,offering_start_date,asset_run_time_hours,content_category,program_type,release_year,medal,pillar_genre
0,Magnífica 70,GV--ozQ3TzQoxJgEAAAAL,GVfvUMQlxeLKLw_QIAAH9,2.0,2020-12-08,8.69,series,original,2016,,Drama
1,Magnífica 70,GV--pKgtaj2ISVAEAAAAP,GVfvUMQlxeLKLw_QIAAH9,2.0,2020-12-08,8.69,series,original,2016,,Drama
2,Magnífica 70,GV--paAWYF2ISVAEAAAAQ,GVfvUMQlxeLKLw_QIAAH9,2.0,2020-12-08,8.69,series,original,2016,,Drama
3,Magnífica 70,GV-9b1gmNoGISVAEAAAAH,GVfvUMQlxeLKLw_QIAAH9,2.0,2020-12-08,8.69,series,original,2016,,Drama
4,Magnífica 70,GWBNi4gEjLZzCOwEAAAAJ,GVfvUMQlxeLKLw_QIAAH9,2.0,2020-12-08,8.69,series,original,2016,,Drama


In [83]:
# run_query('''
# create or replace table max_dev.workspace.user_title_hours_watched_title_series_info 
# (
# title_name	VARCHAR,
# viewable_id	VARCHAR,
# title_id	VARCHAR,
# season_number	int,
# offering_start_date	VARCHAR,
# asset_run_time_hours	float,
# content_category	VARCHAR,
# program_type	VARCHAR,
# release_year	int,
# medal	VARCHAR,
# pillar_genre VARCHAR
# );

# ''')

Unnamed: 0,status
0,Table USER_TITLE_HOURS_WATCHED_TITLE_SERIES_IN...


In [84]:
# run_query('''
# copy into max_dev.workspace.user_title_hours_watched_title_series_info
#     from(
#         select
#               $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11
#         from @HBO_OUTBOUND_DATASCIENCE_CONTENT_DEV/title_hours_viewed_retention/title_series_info.csv
#         )
#     file_format = (type = csv null_if=('') skip_header = 1  FIELD_OPTIONALLY_ENCLOSED_BY='"')
#     on_error = 'CONTINUE';

# ''')

Unnamed: 0,file,status,rows_parsed,rows_loaded,error_limit,errors_seen,first_error,first_error_line,first_error_character,first_error_column_name
0,s3://hbo-outbound-datascience-content-dev/titl...,LOADED,31372,31372,31372,0,,,,
