In [1]:
# !pip install snowflake --user
# !pip install snowflake-connector-python --user
# !pip install matplotlib
# !pip install seaborn

# Data

In [1]:
import os
import sys
# Add the parent of the current working directory to sys.path (presumably where
# the `utils` package imported below lives — confirm).
# os.getcwd() replaces the IPython-only `!pwd` shell escape, so this cell also
# runs as plain Python and on platforms without a `pwd` command.
# `path` stays a one-element list so any code indexing path[0] keeps working.
path = [os.getcwd()]
sys.path.append(os.path.join(path[0], '..'))
from utils import *
import snowflake.connector

In [2]:
# Let pandas render up to 1000 rows before truncating a displayed frame.
pd.options.display.max_rows = 1000

In [3]:
class SnowflakeConnector(BaseConnector):
    """Connector that opens Snowflake connections using secrets read from a credentials object."""

    def __init__(self, credentials: Credentials):
        """Parse and cache the secrets supplied by ``credentials``.

        Args:
            credentials: object whose ``get_keys()`` returns a mapping; its
                ``'SecretString'`` entry is decoded as JSON. A missing entry
                is treated as an empty JSON object.
        """
        secret_payload = credentials.get_keys().get('SecretString', "{}")
        self._secrets = json.loads(secret_payload)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Open a new Snowflake connection and return it.

        Args:
            dbname: database to connect to.
            schema: schema to connect to.

        Returns:
            The object returned by ``snowflake.connector.connect``.

        Raises:
            KeyError: if any of the ``login_name``, ``login_password``,
                ``account`` or ``warehouse`` secrets is absent.
        """
        secrets = self._secrets
        return snowflake.connector.connect(
            user=secrets['login_name'],
            password=secrets['login_password'],
            account=secrets['account'],
            warehouse=secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
    
## Credentials
# Identifier handed to SSMPSCredentials; presumably the name of the stored
# secret holding the Snowflake login — confirm against `utils`.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection
# A single connection object (`ctx`) is created here against the MAX_PROD
# database and the DATASCIENCE_STAGE schema.
conn = SnowflakeConnector(credentials=SSMPSCredentials(SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="DATASCIENCE_STAGE")

In [4]:
def run_query(query):
    """Run ``query`` on the notebook-level connection ``ctx`` and return the result as a DataFrame.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        pandas.DataFrame holding every fetched row, with one column per entry
        of ``cursor.description`` and all column names lower-cased.

    Raises:
        Whatever ``cursor.execute`` / ``cursor.fetchall`` raise; the cursor is
        closed before the exception propagates.
    """
    cursor = ctx.cursor()
    try:
        cursor.execute(query)
        # The first item of each description entry is the column name.
        column_names = [desc[0].lower() for desc in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=column_names)
    finally:
        # Always release the cursor; previously it was never closed, so every
        # call (and every failed query) left one open.
        cursor.close()

# Input Data

In [13]:
import itertools as it
import os
import io
import logging

import boto3
import sys

In [49]:
# Root logger, used for progress messages while the inputs are read.
logger = logging.getLogger()
logger.info('Loading inputs')

# Container for the loaded inputs (starts empty).
data_list = dict()

# Handle on the S3 bucket that is listed when loading the input files.
s3 = boto3.resource('s3')
bucket = s3.Bucket('hbo-ingest-datascience-content-dev')

In [50]:
# exclude the old Mortal Kombat movie because the trailer percent view
# matching matches the trailer of the new movie to the old movie
EXCLUDED_MATCH_IDS = ['1-GYGQBcwsaCIW2XgEAAAAL', '0-GYGQBcwsaCIW2XgEAAAAL']

for obj in bucket.objects.filter(Prefix='input_percent_view'):
    key = obj.key

    # Name the table after the file: the last path segment up to its first dot.
    # Taking the last segment (instead of indexing segment [1]) avoids an
    # IndexError for keys without a '/', and keeps files in nested folders
    # from all collapsing onto the folder name.
    file_name = key.rsplit('/', 1)[-1]
    if not file_name:
        # Key ends with '/': a folder placeholder with no data to read.
        continue
    var_name = file_name.split('.')[0]

    logger.info('Loading csv file {}'.format(key))
    print('Reading {0} features'.format(var_name))

    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    df = pd.read_csv(obj.get()['Body'], na_values=[np.nan])
    df.columns = df.columns.str.lower()

    # exclude the full null columns
    df = df.dropna(axis='columns', how='all')

    # drop the excluded titles and renumber the remaining rows
    df = (
        df.loc[~df['match_id_platform'].isin(EXCLUDED_MATCH_IDS)]
        .reset_index(drop=True)
    )

    # store the feature df under its name
    data_list[var_name] = df

Reading cost_feature features
Reading funnel_metric_feature features
Reading media_cost_postlaunch_feature features
Reading media_cost_prelaunch_feature features
Reading metadata_feature features
Reading prelaunch_trailer_feature features
Reading production_budget_imdb_feature features
Reading sub_total_feature features
Reading trailer_feature features
Reading vtp_feature features


In [54]:
# Show the keys currently stored in data_list.
data_list.keys()

dict_keys(['cost_feature', 'funnel_metric_feature', 'media_cost_postlaunch_feature', 'media_cost_prelaunch_feature', 'metadata_feature', 'prelaunch_trailer_feature', 'production_budget_imdb_feature', 'sub_total_feature', 'trailer_feature', 'vtp_feature'])

In [70]:
# Bind the 'metadata_feature' entry of data_list to its own name for inspection.
metadata_feature = data_list['metadata_feature']

In [71]:
# Display the metadata rows whose title_name is exactly 'Malignant'.
metadata_feature.loc[metadata_feature['title_name'].eq('Malignant')]

Unnamed: 0,title_name,match_id,match_id_platform,season_number_adj,earliest_offered_timestamp,platform_name,program_type,content_category,single_episode_ind,in_sequantial_releasing_period,at_release_year,dayofweek_earliest_date,total_hours,prod_release_year,title_age_approx,licensor_agg,descriptive_genre_desc_agg,wm_enterprise_genres_agg,navigation_genre_desc_agg
9094,Malignant,GYSaKIQwrHruinAEAAABN,1-GYSaKIQwrHruinAEAAABN,-1,2021-09-10 07:01:00.000,1,acquired,movies,1,0,1,5,1.791111,2021,0.0,"warner bros. inc., warner media direct, llc",horror|thriller,crime | horror | suspense,horror|thriller


In [73]:
# Display the pre-launch trailer rows for match_id_platform
# '1-GYSaKIQwrHruinAEAAABN' (the id shown for 'Malignant' in the metadata output).
prelaunch_trailer_feature = data_list['prelaunch_trailer_feature']
prelaunch_trailer_feature.loc[
    prelaunch_trailer_feature['match_id_platform'].eq('1-GYSaKIQwrHruinAEAAABN')
]

Unnamed: 0,match_id_platform,day000_trailer_metric_d28,day001_trailer_metric_d28,day002_trailer_metric_d28,day003_trailer_metric_d28,day004_trailer_metric_d28,day005_trailer_metric_d28,day006_trailer_metric_d28,day007_trailer_metric_d28,day008_trailer_metric_d28,...,day018_trailer_metric_d28,day019_trailer_metric_d28,day020_trailer_metric_d28,day021_trailer_metric_d28,day022_trailer_metric_d28,day023_trailer_metric_d28,day024_trailer_metric_d28,day025_trailer_metric_d28,day026_trailer_metric_d28,day027_trailer_metric_d28
4656,1-GYSaKIQwrHruinAEAAABN,0.012449,0.009662,0.007358,0.004263,0.003988,0.003767,0.003562,0.003408,0.003304,...,0.002518,0.002415,0.002321,0.002239,0.002141,0.001518,0.000824,0.000246,0.000169,9.4e-05


In [74]:
# Display the trailer feature rows for match_id_platform
# '1-GYSaKIQwrHruinAEAAABN' (the id shown for 'Malignant' in the metadata output).
trailer_feature = data_list['trailer_feature']
trailer_feature.loc[
    trailer_feature['match_id_platform'].eq('1-GYSaKIQwrHruinAEAAABN')
]

Unnamed: 0,match_id_platform,cumulative_day_num,total_trailer_num,retail_trailer_view_metric,avg_trail_metric_per_day
19,1-GYSaKIQwrHruinAEAAABN,52,5,0.012954,0.000249


In [75]:
from datetime import datetime

# Take one timestamp and keep it in `now`.
now = datetime.now()

# Render it as HH:MM:SS; format() on a datetime applies the same strftime pattern.
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 21:26:20


In [76]:
# Display the value bound to `now` as the cell output.
now

datetime.datetime(2021, 9, 22, 21, 26, 20, 510647)