In [112]:
import pandas as pd
import numpy as np
import itertools as it
import os
import io
import logging

import boto3
import sys
import json

In [113]:
import os
import sys
# Resolve the working directory in pure Python instead of the `!pwd` shell
# escape, which is IPython-only syntax and needs a POSIX shell.
# Kept as a one-element list so existing `path[0]` lookups keep working.
path = [os.getcwd()]
# Make the parent directory importable (presumably where `utils` lives — confirm).
sys.path.append(os.path.join(path[0], '..'))
from utils import *
import snowflake.connector

In [2]:
# Allow up to 1000 rows to be rendered before pandas truncates a displayed frame.
pd.options.display.max_rows = 1000

In [11]:
class SnowflakeConnector(BaseConnector):
    """Connector that opens Snowflake sessions from login details stored in a JSON secret."""

    def __init__(self, credentials: Credentials):
        """Decode the login details carried by ``credentials``.

        ``credentials.get_keys()`` is expected to return a mapping; its
        ``'SecretString'`` entry is parsed as JSON and treated as an empty
        JSON object when the entry is absent.
        """
        raw_secret = credentials.get_keys().get('SecretString', "{}")
        self._secrets = json.loads(raw_secret)

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        """Open a Snowflake connection and return it.

        Args:
            dbname: Database to select for the session.
            schema: Schema to select for the session (``'DEFAULT'`` if omitted).

        Returns:
            Whatever ``snowflake.connector.connect`` returns.

        Raises:
            KeyError: If the decoded secret lacks ``login_name``,
                ``login_password``, ``account`` or ``warehouse``.
        """
        secrets = self._secrets
        return snowflake.connector.connect(
            user=secrets['login_name'],
            password=secrets['login_password'],
            account=secrets['account'],
            warehouse=secrets['warehouse'],
            database=dbname,
            schema=schema,
        )
    
## Credentials
# Name handed to SSMPSCredentials to look up the Snowflake login secret.
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection
# `ctx` is the live connection that `run_query` executes against.
conn = SnowflakeConnector(credentials=SSMPSCredentials(SF_CREDS))
ctx = conn.connect(dbname="MAX_PROD", schema="DATASCIENCE_STAGE")

In [12]:
def run_query(query, connection=None):
    """Run a SQL statement and return its result set as a DataFrame.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        connection: Optional DB-API style connection to run against. When
            omitted, the notebook-level ``ctx`` connection is used, so
            existing one-argument calls behave as before.

    Returns:
        pandas.DataFrame with one row per fetched record and lower-cased
        column names taken from ``cursor.description``.
    """
    active_connection = ctx if connection is None else connection
    cursor = active_connection.cursor()
    try:
        cursor.execute(query)
        # Lower-case the names reported by the driver before building the frame.
        column_names = [desc[0].lower() for desc in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=column_names)
    finally:
        # Always release the cursor, including when execute/fetch raises.
        cursor.close()

In [48]:
# Distinct ListenFirst brand names from the Twitter history table, upper-cased
# into `lf_title` so they can be matched against the upper-cased `am_title`.
lf_twitter = run_query('''SELECT DISTINCT UPPER(LFM_BRAND_NAME) as lf_title
                        FROM MAX_DEV.WORKSPACE.LISTENFIRST_DATASET_BRAND_TWITTER_HIST
                        ''')

In [84]:
# Title universe used by the coverage checks: IMDb movies / TV series /
# mini-series joined to Ampere metadata and USA occurrences, keeping only the
# rn = 1 row of each `title_id` partition.
#   - IMDB_ID prefers REMAPPED_TITLE_ID and falls back to title_id.
#   - am_title is the upper-cased ORIGINAL_TITLE.
#   - Platforms 'HBO' and 'HBO Max' are both reported as 'HBO Max'.
# NOTE(review): both joins are inner joins, so the `PLATFORM IS NULL -> 'None'`
#   branch only fires for occurrence rows whose PLATFORM value is NULL — confirm intended.
# NOTE(review): `title_id` and `date` in the window clause are unqualified, and the
#   partition uses title_id while IMDB_ID may be the remapped id — confirm this
#   yields one row per IMDB_ID.
# NOTE(review): some saved outputs show a `language` column this query does not
#   select; those outputs presumably come from an earlier version — confirm.
title_scope = run_query('''
select distinct * from (    
 SELECT  COALESCE (imdb.REMAPPED_TITLE_ID, imdb.title_id) as IMDB_ID, 
       imdb.ORIGINAL_TITLE as title, 
       upper(imdb.ORIGINAL_TITLE)as am_title,
        CASE WHEN amp_o.PLATFORM IS NULL THEN 'None'
            WHEN amp_o.PLATFORM IN ('HBO', 'HBO Max') THEN 'HBO Max' 
            ELSE amp_o.PLATFORM 
       END AS PLATFORM,  imdb.number_of_votes,
       CASE WHEN CONTAINS(amp_meta.ORIGINAL, 'Original') THEN 'Yes' Else 'No' END AS IS_ORIGINAL,
       imdb.PRODUCTION_RELEASE_DATE,
       row_number() over (partition by title_id order by date desc) as rn
FROM "ENTERPRISE_DATA"."CATALOG"."IMDB_TITLE" imdb 
JOIN "MAX_PROD"."CKG"."AMPERE_METADATA"amp_meta on imdb.title_id = amp_meta.imdb_id
JOIN "MAX_PROD"."CKG"."AMPERE_OCCURRENCE" amp_o ON  amp_meta.content_pid = amp_o.content_pid
where 1=1
and imdb.title_type IN ('movie', 'tvSeries', 'tvMiniSeries')
and amp_meta.CONTENT_TYPE in ('TVShow', 'Movie')
and amp_o.country = 'USA'
  ) where rn = 1
''')

In [90]:
# Cast the vote counts to int in place; later cells compare this column with numeric thresholds.
# NOTE(review): astype(int) fails if the column contains nulls — confirm the source never returns NULL votes.
title_scope['number_of_votes'] = title_scope['number_of_votes'].astype(int)

In [61]:
# Left-join the ListenFirst names onto the title universe by upper-cased title;
# titles without a ListenFirst match keep a null `lf_title`.
mapping = title_scope.merge(
    lf_twitter,
    how='left',
    left_on=['am_title'],
    right_on=['lf_title'],
)

In [72]:
# HBO Max titles released from 2016 on that have no ListenFirst match,
# limited to vote counts above 11987.4 (the 0.9 quantile reported by the
# quantile cell), most-voted first.
(
    mapping
    .loc[
        mapping['lf_title'].isna()
        & (mapping['production_release_date'] >= '2016')
        & (mapping['platform'] == 'HBO Max')
        & (mapping['number_of_votes'] > 11987.4)
    ]
    .sort_values('number_of_votes', ascending=False)
)

Unnamed: 0,imdb_id,language,title,am_title,platform,number_of_votes,is_original,production_release_date,rn,lf_title
998,tt7366338,en,Chernobyl,CHERNOBYL,HBO Max,716901,Yes,2019-05-06,1,
40162,tt2975590,en,Batman v Superman: Dawn of Justice,BATMAN V SUPERMAN: DAWN OF JUSTICE,HBO Max,690545,Yes,2016-03-19,1,
9890,tt0451279,en,Wonder Woman,WONDER WOMAN,HBO Max,645202,Yes,2017-05-30,1,
67051,tt6723592,en,Tenet,TENET,HBO Max,478234,Yes,2020-08-22,1,
43400,tt12361974,en,Zack Snyder's Justice League,ZACK SNYDER'S JUSTICE LEAGUE,HBO Max,384446,Yes,2021-03-18,1,
63768,tt1517451,en,A Star Is Born,A STAR IS BORN,HBO Max,376897,Yes,2018-08-31,1,
1600,tt1950186,en,Ford v Ferrari,FORD V FERRARI,HBO Max,376249,Yes,2019-11-15,1,
4091,tt6334354,en,The Suicide Squad,THE SUICIDE SQUAD,HBO Max,336620,Yes,2021-07-28,1,
20336,tt4649466,en,Kingsman: The Golden Circle,KINGSMAN: THE GOLDEN CIRCLE,HBO Max,321756,Yes,2017-09-20,1,
14733,tt7349950,en,It Chapter Two,IT CHAPTER TWO,HBO Max,254767,Yes,2019-08-26,1,


In [67]:
# Spot check: ListenFirst names that start with 'DARK', in alphabetical order.
(
    lf_twitter
    .loc[lf_twitter['lf_title'].str.startswith('DARK')]
    .sort_values('lf_title')
)

Unnamed: 0,lf_title
26435,DARK
16429,DARK ANGEL
8656,DARK AWAKENING
2062,DARK DESIRE
27428,DARK DESIRE (NETFLIX)
34628,DARK HORSE
46025,DARK LIGHT
33596,DARK MATTER
51770,DARK MONEY
8744,DARK MOON


In [42]:
# Distribution of the numeric columns (vote counts in particular) at the
# 10th / 50th / 75th / 90th percentiles. numeric_only=True is passed explicitly
# because pandas >= 2.0 no longer skips string columns by default and would
# raise on this frame's text columns.
title_scope.quantile([.1, .5, .75, .90], numeric_only=True)

Unnamed: 0,number_of_votes,rn
0.1,7.0,1.0
0.5,259.0,1.0
0.75,1844.0,1.0
0.9,11987.4,1.0


In [106]:
# Percentage of null values in each column of the ListenFirst mapping.
mapping.isna().sum().mul(100).div(len(mapping))

imdb_id                     0.000000
language                    0.000000
title                       0.000000
am_title                    0.000000
platform                    0.000000
number_of_votes             0.000000
is_original                 0.000000
production_release_date     1.483777
rn                          0.000000
lf_title                   86.860137
dtype: float64

In [104]:
# Keep only titles that have received at least one vote.
mapping = mapping.loc[mapping['number_of_votes'].gt(0)]

In [None]:
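# TOP 13% — NOTE(review): presumably the ~13% of titles that have a ListenFirst match (lf_title is ~86.9% null above); confirm.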

In [111]:
# Null percentage of every column, computed separately for each of the nine
# listed streaming platforms (ListenFirst mapping).
# Original inline note: "30%" — its meaning is not evident from this cell.
mapping.loc[
    mapping['platform'].isin([
        'Amazon Prime Video', 'Apple TV+', 'Discovery+', 'Disney+', 'HBO Max',
        'Hulu', 'Netflix', 'Paramount+', 'Peacock',
    ])
].groupby(['platform']).apply(lambda grp: grp.isna().sum() * 100 / len(grp))

Unnamed: 0_level_0,imdb_id,language,title,am_title,platform,number_of_votes,is_original,production_release_date,rn,lf_title
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Amazon Prime Video,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.562557,0.0,93.37108
Apple TV+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.106383
Discovery+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,54.061471
Disney+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120192,0.0,64.903846
HBO Max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300136,0.0,70.313779
Hulu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.690943,0.0,77.647059
Netflix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21889,0.0,73.700339
Paramount+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.596792,0.0,84.520701
Peacock,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139405,0.0,80.994424


# Wiki

In [85]:
# Distinct IMDb ids that have an English-language ('en') row in the
# IMDb <-> Wikipedia id mapping, exposed as `wiki_imdb_id`.
wiki_titles = run_query('''select distinct imdb_id as wiki_imdb_id from "MAX_PROD"."CKG"."ID_MAPPING_IMDB_WIKIPEDIA"
                    WHERE language = 'en'
''')

In [91]:
# Left-join the Wikipedia-mapped ids onto the title universe by IMDb id;
# titles without a Wikipedia mapping keep a null `wiki_imdb_id`.
mapping_wiki = title_scope.merge(
    wiki_titles,
    how='left',
    left_on=['imdb_id'],
    right_on=['wiki_imdb_id'],
)

In [99]:
# HBO Max titles released from 2016 on that have no Wikipedia mapping,
# limited to vote counts above 11987.4 (the 0.9 quantile reported by the
# quantile cell), most-voted first.
(
    mapping_wiki
    .loc[
        mapping_wiki['wiki_imdb_id'].isna()
        & (mapping_wiki['production_release_date'] >= '2016')
        & (mapping_wiki['platform'] == 'HBO Max')
        & (mapping_wiki['number_of_votes'] > 11987.4)
    ]
    .sort_values('number_of_votes', ascending=False)
)

Unnamed: 0,imdb_id,title,am_title,platform,number_of_votes,is_original,production_release_date,rn,wiki_imdb_id
24935,tt12343534,Jujutsu Kaisen,JUJUTSU KAISEN,HBO Max,53598,Yes,2020-10-02,1,
77749,tt9679542,Dr. Stone,DR. STONE,HBO Max,18621,Yes,2019-07-05,1,
63819,tt5607616,"Re: Zero, Starting Life in Another World","RE: ZERO, STARTING LIFE IN ANOTHER WORLD",HBO Max,17756,Yes,2016-04-03,1,
44154,tt14324650,"Batman: The Long Halloween, Part One","BATMAN: THE LONG HALLOWEEN, PART ONE",HBO Max,15697,Yes,2021-06-21,1,
58907,tt13196080,Tokyo Revengers,TOKYO REVENGERS,HBO Max,14819,Yes,2021-03-30,1,
52449,tt14402926,"Batman: The Long Halloween, Part Two","BATMAN: THE LONG HALLOWEEN, PART TWO",HBO Max,12758,Yes,2021-07-27,1,


In [107]:
# Keep only titles that have received at least one vote.
mapping_wiki = mapping_wiki.loc[mapping_wiki['number_of_votes'].gt(0)]

In [108]:
# Percentage of null values in each column of the Wikipedia mapping.
mapping_wiki.isna().sum().mul(100).div(len(mapping_wiki))

imdb_id                     0.000000
title                       0.000000
am_title                    0.000000
platform                    0.000000
number_of_votes             0.000000
is_original                 0.000000
production_release_date     1.483908
rn                          0.000000
wiki_imdb_id               43.697172
dtype: float64

In [110]:
# Null percentage of every column, computed separately for each of the nine
# listed streaming platforms (Wikipedia mapping).
# Original inline note: "30%" — its meaning is not evident from this cell.
mapping_wiki.loc[
    mapping_wiki['platform'].isin([
        'Amazon Prime Video', 'Apple TV+', 'Discovery+', 'Disney+', 'HBO Max',
        'Hulu', 'Netflix', 'Paramount+', 'Peacock',
    ])
].groupby(['platform']).apply(lambda grp: grp.isna().sum() * 100 / len(grp))

Unnamed: 0_level_0,imdb_id,title,am_title,platform,number_of_votes,is_original,production_release_date,rn,wiki_imdb_id
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Amazon Prime Video,0.0,0.0,0.0,0.0,0.0,0.0,1.562787,0.0,47.192499
Apple TV+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.148936
Discovery+,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,72.73227
Disney+,0.0,0.0,0.0,0.0,0.0,0.0,0.120192,0.0,20.192308
HBO Max,0.0,0.0,0.0,0.0,0.0,0.0,0.272554,0.0,16.026165
Hulu,0.0,0.0,0.0,0.0,0.0,0.0,0.693273,0.0,27.787146
Netflix,0.0,0.0,0.0,0.0,0.0,0.0,0.218938,0.0,30.136836
Paramount+,0.0,0.0,0.0,0.0,0.0,0.0,0.598131,0.0,23.028037
Peacock,0.0,0.0,0.0,0.0,0.0,0.0,0.139276,0.0,37.000929
