# 0 : Environment_Setup

### 0.1 : Load_packages

In [1]:
!pip install urllib2 --user

[31mERROR: Could not find a version that satisfies the requirement urllib2 (from versions: none)[0m
[31mERROR: No matching distribution found for urllib2[0m


In [1]:
!pip install IMDbPY BeautifulSoup4  --user



In [1]:
# general
import boto3
import datetime as dt
import json
import numpy as np
import pandas as pd
import snowflake.connector

from abc import ABCMeta, abstractmethod
from scipy.spatial.distance import squareform
#from scipy.cluster.hierarchy import ward
from scipy.cluster.hierarchy import  linkage
from scipy.cluster.hierarchy import fcluster

In [2]:
pd.set_option('display.max_rows', 1000)


### 0.2 : Connection to Snowflake

In [124]:
from abc import ABCMeta, abstractmethod

class Credentials(metaclass=ABCMeta):
    pass
    
    
class SSMPSCredentials(Credentials):
    def __init__(self, secretid: str):
        self._secretid = secretid
        self._secrets = {}
        
    def get_keys(self):
        """
        credential fetching 
        """
        _aws_sm_args = {'service_name': 'secretsmanager', 'region_name': 'us-east-1'}
        secrets_client = boto3.client(**_aws_sm_args)
        get_secret_value_response = secrets_client.get_secret_value(SecretId=self._secretid)
        return get_secret_value_response
    
    
class BaseConnector(metaclass=ABCMeta):
    @abstractmethod
    def connect(self):
        raise NotImplementedError
    

class SnowflakeConnector(BaseConnector):
    def __init__(self, credentials: Credentials):
        keys = credentials.get_keys()
        self._secrets = json.loads(keys.get('SecretString', "{}"))

    def connect(self, dbname: str, schema: str = 'DEFAULT'):
        ctx = snowflake.connector.connect(
            user=self._secrets['login_name'],
            password=self._secrets['login_password'],
            account=self._secrets['account'],
            warehouse=self._secrets['warehouse'],
            database=dbname,
            schema=schema
        )

        return ctx
    
## Credentials
SF_CREDS = 'datascience-max-dev-sagemaker-notebooks'

## Snowflake connection 
conn=SnowflakeConnector(SSMPSCredentials(SF_CREDS))
ctx=conn.connect("MAX_DEV","WORKSPACE")

def run_query(query):
    cursor = ctx.cursor()
    cursor.execute(query)
    df = pd.DataFrame(cursor.fetchall(), columns = [desc[0] for desc in cursor.description])
    df.columns= df.columns.str.upper()
    return df



# 1 : Data Load

In [125]:
power_df=run_query('''
select distinct TITLE_ID, MAX(FIRST_VIEWS_7) AS  ACQUIRING_POWER
from max_dev.workspace.forecasting_fv_summary where country_iso_code = 'US'
GROUP BY 1
''')

In [126]:
input_master=run_query('''select * from  max_prod.DATASCIENCE.MAX_AUDIENCE_OVERLAP_FILTERED_ENDPOINT where dt = '2022-04-10' ''')

# input_june=run_query('''select * from  max_prod.DATASCIENCE.MAX_AUDIENCE_OVERLAP_FILTERED_ENDPOINT where dt = '2022-06-26' ''')

# input_may=run_query('''select * from  max_prod.DATASCIENCE.MAX_AUDIENCE_OVERLAP_FILTERED_ENDPOINT where dt = '2022-05-29' ''')

# input_april=run_query('''select * from  max_prod.DATASCIENCE.MAX_AUDIENCE_OVERLAP_FILTERED_ENDPOINT where dt = '2022-04-17' ''')

# input_index=run_query('''
# select * from max_prod.datascience_stage.AUDIENCE_OVERLAP_CONSOLIDATED_IDS 
# ''')


In [86]:
input_master.head()

Unnamed: 0,SCORE,INDEX_1,CONSOLIDATED_TITLE_1,LEVEL_1_CATG_NAME_1,CONSOLIDATED_ID_1,INDEX_2,CONSOLIDATED_TITLE_2,LEVEL_1_CATG_NAME_2,CONSOLIDATED_ID_2,DT
0,1.0,0,euphoria,series,GXKN_xQX5csPDwwEAAABj,0,euphoria,series,GXKN_xQX5csPDwwEAAABj,2022-04-10
1,0.795509,0,euphoria,series,GXKN_xQX5csPDwwEAAABj,1,game of thrones,series,GVU2cggagzYNJjhsJATwo,2022-04-10
2,0.842598,0,euphoria,series,GXKN_xQX5csPDwwEAAABj,2,friends,series,GXdbR_gOXWJuAuwEAACVH,2022-04-10
3,0.666564,0,euphoria,series,GXKN_xQX5csPDwwEAAABj,3,south park,series,GXr7SEgRi2sLCAAEAAAQu,2022-04-10
4,0.206462,0,euphoria,series,GXKN_xQX5csPDwwEAAABj,4,last week tonight with john oliver,series,GVU2cCgUFTYNJjhsJATuH,2022-04-10


In [87]:
input_index=input_master.sort_values(by = ['INDEX_1'])[['INDEX_1','CONSOLIDATED_TITLE_1','LEVEL_1_CATG_NAME_1', 'CONSOLIDATED_ID_1']].drop_duplicates()

In [88]:
input_index=input_index.rename({'CONSOLIDATED_TITLE_1':'CONSOLIDATED_TITLE','LEVEL_1_CATG_NAME_1':'LEVEL_1_CATG_NAME','INDEX_1':'INDEX', 'CONSOLIDATED_ID_1':'CONSOLIDATED_ID'},axis=1)

In [89]:
input_index['CONSOLIDATED_TITLE']=input_index.CONSOLIDATED_TITLE.str.replace('$','')

In [90]:
input_index.head()

Unnamed: 0,INDEX,CONSOLIDATED_TITLE,LEVEL_1_CATG_NAME,CONSOLIDATED_ID
0,0,euphoria,series,GXKN_xQX5csPDwwEAAABj
1782,1,game of thrones,series,GVU2cggagzYNJjhsJATwo
2852,2,friends,series,GXdbR_gOXWJuAuwEAACVH
3927,3,south park,series,GXr7SEgRi2sLCAAEAAAQu
4997,4,last week tonight with john oliver,series,GVU2cCgUFTYNJjhsJATuH


In [91]:
input_index.shape

(1070, 4)

In [92]:
power_df.shape

(6232, 2)

In [93]:
power_df[power_df['TITLE_ID'] == 'GXKN_xQX5csPDwwEAAABj']

Unnamed: 0,TITLE_ID,ACQUIRING_POWER
246,GXKN_xQX5csPDwwEAAABj,306205.0


In [94]:
# from sklearn.preprocessing import quantile_transform

# power_df['title']=power_df['INDEX'].str.lower()
# power_df['ACQUIRING_POWER_std']= quantile_transform(power_df['ACQUIRING_POWER'].values.reshape(-1, 1), n_quantiles=100, random_state=0, copy=True).ravel()
# power_df['RETENTION_POWER_std']= quantile_transform(power_df['RETENTION_POWER'].values.reshape(-1, 1), n_quantiles=100, random_state=0, copy=True).ravel()
all_index=input_index.merge(power_df.drop_duplicates(subset = ["TITLE_ID"]),left_on='CONSOLIDATED_ID',right_on='TITLE_ID',how='left')

# 2 : Data Processing

In [95]:
# Turn input row-based table into a pivoted, square matrix
audience_overlap = pd.pivot_table(
    input_master, 
    values='SCORE', 
    index=['INDEX_1'], 
    columns=['INDEX_2'], 
    aggfunc=np.sum
)
audience_overlap = audience_overlap.reset_index()
audience_overlap = audience_overlap.set_index('INDEX_1')

In [96]:
audience_overlap.iloc[0:3, 0:3]

INDEX_2,0,1,2
INDEX_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.0,0.795509,0.842598
1,0.795509,1.0,0.822794
2,0.842598,0.822794,1.0


In [97]:
audience_overlap.shape

(1070, 1070)

In [98]:
# Calculating the distance matrix from similarity score
# i.e., higher similarity = smaller distance

distance_matrix = 1 - audience_overlap.iloc[:, :]

In [99]:
# convert the redundant n*n square matrix form into a condensed nC2 array
# distance_matrix[{n choose 2}-{n-i choose 2} + (j-i-1)] is the distance between points i and j
distance_matrix = squareform(distance_matrix)

In [100]:
len(distance_matrix)

571915

# 3 : Clustering

In [101]:
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy

In [102]:
# Clusters have been defined by an elbow curve approach and also silhouette to confirm
z = linkage(distance_matrix, method='ward', optimal_ordering=True)

In [103]:
len(input_index)

1070

In [104]:
number_of_clusters = 8

In [105]:
# Generating results for K number of clusters
clusters = fcluster(z, number_of_clusters, criterion='maxclust')

In [106]:
result = all_index.copy()
result['CLUSTER'] = clusters
result['title'] = result['CONSOLIDATED_TITLE']

In [107]:
# result.groupby('CLUSTER').agg({'ACQUIRING_POWER':'mean','RETENTION_POWER':'mean'})

In [108]:
# title_list=['cry macho', 'friends', 'sex and the city', 'south park', 'the big bang theory', 'sesame street', 'game of thrones', 'the matrix: resurrections'
#            , 'euphoria', 'search party']

In [118]:
result\
.sort_values(by=['CLUSTER','ACQUIRING_POWER'], ascending=False).groupby('CLUSTER').tail(5)

Unnamed: 0,INDEX,CONSOLIDATED_TITLE,LEVEL_1_CATG_NAME,CONSOLIDATED_ID,TITLE_ID,ACQUIRING_POWER,CLUSTER,title
1061,7167,moulin rouge!,other,GVU3FdAjALINJjhsJAWsO,,,8,moulin rouge!
1062,7186,julia,other,GVYEs8QbN7SzCKRAJACGM,,,8,julia
1065,7236,game night,other,GYK0FNAd52b2ElQEAAABP,,,8,game night
1067,7285,jingle all the way,other,GVU34KgxBJINJjhsJAaJ-,,,8,jingle all the way
1069,7407,war dogs,other,GYPm-KQYuU2yAwgEAAAA0,,,8,war dogs
986,3231,it's complicated,movies,GWyku8grTIhnCwwEAAAB4,,,7,it's complicated
996,3405,surviving christmas,unknown,GW3sshA73HcNOkQEAAAgu,,,7,surviving christmas
1001,3525,just like heaven,movies,GXRtotQvV_G2YwgEAAABz,,,7,just like heaven
1046,6892,john tucker must die,other,GVYERIAGgZCzCKRAJABKl,,,7,john tucker must die
1059,7124,the family stone,other,GVU4y1A5XNVFvjSoJAd0X,,,7,the family stone


In [119]:
result[result['CONSOLIDATED_TITLE'].str.contains('suicide')]

Unnamed: 0,INDEX,CONSOLIDATED_TITLE,LEVEL_1_CATG_NAME,CONSOLIDATED_ID,TITLE_ID,ACQUIRING_POWER,CLUSTER,title
10,10,the suicide squad,movies,GYOxtow3Wz8PDwgEAAAdw,GYOxtow3Wz8PDwgEAAAdw,238977.0,8,the suicide squad
49,49,suicide squad,movies,GXnEEogAPtp4_wwEAAEvU,GXnEEogAPtp4_wwEAAEvU,11286.0,5,suicide squad
1052,7053,the suicide squad,other,GYTt7RAvZw7gTsAEAAAHi,,,6,the suicide squad


# 4: Cluster Results Analysis

In [111]:
from bs4 import BeautifulSoup
import imdb
import urllib.request  as urllib2 

In [112]:
def get_movie_id(name):
    ia = imdb.IMDb()
    # searching the name 
    search = ia.search_movie(name)
    # loop for printing the name and id
    for i in range(len(search)):
        # getting the id
        id = search[i].movieID
        # printing it
        print(search[i]['title'] + " : " + id )
        return id

In [120]:
from IPython.display import HTML, display,IFrame
import imdb
import urllib.request  as urllib2 

access = imdb.IMDb()
num_titles = 10

topmovies_df=result.drop_duplicates(subset='title').sort_values(by=['CLUSTER','ACQUIRING_POWER'], ascending=False).groupby('CLUSTER').head(10)
topmovies_df.head()
htmlstr=f'''<table>'''
n=num_titles
for index, row in topmovies_df.iterrows():    
    if (n>0):
        if (n==num_titles):
            htmlstr=htmlstr+'<tr>'
        try:
            movie_id=get_movie_id(row['title'])
            cluster_id=row['CLUSTER']
            movie = access.get_movie(movie_id)
            imgurl=movie['full-size cover url']
            htmlstr=htmlstr+f'''<td><img src={imgurl} width="120""></td>'''
        except:
            htmlstr=htmlstr+f'''<td>Missing</td>'''
        #print(row['title'], row['CLUSTER'])
        if (n==1):
            htmlstr=htmlstr+'</tr>'
            n=num_titles
        else:
            n=n-1  
htmlstr=htmlstr+'</table>'

The Suicide Squad : 6334354
Dune : 1160419
The Little Things : 10016180
The Many Saints of Newark : 8110232
King Richard : 9620288
Cry Macho : 1924245
Malignant : 3811906
Those Who Wish Me Dead : 3215824
Peacemaker : 13146488
An American Pickle : 9059704
Friends: The Reunion : 11337862
Euphoria : 8772296
And Just Like That... : 13819960
Friends : 0108778
In the Heights : 1321510
Gossip Girl : 0397442
The Fallout : 11847410
Pretty Little Liars : 1578873
Space Jam: A New Legacy : 3554046
The Conjuring: The Devil Made Me Do It : 7069210
Tom & Jerry : 1361336
Charm City Kings : 9048840
The Fresh Prince of Bel-Air Reunion : 13315308
Judas and the Black Messiah : 9784798
The Witches by Roald Dahl (2021) (Podcast Episode) - The Drunk Guys Book Club Podcast : 18793996
South Park : 0121955
The Fresh Prince of Bel-Air : 0098800
Looney Tunes Cartoons : 8543208
Godzilla vs. Kong : 5034838
Wonder Woman 1984 : 7126948
Mortal Kombat : 0293429
Zack Snyder's Justice League : 12361974
The Matrix: Resurr

2022-08-15 06:14:09,548 CRITICAL [imdbpy] /home/ec2-user/.local/lib/python3.6/site-packages/imdb/_exceptions.py:34: IMDbParserError exception raised; args: ('invalid title: """"',); kwds: {}
NoneType: None


The Shop : 8755712
Band of Brothers : 0185906
Ballers : 2891574
McMillions : 9148598
Jojo Rabbit : 2584384
Class Action Park : 11015214
Expecting Amy : 11188392
The Way Down: God, Greed and the Cult of Gwen Shamblin (2021) (Podcast Episode)  - Season 1 | Episode 57  - Tig and Cheryl: True Story : 21289854
What's Love Got to Do with It : 0108551
What Happened, Brittany Murphy? : 14396056
The Bee Gees: How Can You Mend a Broken Heart : 9850386
Allen v. Farrow : 13990468
I'll Be Gone in the Dark : 8259114
Q: Into the Storm : 14215442
Heaven's Gate: The Cult of Cults : 11210146
Insecure : 5024912
Love Life : 10380768
Succession : 7660850
Last Week Tonight with John Oliver : 3530232
The Flight Attendant : 7569576
Real Time with Bill Maher : 0350448
The Gilded Age : 4406178
Legendary : 11048090
I Know This Much Is True : 0425118
Curb Your Enthusiasm : 0264235
The Big Bang Theory : 0898266
A West Wing Special to Benefit When We All Vote : 13180026
Raised by Wolves : 9170108
Westworld : 047578

In [121]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [122]:

from IPython.display import HTML, display,IFrame
display(HTML(htmlstr))

0,1,2,3,4,5,6,7,8,9
,,,,,,,,,
,,,,,Missing,,,Missing,
,,,,,,Missing,,,
,,,,Missing,,,,,
,,,,Missing,,,,,
,,Missing,,,,,,,
,,,,,,,,,
,,,,,,,,,


In [116]:
result[result['CONSOLIDATED_TITLE'].str.contains('succession')]

Unnamed: 0,INDEX,CONSOLIDATED_TITLE,LEVEL_1_CATG_NAME,CONSOLIDATED_ID,TITLE_ID,ACQUIRING_POWER,CLUSTER,title
9,9,succession,series,GWukCJAq-nIuHwwEAAAB4,GWukCJAq-nIuHwwEAAAB4,43844.0,2,succession
