In [2]:
import pandas as pd
from tqdm import notebook
from itertools import combinations
from collections import Counter
import networkx as nx
from collections import defaultdict
import numpy as np
import pickle
import random

### Load Data

In [14]:
# Movie metadata: read, order by release date, and drop the stray CSV index column.
movie = (
    pd.read_csv('movie.csv', sep=",", low_memory=False)
    .sort_values(by=['openDt'], axis=0)
    .reset_index(drop=True)
    .drop(['Unnamed: 0'], axis=1)
)

# Daily box-office records, ordered by their showRange value.
daily = (
    pd.read_csv('daily.csv', sep=",", low_memory=False)
    .sort_values(by=['showRange'], axis=0)
    .reset_index(drop=True)
)

# One row per movie: the last record in showRange order; its audiAcc is used
# as the movie's box office from here on.
movieAudiDf = (
    daily.copy()
    .drop_duplicates(['movieCd'], keep='last', ignore_index=True)
    [['movieCd', 'audiAcc']]
    .reset_index(drop=True)
)

# Attach the box office to the movie metadata; only movies present in both
# tables survive the inner join. Rows are ordered by release date again.
normal_data = (
    pd.merge(movieAudiDf, movie, on='movieCd', how='inner')
    .sort_values(by=['openDt'], axis=0)
    .reset_index(drop=True)
)

# Manual release-date overrides, keyed by row label *after* the sort above.
# NOTE(review): these labels depend on the exact CSV contents and sort result;
# presumably they are rows whose openDt was missing (sorted last) — confirm,
# and ideally key the overrides by movieCd instead.
open_dt_overrides = {
    3740: 19530402.0,
    3741: 20111215.0,
    3742: 20130430.0,
    3743: 19540922.0,
    3744: 20141211.0,
}
for row_label, open_dt in open_dt_overrides.items():
    normal_data.loc[row_label, 'openDt'] = open_dt

# Re-sort with the patched dates and keep only releases before 2020
# (openDt is a YYYYMMDD-encoded number, see the table output).
normal_data = (
    normal_data
    .sort_values(by=['openDt'], axis=0)
    .loc[lambda frame: frame['openDt'] < 20200000]
    .reset_index(drop=True)
)
print("Total number of movie (until 2019) : ", len(normal_data))

normal_data.tail()

Total number of movie (until 2019) :  3418


Unnamed: 0,movieCd,audiAcc,movieNm,movieNmEn,openDt,typeNm,showTm,nations,actors,companys,audits,directors,genres
3413,20199981,211012,눈의 여왕4,The Snow Queen: Mirrorlands,20191224.0,장편,87.0,러시아,20162319|20137223|20174743|20312032|20309149|2...,20100932,2019-MF02349|전체관람가,20333593|20333594,애니메이션|가족|판타지|어드벤처
3414,20198818,201946,프린스 코기,The Queen's Corgi,20191224.0,장편,82.0,벨기에,20191807|20333138|10047326|20232454|20129731|2...,20142048,2019-MF02388|전체관람가,10031148,애니메이션
3415,20192721,728328,캣츠,Cats,20191224.0,장편,,미국|영국,10062748|10084561|10055199|10067573|10068911|1...,20100603,2019-MF02543|12세이상관람가,10081129,뮤지컬|드라마
3416,20184571,1987842,천문: 하늘에 묻는다,Forbidden Dream,20191226.0,장편,132.0,한국,10072251|10087518|10040262|20175721|10088453|1...,20188021,2019-MF02489|12세이상관람가,10088463,사극
3417,20196272,957894,미드웨이,Midway,20191231.0,장편,138.0,미국,10082878|10082678|10052042|20129264|10042114|1...,20100634,2019-MF02551|15세이상관람가,10018115,액션|드라마


### Extraction of Actor Synergy Graph

In [5]:
# Star-power history per actor, filled in the row order of normal_data.
#   st_time      : actorCd -> release dates (openDt) of the actor's movies
#   st_boxoffice : actorCd -> box office (audiAcc) of those same movies
# Both lists grow together, so position i in one matches position i in the other.
st_time = defaultdict(list)
st_boxoffice = defaultdict(list)

for _, movie_row in normal_data.iterrows():
    release_dt = movie_row['openDt']
    audience = movie_row['audiAcc']
    # A set so an actor credited twice in one movie is counted once.
    for actor_cd in set(movie_row['actors'].split('|')):
        st_time[actor_cd].append(release_dt)
        st_boxoffice[actor_cd].append(audience)

print('Total number of actors measured for star power : ',len(st_boxoffice))

Total number of actors measured for star power :  17556


In [6]:
#Indexing
# Builds one integer id space shared by movie nodes and actor nodes:
#   index_map         : int id -> node key
#   reverse_index_map : node key -> int id
# Movie ids come first (in normal_data row order), actor ids follow.

#Movie node list ('M' + movieCd; presumably to keep movie keys apart from actor codes)
movieCdlist_original = normal_data['movieCd'].tolist()
movieCdlist = ['M' + mv for mv in movieCdlist_original]

#Actor node list
actorsetlist = normal_data['actors'].tolist()
# sorted() instead of list(set(...)): the iteration order of a set of strings
# depends on per-process hash randomization, which made the actor ids differ
# between kernel sessions. Sorting makes the mapping reproducible.
actorlist = sorted({actor_cd for cast in actorsetlist for actor_cd in cast.split('|')})
actorCdlist = actorlist.copy()

index_map = dict(enumerate(movieCdlist + actorCdlist))
# If a key occurs more than once, the highest id wins (same as assigning in order).
reverse_index_map = {node: node_id for node_id, node in index_map.items()}
index = len(index_map)  # next free id, as left behind by the original counter

In [7]:
def disynergy(actor_st_time, actor_st_boxoffice, avg_synergy, boxoffice, relatedate):
    """Compare one movie against an actor's earlier movies and return five scores.

    The first four scores are slopes ``(boxoffice - y) / (relatedate - x)``
    taken against a reference point ``(x, y)`` from the actor's history:

    0. the first recorded movie,
    1. the movie with the maximum box office (last one on ties),
    2. the movie with the minimum box office (last one on ties),
    3. the mean time and mean box office of the whole history.

    Whenever ``relatedate - x`` is zero the slope is undefined and
    ``avg_synergy`` is used instead.

    4. the fraction of history entries whose box office is strictly below
       ``boxoffice``.

    Parameters
    ----------
    actor_st_time : sequence of numbers
        Release times of the actor's earlier movies.
    actor_st_boxoffice : sequence of numbers
        Box office of those movies, position-aligned with ``actor_st_time``.
    avg_synergy : number
        Fallback value for undefined slopes.
    boxoffice : number
        Box office of the movie being evaluated.
    relatedate : number
        Release time of the movie being evaluated.

    Returns
    -------
    list
        ``[first, maximum, minimum, average, frequency]``. For an empty
        history the result is ``[avg_synergy] * 4 + [0]`` (previously this
        raised ``IndexError``).
    """
    history_size = len(actor_st_boxoffice)
    if history_size == 0:
        # Nothing to compare against: every slope falls back to avg_synergy
        # and no earlier movie was beaten.
        return [avg_synergy, avg_synergy, avg_synergy, avg_synergy, 0]

    def _slope(ref_time, ref_boxoffice):
        elapsed = relatedate - ref_time
        if elapsed == 0:
            return avg_synergy
        return (boxoffice - ref_boxoffice) / elapsed

    def _last_position_of(target):
        # Last occurrence, so ties resolve to the most recently appended entry.
        return [pos for pos, value in enumerate(actor_st_boxoffice) if value == target][-1]

    # max()/min() are evaluated once here rather than once per element.
    peak_pos = _last_position_of(max(actor_st_boxoffice))
    low_pos = _last_position_of(min(actor_st_boxoffice))

    di_synergy = [
        _slope(actor_st_time[0], actor_st_boxoffice[0]),
        _slope(actor_st_time[peak_pos], actor_st_boxoffice[peak_pos]),
        _slope(actor_st_time[low_pos], actor_st_boxoffice[low_pos]),
        _slope(np.mean(actor_st_time), np.mean(actor_st_boxoffice)),
    ]

    beaten = sum(1 for past in actor_st_boxoffice if boxoffice > past)
    di_synergy.append(beaten / history_size)
    return di_synergy

In [15]:
#Synergy Graph Extraction
# For every movie and every pair of its actors who share at least two movies,
# record two directed synergy values, then average them per directed pair,
# min-max normalise, and load them into a weighted DiGraph.
base_relatedate = normal_data.iloc[0]['openDt']
base_boxoffice = normal_data.iloc[0]['audiAcc']
sy_synergy = defaultdict(list) # key : (actor code, actor code) / value : synergy list


def _synergy_from_history(actor, avg_synergy, boxoffice, relatedate):
    """Score a movie against `actor`'s movies released strictly before `relatedate`.

    Returns the five disynergy() scores, or [avg_synergy]*4 + [0] when the
    actor has no earlier movie (a "new" actor).
    """
    earlier = [
        (past_time, past_boxoffice)
        for past_time, past_boxoffice in zip(st_time[actor], st_boxoffice[actor])
        if past_time < relatedate
    ]
    if not earlier:
        return [avg_synergy, avg_synergy, avg_synergy, avg_synergy, 0]
    past_times = [past_time for past_time, _ in earlier]
    past_boxoffices = [past_boxoffice for _, past_boxoffice in earlier]
    return disynergy(past_times, past_boxoffices, avg_synergy, boxoffice, relatedate)


# Number of movies each unordered actor pair shares, computed once up front.
# This replaces two full-column `str.contains` scans per pair per movie and
# matches whole actor codes instead of regex substrings.
pair_movie_count = Counter()
for cast in normal_data['actors']:
    pair_movie_count.update(
        frozenset(pair) for pair in combinations(set(cast.split('|')), 2)
    )

for index, row in notebook.tqdm(normal_data.iterrows(), total=len(normal_data)):
    relatedate = row['openDt']#movie release date
    boxoffice = row['audiAcc']#movie box office
    # Sorted so the pair orientation (and thus edge insertion order) is reproducible.
    actors = sorted(set(row['actors'].split('|')))#List of actors in a movie

    # Market baseline: slope between this movie and the average earlier movie.
    # NOTE(review): openDt is a YYYYMMDD-encoded number, so date differences
    # here (and in disynergy) are not day counts — confirm this is intended.
    beforeDf = normal_data.loc[normal_data['openDt']<relatedate]
    if len(beforeDf)!=0 :
        avg_boxoffice = np.mean(beforeDf['audiAcc'].tolist())
        avg_relatedate = int(pd.to_datetime(pd.Series(beforeDf['openDt'].tolist()), format='%Y%m%d').mean().strftime("%Y%m%d"))
        avg_synergy = (boxoffice - avg_boxoffice)/(relatedate - avg_relatedate)
    else :
        # No earlier movie at all: the fallback scores are all zero.
        avg_synergy = 0

    # Mean score per actor for this movie, computed lazily and reused across pairs.
    score_from_history = {}

    actorsetlist = list(combinations(actors,2))
    for a,b in actorsetlist : # a,b Synergy calculations giving and receiving each other
        if pair_movie_count[frozenset((a, b))] < 2 :
            continue
        for actor in (a, b) :
            if actor not in score_from_history :
                score_from_history[actor] = np.mean(
                    _synergy_from_history(actor, avg_synergy, boxoffice, relatedate)
                )
        sy_synergy[(a,b)].append(score_from_history[b])  # a→b : scored against b's history
        sy_synergy[(b,a)].append(score_from_history[a])  # b→a : scored against a's history

# Normalization
pair_means = {pair: np.mean(values) for pair, values in sy_synergy.items()}
synergies_list = list(pair_means.values())
length = len(synergies_list)
synergies_list_min = min(synergies_list) if synergies_list else 0.0
synergies_list_max = max(synergies_list) if synergies_list else 0.0
synergy_span = synergies_list_max - synergies_list_min

mapactorslist = []
for (a, b), syn_mean in pair_means.items():
    # Min-max scale to [0, 1]; when all means are equal the weight is set to 0
    # instead of dividing by zero.
    weight = (syn_mean - synergies_list_min)/synergy_span if synergy_span != 0 else 0.0
    mapactorslist.append((reverse_index_map[a], reverse_index_map[b], weight))

synergyGraph = nx.DiGraph()
synergyGraph.add_weighted_edges_from(mapactorslist)
print("Total number of actor nodes : ",len(synergyGraph.nodes())) 
print("Total number of edges : ",len(synergyGraph.edges())) 

0it [00:00, ?it/s]

Total number of actor nodes :  3041
Total number of edges :  32594
