In [1]:
import json
import os
import sys
from collections import Counter, defaultdict
from itertools import combinations
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import random 
from copy import copy

# Make the project's `src` directory (two levels above the working directory) importable.
src_dir = os.path.abspath(os.path.join(os.pardir, os.pardir, 'src'))
# Prepend instead of overwriting sys.path[0], so the interpreter's own first entry is kept;
# the guard keeps repeated runs of this cell from stacking duplicate entries.
if sys.path[0] != src_dir:
    sys.path.insert(0, src_dir)
print(sys.path[0])
from parser.support import ROLES, CREDITS
from parser.my_mongo_db_login import DB_LOGIN_INFO
import parser.support as support
import network.shift_graph_maker as sgm
import model.contagion as contagion
import model.general as gen


/home/staff/junelee/Research-Project/movie_propagation_reserve/src


In [2]:
%reload_ext autoreload
%autoreload 2
%load_ext snakeviz

%matplotlib inline

In [3]:
# Load the raw movie records.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
# The encoding is pinned so decoding does not depend on the machine's locale
# (assumes the file is UTF-8, the standard JSON encoding — TODO confirm).
with open('/home/projects/movie-network/data/raw_data/movies.json', encoding='utf-8') as f:
    movie_file = f.read()
movie_data = json.loads(movie_file)

In [4]:
# Pick the role under study and load the matching movie and staff tables.
role = 'producing'
role_key = "".join((role, "_gender_percentage"))
all_movies = support.get_movies_df(role_key)
print('Got all_movies')
staff_columns = ['_id', 'female_count', 'first_movie', 'last_movie', 'gender']
gender_df = support.get_staff_df('producers')[staff_columns]

Loaded IMDb movies producing_gender_percentage
Got all_movies


In [5]:
# Keep movies released from 1990 through 1999, ordered by release year.
in_nineties = (all_movies.year >= 1990) & (all_movies.year < 2000)
movie_90s = all_movies[in_nineties].sort_values('year')

In [6]:
# Restrict the 90s movies to the id, producer, gender-percentage, title and year columns.
producer_columns = ['_id', 'producers', 'producing_gender_percentage', 'title', 'year']
movie_producer_df = movie_90s[producer_columns]

In [187]:
# Flatten the per-movie producer lists into one list holding the first field of every entry
# (duplicates are kept).
producer_90s = [
    credit[0]
    for movie_credits in movie_producer_df.producers.tolist()
    for credit in movie_credits
]

In [12]:
# Reload the producer staff table and collect the ids of everyone marked female.
staff_df = support.get_staff_df('producers')
gender_df = staff_df[['_id', 'female_count', 'first_movie', 'last_movie', 'gender']]
is_female = gender_df.gender == 'female'
females = gender_df[is_female]._id.tolist()
len(females)

5572

## Aggregate by year

In [176]:
# Aggregation settings: window width in years and the span of years covered.
agg_year = 5
start_year = 1990
end_year = 2000

# Bin edges start one year early because pd.cut's bins are open on the left.
year_bin_edges = np.arange(start_year - 1, end_year, agg_year)
year_bins = pd.cut(movie_90s['year'], year_bin_edges)
grouped_by_year = movie_90s.groupby(year_bins)

In [177]:
# List the aggregation windows produced by the grouping.
# NOTE(review): `year` and `df` stay bound to the last group once this loop ends, and later
# cells (the network-building and contagion cells) read `df` / `year` without assigning them
# anywhere else in the visible notebook — confirm which group those cells are meant to use.
for year, df in grouped_by_year:
    print(year)


(1989, 1994]
(1994, 1999]


In [188]:
# Female producers who are credited on at least one 90s movie.
# Membership is tested against a set: `producer_90s` is a long list with repeats, and
# scanning it once per female id is needlessly quadratic.
producer_90s_ids = set(producer_90s)
female_90s = [i for i in females if i in producer_90s_ids]
len(female_90s)

1064

## Build projected network

In [34]:
# Model settings passed to the network builder and the contagion run.
belief_type = 'empirical'
p = 0.1
d = 1
threshold = 1

In [180]:
# NOTE(review): the column slice 1:0 is always empty, so this only reveals the number of rows.
# `adopter_history` is first assigned by the contagion cell further down the notebook, so this
# cell fails on a fresh top-to-bottom run.
adopter_history[:,1:0]

array([], shape=(184, 0), dtype=object)

In [16]:
# Map each movie id to its raw producer entries, then to just the first field of each entry,
# so links in the network can be checked against the credits.
producer_role_list = dict(zip(df._id, df.producers))
producer_list = {
    movie_id: [credit[0] for credit in credits]
    for movie_id, credits in producer_role_list.items()
}

In [17]:
# Every distinct producer id appearing in `producer_list`.
unique_producers = {producer for credited in producer_list.values() for producer in credited}
all_producers = list(unique_producers)
len(all_producers)

712

In [18]:
test_id = 'tt0100935'
producer_list[test_id]

['nm0326512', 'nm0474138', 'nm0599863', 'nm0797451']

In [31]:
# Seeds are the 90s producer credits that belong to a female producer
# (one entry per credit, so ids can repeat).
total_females = gender_df[gender_df.gender == 'female']._id.tolist()
total_producers = [i[0] for sublist in movie_90s.producers.tolist() for i in sublist]
# A set lookup avoids scanning the whole female list once per credit.
female_ids = set(total_females)
seeds = [i for i in total_producers if i in female_ids]

In [54]:
len(list(set(seeds)))

1064

In [44]:
G = sgm.build_projected_network(df, seeds, belief_type, threshold)

In [45]:
# Sanity check: a pair of producers joined by exactly two edges in G should also share
# exactly two movies in the credits.
for i, j in combinations(all_producers, 2):
    edges = G.number_of_edges(i, j)
    if edges == 2:
        movie_num = edges
        test_producers = [i, j]
        break
else:
    # Without this guard the cell would either hit a NameError or silently reuse a
    # pair left over from an earlier run.
    raise ValueError('no pair of producers shares exactly 2 edges in G')

# Movies on which both producers of the pair are credited.
shared_movie = [
    movie
    for movie, producers in producer_list.items()
    if all(e in producers for e in test_producers)
]
len(shared_movie) == movie_num

True

## Contagion

### Sequential - No weights

In [46]:
iterations = 20

# Ids of the nodes whose `node_type` attribute is 'P'.
# `G.nodes(data=True)` is used instead of the `G.node` attribute, which newer networkx
# releases no longer provide; it yields (node, attribute-dict) pairs.
producer_node_ids = [n for n, attrs in G.nodes(data=True) if attrs['node_type'] == 'P']


In [47]:
len(df)

183

In [49]:
# Run the sequential contagion on the projected network G for len(df) movies.
# NOTE(review): `df` and `year` are not assigned by any dedicated cell in the visible notebook;
# they are leftovers of an earlier `for year, df in grouped_by_year` loop — confirm which
# group is intended here.
G, adopter_history = contagion.contagion_sequential_projected_network(G, len(df), year, p, d, threshold)

In [50]:
len(adopter_history), len(df)

(184, 183)

### Cumulative adopters over aggregation

In [144]:
dict_test = {}
for i in range(10):
    dict_test['a'] = i
dict_test

{'a': 9}

In [65]:
# def cumulative_adopters_over_time_projected_network_synchronous_update(df, interval, belief_type, p, d, t, iterations=20):
# Keep the current seeds, then narrow them to the distinct ids credited on a 90s movie.
total_female = copy(seeds)
total_producers = [i[0] for sublist in movie_90s.producers.tolist() for i in sublist]
# Set lookup instead of a list scan per credit; the set comprehension also deduplicates,
# so the separate list(set(...)) pass is no longer needed.
seed_ids = set(total_female)
seeds = list({i for i in total_producers if i in seed_ids})

In [53]:
# Initial belief per producer id in the staff table: 1 for seeds, 0 for everyone else.
total_producers = gender_df._id.tolist()
# Set lookup: `seeds` is a list, and scanning it once per producer is quadratic.
seed_lookup = set(seeds)
belief_dict = {n: 1 if n in seed_lookup else 0 for n in total_producers}

In [87]:
df_raw.year.iloc[0]

1990

In [83]:
df_raw = movie_90s.copy(deep=True)

In [164]:
# Aggregation window and span of years.
agg_year = 1
start_year = 1990
end_year = 2000
# Contagion parameters.
p = 0.1
d = 1.0
t = 1.0


def cumulative_adopters_projected_network_sequential(df_raw, interval, seeds, belief_type, p, d, threshold):
    """
    Run the sequential contagion on one projected network per aggregation window and
    stitch the per-window adopter histories into a single cumulative history.

    input
        df_raw - df data with movies as rows; the `year` and `producers` columns are read
        interval - width of each aggregation window, in years
        seeds - ids of the nodes that start with belief 1
        belief_type - forwarded unchanged to sgm.build_projected_network
        p, d - forwarded unchanged to contagion.contagion_sequential_projected_network
        threshold - belief level at or above which a node outside the current window's
            network is still counted as an adopter; also forwarded to the helpers
    output
        2-D array made of the first window's history followed by every later window's
        history without its first row. Column 1 is shifted so it keeps counting across
        windows, and column 2 additionally counts adopters that are not part of the
        window's network. Column 0 is passed through as returned by the contagion helper.
        An empty list is returned when there is nothing to process.
    """
    if len(df_raw) == 0:
        return []

    # min/max rather than first/last row, so the input does not have to be sorted by year.
    first_edge = df_raw.year.min() - 1
    last_year = df_raw.year.max()
    # Extending the stop by one interval guarantees the final bin reaches the last year even
    # when the span is not a multiple of the interval. pd.cut bins are open on the left,
    # hence the first edge sits one year before the earliest movie.
    bin_edges = np.arange(first_edge, last_year + interval, interval)
    grouped_by_year = df_raw.groupby(pd.cut(df_raw['year'], bin_edges))

    # make dictionary that keeps track of belief scale across windows
    seed_set = set(seeds)
    total_nodes = {credit[0] for credits in df_raw.producers.tolist() for credit in credits}
    belief_dict = {n: 1 if n in seed_set else 0 for n in total_nodes}

    history_chunks = []
    rows_so_far = 0
    for year_bin, df in grouped_by_year:
        if df.empty:
            # a window without movies contributes no contagion steps
            continue

        # make network
        G = sgm.build_projected_network(df, seeds, belief_type, threshold)
        # carry over the beliefs of nodes that were already in the system;
        # nodes(data=True) yields the live attribute dicts, so assigning here updates G
        for n, attrs in G.nodes(data=True):
            attrs['belief'] = belief_dict[n]
        # update adoption status
        G = gen.update_adoption_status(G, G.nodes(), threshold)
        # calculate adopters
        G_update, adopter_history = contagion.contagion_sequential_projected_network(
            G, len(df), year_bin, p, d, threshold)

        # adopters that are in the system but not in this window's network
        updated_attrs = dict(G_update.nodes(data=True))
        adopters_in_G = {n for n, attrs in updated_attrs.items() if attrs['status'] == 'Adopter'}
        adopters_not_in_G = [k for k, belief in belief_dict.items()
                             if belief >= threshold and k not in adopters_in_G]
        adopter_history[:, 2] += len(adopters_not_in_G)

        if not history_chunks:
            history_chunks.append(adopter_history)
        else:
            # continue the step count of the previous windows, then drop this window's
            # first row
            adopter_history[:, 1] += rows_so_far - 1
            history_chunks.append(adopter_history[1:])
        rows_so_far += len(history_chunks[-1])

        # update the belief dictionary
        for k, attrs in updated_attrs.items():
            belief_dict[k] = attrs['belief']

    if not history_chunks:
        return []
    return np.concatenate(history_chunks, axis=0)

In [167]:
len(total_adopter_history), len(df_raw)

(2010, 2009)

In [165]:
total_adopter_history = cumulative_adopters_projected_network_sequential(df_raw, 1, seeds, belief_type, p, d, threshold)



In [151]:
df_adopter = pd.DataFrame(total_adopter_history, columns=['year', 'movie_order', '1'])

In [152]:
df_adopter = df_adopter.set_index(['movie_order', 'year'])


In [169]:
total_adopter_history[:,0]

array([(1989, 1990), (1989, 1990), (1989, 1990), ..., (1998, 1999),
       (1998, 1999), (1998, 1999)], dtype=object)

In [155]:
len(total_adopter_history)

2019

In [61]:
# Adopters that are in the system but are not adopters inside G_update.
# `nodes(data=True)` replaces the `G.node` attribute, which newer networkx releases no
# longer provide.
adopter_update = [n for n, attrs in G_update.nodes(data=True) if attrs['status'] == 'Adopter']
# Set lookup instead of scanning the adopter list once per belief entry.
adopter_update_ids = set(adopter_update)
adopters_not_in_G = [k for k, belief in belief_dict.items()
                     if belief >= t and k not in adopter_update_ids]
len(set(adopters_not_in_G)), len(adopter_update_ids)

(958, 260)

In [64]:
len(set(seeds)), 958+260, adopter_history[0][-1]

(1064, 1218, 106)

In [63]:
# Copy the beliefs from G_update back into the belief dictionary, then count the entries at
# or above the threshold `t`.
# `nodes(data=True)` replaces the `G.node` attribute, which newer networkx releases no
# longer provide.
for k, attrs in G_update.nodes(data=True):
    belief_dict[k] = attrs['belief']
sum(1 for belief in belief_dict.values() if belief >= t)

1218

In [None]:
5466+106, 5466+263, 263-106

In [None]:
def update_belief(G, G_update):
    """
    Copy the 'belief' and 'status' node attributes from G_update onto G.

    input
        G - graph whose nodes are updated in place
        G_update - graph supplying the new values; it must contain every node of G,
            otherwise a KeyError is raised
    output
        G, the same object that was passed in
    """
    # nodes(data=True) yields (node, attribute-dict) pairs and is used instead of the
    # `G.node` attribute, which newer networkx releases no longer provide.
    updated_attrs = dict(G_update.nodes(data=True))
    for node, attrs in G.nodes(data=True):
        source = updated_attrs[node]
        attrs['belief'] = source['belief']
        attrs['status'] = source['status']
    return G

In [None]:
# Check that the year groups together hold every 90s movie.
group_sizes = []
for year, df in grouped_by_year:
    group_sizes.append(len(df))
movie_counter = sum(group_sizes)
print(movie_counter, len(movie_90s))

##  check quest run

### plot for sir

In [None]:

color='blueviolet'
# Plot the mean curve with a shaded band between the lower and upper bounds.
fig, ax = plt.subplots(figsize=(5,3))
# NOTE(review): `mean`, `low_ci`, `high_ci` and `df_contagion` are not assigned in any visible
# cell, and the earlier comments said "bipartite" while the legend label says 'Multipartite' —
# confirm which result this figure is meant to show.
ax.plot(mean, label='Multipartite', color=color)
ax.fill_between(df_contagion.index, low_ci, high_ci, color=color, alpha=0.2)

In [200]:
pd.read_json('/home/staff/junelee/Desktop/contagion_agg_2_p10_d100_t100_ver_20.json', orient='split')

Unnamed: 0,year,0,1,2,3,4,5
0,"[1989, 1991]",1064,1064,1064,1064,1064,1064
1,"[1989, 1991]",1066,1065,1065,1064,1065,1067
2,"[1989, 1991]",1068,1068,1067,1065,1068,1070
3,"[1989, 1991]",1069,1069,1067,1067,1068,1072
4,"[1989, 1991]",1071,1071,1070,1068,1070,1074
5,"[1989, 1991]",1073,1073,1070,1073,1072,1076
6,"[1989, 1991]",1074,1073,1071,1076,1074,1078
7,"[1989, 1991]",1076,1076,1073,1078,1075,1079
8,"[1989, 1991]",1077,1076,1076,1080,1076,1081
9,"[1989, 1991]",1078,1079,1077,1081,1077,1083
