# Add model 3, which is keeping the number of producers and also the number of  movies|

In [1]:
import json
import os
import sys
import argparse
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from scipy.special import factorial
from scipy import stats
from matplotlib import cm
from matplotlib.colors import LinearSegmentedColormap
from scipy.stats import ks_2samp
from operator import itemgetter

src_dir = os.path.abspath(os.path.join(os.pardir, os.pardir,'src'))
sys.path[0] = src_dir
from parser.support import ROLES, CREDITS
from parser.my_mongo_db_login import DB_LOGIN_INFO
import parser.support as support
import network.shift_graph_maker as sgm


In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [3]:
with open('/home/projects/movie-network/data/raw_data/movies.json') as f:
    movie_file = f.read()
    movie_data = json.loads(movie_file)

In [4]:
role = 'producing'
role_key = role + "_gender_percentage"
all_movies = support.get_movies_df(role_key)
print('Got all_movies')


Loaded IMDb movies producing_gender_percentage
Got all_movies


In [5]:
all_movies.columns
#get 90s movies with only producer, id, title, year...
movie_90s = all_movies[(all_movies.year >= 1990) & (all_movies.year < 2000)]
movie_producer_df = movie_90s[['_id', 'producers', 'producing_gender_percentage', 'title', 'year']]

In [6]:
movie_producer_df = movie_producer_df.sort_values('year')

In [7]:
movie_producer_df['producer_num'] = movie_producer_df['producers'].apply(lambda x: len(x))

In [8]:
def unlistify(df, column):
    matches = [i for i,n in enumerate(df.columns)
             if n==column]

    if len(matches)==0:
        raise Exception('Failed to find column named ' + column +'!')
    if len(matches)>1:
        raise Exception('More than one column named ' + column +'!')

    col_idx = matches[0]

    # Helper function to expand and repeat the column col_idx
    def fnc(d):
        row = list(d.values[0])
        bef = row[:col_idx]
        aft = row[col_idx+1:]
        col = row[col_idx]
        z = [bef + [c] + aft for c in col]
        return pd.DataFrame(z)

    col_idx += len(df.index.shape) # Since we will push reset the index
    index_names = list(df.index.names)
    column_names = list(index_names) + list(df.columns)
    return (df
          .reset_index()
          .groupby(level=0,as_index=0)
          .apply(fnc)
          .rename(columns = lambda i :column_names[i])
          .set_index(index_names)
          )

## plot colors

In [9]:
def discrete_cmap(N, base_cmap=None):
    """Create an N-bin discrete colormap from the specified input map"""

    # Note that if base_cmap is a string or None, you can simply do
    #    return plt.cm.get_cmap(base_cmap, N)
    # The following works for string, None, or a colormap instance:

    base = cm.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)
    return LinearSegmentedColormap.from_list(cmap_name, color_list, N)

viridis_cmap = discrete_cmap(20, 'viridis')


In [10]:
viridis_cmap = cm.viridis.colors

# Building synthetic networks


* fixed values
    * number of movies per year
    * number of producers per year

* variables
    * number of people per team - producer_num_list *
    * number of movies per producer - occurence_list *
    * size of gaps - gap_list


In [11]:
unlistyfied_producer_df = unlistify(movie_producer_df, 'producers')

In [12]:
number_of_movies_per_year = {}
for year, df in movie_producer_df.groupby('year'):
    number_of_movies_per_year[year] = len(df)

In [13]:
number_of_producers_per_year = {}
for year, df in unlistyfied_producer_df.groupby('year'):
    producers = list(set([i[0] for i in df.producers.tolist()]))
    number_of_producers_per_year[year] = len(producers)

In [14]:
num_producers_per_movie = {}
for year, df in movie_producer_df.groupby('year'):
    num_producers_per_movie[year] = [len(i) for i in df.producers.tolist()]

In [15]:
#number of movies per producer of the 90s
unlistyfied_producer_df['producer_id'] = unlistyfied_producer_df.producers.apply(lambda x: x[0])

In [16]:
movies_per_producer = unlistyfied_producer_df.groupby('producer_id').count()['_id']

 #### generate producer id

In [17]:
import string
import random

def random_generator(size=8, chars=string.ascii_uppercase, nums=string.digits):
    uid = ''.join(random.choice(chars) for i in range(2))
    uid += ''.join(random.choice(nums) for i in range(size-2))
    return uid

In [18]:
def intersection(lst1, lst2):
    overlap_lst = [value for value in lst1 if value in lst2]
    return overlap_lst

In [19]:
def generate_producers(size):
    """
    generate unique producer ids for given size
    input:
        size - int number of total producers
    output:
        producer_list - list of generated producer list
    """
    producer_list = []
    while len(producer_list) < size:
        new_id = random_generator()
        if new_id not in producer_list:
            producer_list.append(new_id)
    return producer_list

In [20]:
def sample_producers(generated_producers, year, num_producers, year_assignment):
    """
    Sample producers from the total so that unique(sum) of producers per year adds up to the total producers
    input:
        generated_producers - list of ids of the total producers
        year - the year of the interest
        num_producers - int of the number of producers for the year
        year_assignment - dict {producer_id: year}, the baseline assginment of the year
    output:
        producer_list - list of the producer ids for the year
    """
    #get the producers that have the year as the assignment
    baseline_producers = [p for p, y in year_assignment.items() if y == year ]
    #get the producers that does not have the year of interest assgined
    non_baseline_producers = [p for p in generated_producers if p not in baseline_producers]
    #pick random producers from non baseline producers 
    #so that the total sum of the baseline + addition will be the number of producers needed
    add_producers = np.random.choice(non_baseline_producers, size=num_producers-len(baseline_producers), replace=False)
    
    producer_list = baseline_producers + list(add_producers)
    return producer_list

In [21]:
def assign_producers(producers, values):
    """
    Assign producers to certain year or movie
    input:
        producers - list of producer ids
        values - value that will be assigned to the producer
    output:
        assignment_dict - {producer:val} dict
    """
    sampling = np.random.choice(values, size=len(producers))
    assignment_dict = dict(zip(producers, sampling))
    return assignment_dict

In [22]:
def assign_team(producers_unique, team_size_list):
    """
    Assign producers to each team so that each team consists of unique producers 
    and all producers participate in at least one movie
    input:
        producers - list of unique producer ids
        team_size_list - list of team sizes
    output:
        producer_list - list of producers with the team sizes
    """
    add_num = int(sum(team_size_list))-len(producers_unique)
    add_producers = np.random.choice(producers_unique, size=add_num, replace=True)
    producers = producers_unique + list(add_producers)
    
    producer_dict = dict(Counter(producers))
    producer_team = []
    for ts in team_size_list:
        team = [1,1]
        while len(set(team)) != len(team):
            #use producer list with duplicates 
            #to increase the change of picking producers with many movie appearnces
            team = list(np.random.choice(producers, size=ts, replace=False))
        producer_team.append(team)
        for p in team:
            producer_dict[p] -= 1
            if producer_dict[p] == 0:
                producer_dict.pop(p, None)
                producers = [i for i in producers if i != p]
    return producer_team



# Model 2

fixed number of movies, fixed number of producers
given number of movies per producer
Everything else is random

## Model 2- 0

fixed number of movies, fixed number of producers

team size round down/up of the mean

In [23]:
def bootstrap(movie_num, num_producers):
    """returns movie numbers per producers based  on movie_num"""
    n = len(movie_num)
    movie_num = np.array(movie_num)
    idx = np.random.randint(0, n, (num_producers, 1))
    samples = movie_num[idx]
    samples = samples.reshape((num_producers, )).tolist()
    return samples

In [24]:
def generate_movie_num(tot_num_movie, producers_dict, movies_dict):
    '''
    Generate exact number of movies per producer
    Input
        tot_num_movies - integer of total number of movies needed
        producer - dictionary of gender list of producers names that are generated 
        num_movies - dictionary of gender list of number of movies in the original data
    Output
        dict_movies - dictionary of key:producer and value:number of movies
    '''
    #female movies:
    f_movie_list = bootstrap(movies_dict['female'], len(producers_dict['female']))
    #male movies:
    m_movie_list = bootstrap(movies_dict['male'], len(producers_dict['male']))
    movie_list = f_movie_list + m_movie_list
    #the sum of total movies for producers has to match the total producers in each team
    while sum(movie_list) != tot_num_movie:
        #female movies:
        f_movie_list = bootstrap(movies_dict['female'], len(producers_dict['female']))
        #male movies:
        m_movie_list = bootstrap(movies_dict['male'], len(producers_dict['male']))
        movie_list = f_movie_list + m_movie_list
        #the sum of total movies for producers has
    f_dict_movies = dict(zip(producers_dict['female'], f_movie_list))
    m_dict_movies = dict(zip(producers_dict['male'], m_movie_list))
    dict_movies = {**f_dict_movies, **m_dict_movies}
    return dict_movies


In [25]:
def team_size(x, mean_ceil, mean_floor):
    return random.choice([int(mean_ceil), int(mean_floor)])


In [26]:
movie_list = bootstrap(movies_per_producer, len(movies_per_producer))

In [27]:
total_num_producers = len(list(set([i[0] for i in unlistyfied_producer_df.producers.tolist()])))

## Assign genders

In [28]:
gender_df = support.get_staff_df('producers')[['_id', 'female_count', 'first_movie', 'last_movie', 'gender']]


In [29]:
females = sgm.generate_gender_seeds(gender_df)

In [30]:
original_producer_list = [i[0] for sublist in movie_90s.producers.tolist() for i in sublist]
seeds = [i for i in females if i in original_producer_list]
len(seeds)

1064

In [31]:
gender_df = support.get_staff_df('producers')[['_id', 'female_count', 'first_movie', 'last_movie', 'gender']]
gender_df.head()

Unnamed: 0,_id,female_count,first_movie,last_movie,gender
0,nm0354806,0,1914,1914,male
1,nm0750211,0,1914,1914,male
2,nm0730386,0,1915,1915,male
3,nm0162343,0,1970,1976,male
4,nm0002883,0,1970,2011,male


In [32]:
unlistyfied_producer_df.head()

Unnamed: 0,_id,producers,producing_gender_percentage,title,year,producer_num,producer_id
22884,tt0099622,"[nm0000339, producer]",33.333333,Full Fathom Five,1990,3,nm0000339
22884,tt0099622,"[nm0515891, producer]",33.333333,Full Fathom Five,1990,3,nm0515891
22884,tt0099622,"[nm0560370, associate producer]",33.333333,Full Fathom Five,1990,3,nm0560370
22502,tt0100822,"[nm0135847, executive producer]",0.0,Tune in Tomorrow...,1990,3,nm0135847
22502,tt0100822,"[nm0275836, producer]",0.0,Tune in Tomorrow...,1990,3,nm0275836


In [65]:
def assign_gender(row, gender_df):
    try:
        gender = gender_df[gender_df._id == row.producer_id].gender.values[0]
    except IndexError:
        gender='male'
    return gender

In [66]:
unlistyfied_producer_df['gender'] = unlistyfied_producer_df.apply(assign_gender, args=(gender_df,), axis=1)

In [67]:
movie_per_producer_gender = {}
for g, g_df in unlistyfied_producer_df.groupby('gender'):
    movies_per_producer = g_df.groupby('producer_id').count()['_id'].tolist()
    movie_per_producer_gender[g] = movies_per_producer

In [68]:
movie_per_producer_gender.keys()

dict_keys(['female', 'male'])

### Make dataframe with the exact team sizes

In [69]:
data_dir = '/home/projects/movie-network/data/synthetic_data/model_2_0/'
gender_dir = '/home/projects/movie-network/data/synthetic_data/genders/'

In [70]:
num_schedules = 10
def make_version(size=6, nums=string.digits):
    uid = ''.join(random.choice(nums) for i in range(size))
    return uid

version_list = []
while len(version_list) < num_schedules:
    ver = make_version()
    if ver not in version_list:
        version_list.append(ver)

In [71]:
len(movie_per_producer_gender['male']), len(movie_per_producer_gender['female'])

(4694, 1064)

In [72]:
len(generated_producers_dict['male']), len(generated_producers_dict['female'])

(4694, 1064)

In [73]:
shift_dist = []
for v in range(num_schedules):
    #generate new producers every round
    generated_producers = generate_producers(total_num_producers)
    generated_producers_dict = {}
    generated_producers_dict['female'] = generated_producers[:len(seeds)]
    generated_producers_dict['male'] = generated_producers[len(seeds):]
    total_movie_frame = movie_producer_df[['_id', 'producers', 'year', 'producer_num']].copy(deep=True)
    print('iterate years')
    for year, df in total_movie_frame.groupby('year'):
        num_producers = number_of_producers_per_year[year] #duplicate producers are already dropped
        mean_size = np.mean(df.producer_num.tolist())
        mean_ceil = np.ceil(mean_size)
        mean_floor = np.floor(mean_size)
        #fix the team size to its mean
        df['producer_num'] = df.producer_num.apply(team_size, args=(mean_ceil, mean_floor))
        total_movie_frame['producer_num'].update(df.producer_num)
    total_movie_frame['producers'] = np.nan
    print('generate movies')
    total_num_teams = total_movie_frame.producer_num.sum()
    dict_movies = generate_movie_num(total_num_teams, generated_producers_dict, movie_per_producer_gender)
    shift_dist.append(list(dict_movies.values()))
    print('distribute movies')
    for i, row in total_movie_frame.iterrows():
        producers, occurence = zip(*dict_movies.items())
        producers = np.array(producers)
        occurence = np.array(occurence)
        size = row.producer_num
        team = np.random.choice(producers, size, replace=False, p=occurence/sum(occurence))
        total_movie_frame['producers'] = total_movie_frame['producers'].astype(object)
        total_movie_frame.at[i, 'producers'] = team
        for p in team:
            dict_movies[p] -= 1
            if dict_movies[p] == 0:
                del dict_movies[p]
    total_movie_frame.to_json(os.path.join(data_dir, 'movies_2_0_{}.json'.format(version_list[v])), orient='split')
    #save gender
    generated_gender_df = pd.DataFrame(columns=['producer_id', 'gender'])
    for g, producers in generated_producers_dict.items():
        genders = [g for ii in producers]
        appending_df = pd.DataFrame({'producer_id': producers, 'gender': genders})
        generated_gender_df = generated_gender_df.append(appending_df)
    generated_gender_df.to_json(os.path.join(gender_dir, 'movies_2_0', 'version_{}.json'.format(version_list[v])), orient='split')

iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies
iterate years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.py

generate movies
distribute movies


Unnamed: 0,producer_id,gender
0,HZ680311,female
1,GN609433,female
2,ZA277337,female
3,LJ557719,female
4,GG028859,female
5,YI861374,female
6,HS260183,female
7,XG485605,female
8,ZQ519166,female
9,UP567068,female


Unnamed: 0,producer_id,gender,0,1,2,3,4,5,6,7,...,4684,4685,4686,4687,4688,4689,4690,4691,4692,4693
0,,,HZ680311,GN609433,ZA277337,LJ557719,GG028859,YI861374,HS260183,XG485605,...,,,,,,,,,,
1,,,female,female,female,female,female,female,female,female,...,,,,,,,,,,
0,,,JL414966,SB502429,TX467031,RU920015,CF491314,QS408868,VH871766,QH001868,...,TY868657,XN202588,SB998605,RZ921952,BO587525,MX797676,HZ564965,JR852432,JQ849629,KE785884
1,,,male,male,male,male,male,male,male,male,...,male,male,male,male,male,male,male,male,male,male


In [75]:
for l in shift_dist:
    print(ks_2samp(l, movie_per_producer_gender['male']+movie_per_producer_gender['female']))

Ks_2sampResult(statistic=0.011635984716915515, pvalue=0.8284437076792213)
Ks_2sampResult(statistic=0.011114970475859631, pvalue=0.8672546031935029)
Ks_2sampResult(statistic=0.005731156651615166, pvalue=0.9999813773200369)
Ks_2sampResult(statistic=0.004168113928447403, pvalue=0.9999999997573147)
Ks_2sampResult(statistic=0.0027787426189649356, pvalue=1.0)
Ks_2sampResult(statistic=0.007641542202153517, pvalue=0.9959008857765511)
Ks_2sampResult(statistic=0.003994442514762109, pvalue=0.9999999999714168)
Ks_2sampResult(statistic=0.003820771101076814, pvalue=0.999999999997512)
Ks_2sampResult(statistic=0.0027787426189649356, pvalue=1.0)
Ks_2sampResult(statistic=0.014588398749565745, pvalue=0.5694882137115395)


# Model 2-1

fixed number of movies, fixed number of producers

team size sampled from the real schedule

## Fill in producers

In [77]:
num_schedules = 10

version_list = []
while len(version_list) < num_schedules:
    ver = make_version()
    if ver not in version_list:
        version_list.append(ver)

In [None]:
from os import listdir
from os.path import isfile, join

original_file_list = [join(original_dir, f) for f in listdir(original_dir) 
                       if isfile(join(original_dir, f))]

## Concat over multiple years

In [76]:
movie_producer_df[['_id', 'producers', 'year', 'producer_num']].copy(deep=True).head()

Unnamed: 0,_id,producers,year,producer_num
22884,tt0099622,"[[nm0000339, producer], [nm0515891, producer],...",1990,3
22502,tt0100822,"[[nm0135847, executive producer], [nm0275836, ...",1990,3
10973,tt0100935,"[[nm0326512, producer], [nm0474138, executive ...",1990,4
22559,tt0099819,"[[nm0342045, executive producer], [nm0270288, ...",1990,10
24509,tt0099796,"[[nm0321860, line producer], [nm0711910, produ...",1990,2


In [78]:
data_dir = '/home/projects/movie-network/data/synthetic_data/model_2_1/'

shift_dist = []
for v in range(num_schedules):
    #generate new producers every round
    generated_producers = generate_producers(total_num_producers)
    generated_producers_dict = {}
    generated_producers_dict['female'] = generated_producers[:len(seeds)]
    generated_producers_dict['male'] = generated_producers[len(seeds):]
    total_movie_frame = movie_producer_df[['_id', 'producers', 'year', 'producer_num']].copy(deep=True)
    total_movie_frame['producers'] = np.nan
    print('generate movies')
    total_num_teams = total_movie_frame.producer_num.sum()
    dict_movies = generate_movie_num(total_num_teams, generated_producers_dict, movie_per_producer_gender)
    shift_dist.append(list(dict_movies.values()))
    print('distribute movies')
    for i, row in total_movie_frame.iterrows():
        producers, occurence = zip(*dict_movies.items())
        producers = np.array(producers)
        occurence = np.array(occurence)
        size = row.producer_num
        team = np.random.choice(producers, size, replace=False, p=occurence/sum(occurence))
        total_movie_frame['producers'] = total_movie_frame['producers'].astype(object)
        total_movie_frame.at[i, 'producers'] = team
        for p in team:
            dict_movies[p] -= 1
            if dict_movies[p] == 0:
                del dict_movies[p]
    total_movie_frame.to_json(os.path.join(data_dir, 'movies_2_1_{}.json'.format(version_list[v])), orient='split')
    #save gender
    generated_gender_df = pd.DataFrame(columns=['producer_id', 'gender'])
    for g, producers in generated_producers_dict.items():
        genders = [g for ii in producers]
        appending_df = pd.DataFrame({'producer_id': producers, 'gender': genders})
        generated_gender_df = generated_gender_df.append(appending_df)
    generated_gender_df.to_json(os.path.join(gender_dir, 'movies_2_1', 'version_{}.json'.format(version_list[v])), orient='split')

generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies
generate movies
distribute movies


In [79]:
for l in shift_dist:
    print(ks_2samp(l, movie_per_producer_gender['male']+movie_per_producer_gender['female']))

Ks_2sampResult(statistic=0.006078499478985755, pvalue=0.9999257553529036)
Ks_2sampResult(statistic=0.006946856547412339, pvalue=0.9990276237794588)
Ks_2sampResult(statistic=0.003994442514761998, pvalue=0.9999999999714172)
Ks_2sampResult(statistic=0.006078499478985755, pvalue=0.9999257553529036)
Ks_2sampResult(statistic=0.005210142410559282, pvalue=0.9999986603439552)
Ks_2sampResult(statistic=0.009725599166377275, pvalue=0.9472813718008206)
Ks_2sampResult(statistic=0.003820771101076703, pvalue=0.9999999999975115)
Ks_2sampResult(statistic=0.004862799583188582, pvalue=0.999999859657745)
Ks_2sampResult(statistic=0.002084056964223646, pvalue=1.0)
Ks_2sampResult(statistic=0.002084056964223757, pvalue=0.9999999999999997)


# Problem

Random sampling does not account for all of the producers

In [80]:
ver

'656893'

In [93]:
from os.path import normpath, basename, join, isfile
from os import listdir

In [91]:
network_model =  basename(normpath(data_dir))
gender_folder = os.path.join('/home/projects/movie-network/data/synthetic_data/genders/', network_model)

In [94]:
gender_file = [join(gender_folder, f) for f in listdir(gender_folder) if isfile(join(gender_folder, f)) and ver in f]

In [96]:
if len(gender_file) == 1:
    gender_file = gender_file[0]
else:
    raise IndexError('the version has duplicate or it does not exist')

In [102]:
df_gender = pd.read_json(gender_file, orient='split')
seeds = df_gender[df_gender.gender=='female'].producer_id.tolist()

In [103]:
len(seeds)

1064

In [106]:
from os.path import dirname, realpath
two_up = dirname(dirname(__file__))

NameError: name '__file__' is not defined

In [113]:
os.path.abspath(os.path.join(data_dir, os.pardir, 'gender'))

'/home/projects/movie-network/data/synthetic_data/gender'

In [112]:
data_dir

'/home/projects/movie-network/data/synthetic_data/model_2_1/'