# V_Vaccination model preprocessing - with age

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import nafot
import model
from itertools import product
import pickle
from tqdm import tqdm, tqdm_notebook
from itertools import product

## Vaccination data by season

In [138]:
# Build the preprocessing bundle; the next cell reads its entries by string key
prep_data = model.data_and_network_prep()

In [139]:
# Pull the entries used by the following cells out of the prep bundle
(vaccination_data,
 relevant_clinics_age,
 population_by_clinic_age,
 day_in_season_short,
 vaccination_coverage_with_age,
 relevant_subdists_age) = (prep_data[entry] for entry in ('vaccination_data',
                                                          'relevant_clinics_age',
                                                          'population_by_clinic_age',
                                                          'day_in_season_short',
                                                          'vaccination_coverage_with_age',
                                                          'relevant_subdists_age'))

In [140]:
# Keep only the most recent seasons
years = 7  # number of trailing seasons kept out of 2008..2017
seasons = np.arange(2008, 2017 + 1)[-years:]
in_relevant_season = vaccination_data.vac_season.isin(set(seasons))
vaccination_data_relevant_seasons = vaccination_data[in_relevant_season]

# One calendar date per short-season day, counted from 1 September 2016
first_date = pd.Timestamp(2016, 9, 1)
dates = [first_date + pd.Timedelta(offset, unit='d') for offset in range(len(day_in_season_short))]

# Days of the season that belong to the short season definition
relevant_days = np.array(day_in_season_short)

# Drop vaccinations given outside the short season
in_short_season = vaccination_data_relevant_seasons.vac_day_of_season.isin(set(relevant_days))
vaccination_data_short_season = vaccination_data_relevant_seasons[in_short_season].copy()

In [141]:
# Weekly vaccination counts keyed by (clinic, age, season), rescaled by the clinic's 'factor'
vacc_data_by_season = {}
short_season_length = len(day_in_season_short)
short_season = vaccination_data_short_season

# Visit every (clinic, age) pair in every model season
for (clinic, age), season in product(relevant_clinics_age, model.seasons):
    # Rows of the current clinic, age group and season
    row_mask = ((short_season.clinic_code == clinic)
                & (short_season.age == age)
                & (short_season.vac_season == season))
    cur_rows = short_season[row_mask]

    # Number of non-null random_ID entries per vaccination date
    daily_counts = cur_rows.groupby('vac_date').count()[['random_ID']].copy()

    # Empty frame indexed by consecutive days starting on 1 September of year (season - 1)
    season_start = pd.Timestamp(season - 1, 9, 1)
    season_days = [season_start + pd.Timedelta(day, unit='d') for day in range(short_season_length)]
    weekly_counts = pd.DataFrame(index=season_days)

    # Attach the daily counts (days without records become 0) and sum them per week
    weekly_counts = weekly_counts.join(daily_counts).fillna(0).resample('W').sum().fillna(0).copy()
    weekly_counts.columns = ['vacc_count']

    # Rescale by the clinic/age 'factor' column of population_by_clinic_age
    scaling = population_by_clinic_age['factor'].loc[(clinic, age)]
    adjusted = weekly_counts * scaling

    # Tag the rows with their season and store the result
    adjusted['season'] = season
    vacc_data_by_season[(clinic, age, season)] = adjusted.copy()

In [142]:
# Stack the per-season frames of every (clinic, age) pair into one frame, in model.seasons order
vacc_data_by_clinic_age = {
    (clinic, age): pd.concat([vacc_data_by_season[(clinic, age, season)] for season in model.seasons])
    for clinic, age in relevant_clinics_age
}

In [150]:
########################################################
# --- Aggregated - by subdist and age --- #
########################################################
# Start every (subdist, age) entry from a copy of one clinic frame (same index and
# 'season' column) with the counts reset to 0
vacc_data_by_subdist = {key: next(iter(vacc_data_by_clinic_age.values())).copy()
                        for key in relevant_subdists_age}
for key in vacc_data_by_subdist:
    vacc_data_by_subdist[key]['vacc_count'] = 0

# Go over the clinics and age groups and aggregate according to the clinic's subdist
for (clinic, age), data in vacc_data_by_clinic_age.items():
    # .loc[clinic] yields the clinic's rows indexed by age; take the first row positionally,
    # because a plain [0] is a label lookup that fails for a clinic with no age-0 row
    subdist = vaccination_coverage_with_age.loc[clinic].subdist.iloc[0]
    subdist_frame = vacc_data_by_subdist[(subdist, age)]
    subdist_frame['vacc_count'] = subdist_frame['vacc_count'] + data.vacc_count

############################################
# --- Aggregated  - by age --- #
############################################
# Same zero-initialised template, one entry per age group plus the overall total
vacc_data_by_age = {key: next(iter(vacc_data_by_clinic_age.values())).copy()
                    for key in [0, 1, 'total']}
for key in vacc_data_by_age:
    vacc_data_by_age[key]['vacc_count'] = 0

# Go over the clinics and age groups and add each one to its age group and to the total
for (clinic, age), data in vacc_data_by_clinic_age.items():
    vacc_data_by_age[age]['vacc_count'] = vacc_data_by_age[age]['vacc_count'] + data.vacc_count
    vacc_data_by_age['total']['vacc_count'] = vacc_data_by_age['total']['vacc_count'] + data.vacc_count

In [157]:
# Bundle all aggregation levels into one dictionary and pickle it
vacc_data_all_seasons = {
    'by_clinic_age': vacc_data_by_clinic_age,
    'by_subdist_age': vacc_data_by_subdist,
    'by_age': vacc_data_by_age,
    'total': vacc_data_by_age['total'],
}

with open('../../Data/vaccination_data/vaccination_data_all_seasons_adjusted.pickle', 'wb') as out_file:
    pickle.dump(vacc_data_all_seasons, out_file)

## Age contact matrix

In [2]:
# Load the (updated) network population dictionary
# Previous input: '../../data/network/network_population_with_age.pickle'
with open('../../data/network/network_population_with_age_updated.pickle', 'rb') as in_file:
    population_dict = pickle.load(in_file)

In [3]:
# Read contact matrix with age; missing entries count as no contact
contact_matrix = pd.read_pickle(model.contact_matrix_path)
contact_matrix.fillna(0, inplace=True)

# Read population data (updated version of the network population file)
with open('../../data/network/network_population_with_age_updated.pickle', 'rb') as in_file:
    population_dict = pickle.load(in_file)

# Total population per age group (second key element: 0 = children, 1 = adults)
total_children = sum(size for key, size in population_dict.items() if key[1] == 0)
total_adults = sum(size for key, size in population_dict.items() if key[1] == 1)

# Share of each key within the child / adult population respectively
population_prop_children = {}
population_prop_adults = {}
for key, size in population_dict.items():
    if key[1] == 0:
        population_prop_children[key] = size / total_children
    if key[1] == 1:
        population_prop_adults[key] = size / total_adults

# Proportions as plain lists, children first, in population_dict order
pop_prop = [list(population_prop_children.values()), list(population_prop_adults.values())]

In [4]:
# Matrix labels split by age group (second label element: 0 = children, 1 = adults)
children_idx = [label for label in contact_matrix.index if label[1] == 0]
adult_idx = [label for label in contact_matrix.index if label[1] == 1]
indices_to_agg = [children_idx, adult_idx]

# Population proportions keyed by label, in the same children/adults order
prop_by_label = [population_prop_children, population_prop_adults]

# Aggregate the columns: total contact weight of each row towards children (0) and adults (1)
contact_matrix_agg_cols = pd.DataFrame(index=contact_matrix.index, columns=[0, 1])
for agg_area_id in [0, 1]:
    contact_matrix_agg_cols[agg_area_id] = contact_matrix[indices_to_agg[agg_area_id]].sum(axis=1)

# Aggregate the rows - weighted by each label's share of its age group's population
contact_matrix_age = pd.DataFrame(index=[0, 1], columns=[0, 1])
for agg_area_id in [0, 1]:
    labels = indices_to_agg[agg_area_id]
    props = prop_by_label[agg_area_id]
    if len(props) != len(labels):
        raise ValueError('population proportions and contact matrix rows do not cover the same labels')

    # Look the weights up by label rather than relying on the population dictionary
    # being ordered exactly like the contact matrix index
    weights = [props[label] for label in labels]
    contact_matrix_age.loc[agg_area_id, :] = \
        contact_matrix_agg_cols.loc[labels].multiply(weights, 'rows').sum()

In [5]:
# Save matrix
# Writes the 2x2 age-aggregated matrix, then echoes it as the cell output
contact_matrix_age.to_pickle('./model/contact_matrix_age_updated.pickle')
contact_matrix_age

Unnamed: 0,0,1
0,0.444669,0.555331
1,0.329526,0.670474


In [5]:
# # Save matrix
# contact_matrix_age.to_pickle('./model/contact_matrix_age.pickle')
# contact_matrix_age

Unnamed: 0,0,1
0,0.442125,0.557875
1,0.323756,0.676244


## Network

### Create node by (area, age_group) dict

In [6]:
# Load prep data (the cells below read its 'network' entry)
prep_data = model.data_and_network_prep()

In [7]:
# Map every (area, age) pair to the set of network nodes carrying it
nodes_by_area_age = {}
node_attrs = prep_data['network'].nodes
for n in node_attrs:
    attrs = node_attrs[n]
    # Create the set on first sight of the pair, then register the node
    nodes_by_area_age.setdefault((attrs['area'], attrs['age']), set()).add(n)

# Save dict
with open('../../Data/vaccination_data/nodes_by_area_age_dict_updated.pickle', 'wb') as out_file:
    pickle.dump(nodes_by_area_age, out_file)

### Create node by (subdist, age_group) dict

In [8]:
# Map every (subdist, age) pair to the set of network nodes carrying it
nodes_by_subdist_age = {}
node_attrs = prep_data['network'].nodes
for n in node_attrs:
    attrs = node_attrs[n]
    # Create the set on first sight of the pair, then register the node
    nodes_by_subdist_age.setdefault((attrs['subdist'], attrs['age']), set()).add(n)

# Save dict
with open('../../Data/vaccination_data/nodes_by_subdist_age_updated.pickle', 'wb') as out_file:
    pickle.dump(nodes_by_subdist_age, out_file)

### Create node by (yeshuv, age_group) dict

In [9]:
# Map every (yeshuv, age) pair to the set of network nodes carrying it
nodes_by_yeshuv_age = {}
node_attrs = prep_data['network'].nodes
for n in node_attrs:
    attrs = node_attrs[n]
    # The yeshuv code comes from the SEMEL_YISH field of the node's area row in nafot.gdf
    yeshuv = nafot.gdf.loc[attrs['area']].SEMEL_YISH
    nodes_by_yeshuv_age.setdefault((yeshuv, attrs['age']), set()).add(n)

# Save dict
with open('../../Data/vaccination_data/nodes_by_yeshuv_age_dict_updated.pickle', 'wb') as out_file:
    pickle.dump(nodes_by_yeshuv_age, out_file)

### Create node by (clinic, age_group) dict

In [10]:
# Map every (clinic, age) pair to the set of network nodes carrying it
nodes_by_clinic_age = {}
node_attrs = prep_data['network'].nodes
for n in node_attrs:
    attrs = node_attrs[n]
    # Create the set on first sight of the pair, then register the node
    nodes_by_clinic_age.setdefault((attrs['clinic'], attrs['age']), set()).add(n)

# Save dict
with open('../../Data/vaccination_data/nodes_by_clinic_age_updated.pickle', 'wb') as out_file:
    pickle.dump(nodes_by_clinic_age, out_file)

### Relevant stat areas

In [5]:
# Every (stat area, age) pair that occurs on at least one network node
node_attrs = prep_data['network'].nodes
relevant_stat_area_age = {(node_attrs[n]['area'], node_attrs[n]['age']) for n in node_attrs}

In [61]:
# Load population dict (lamas population data)
# NOTE(review): this reads the non-"_updated" file, unlike the contact-matrix cells - confirm intended
with open('../../data/network/network_population_with_age.pickle', 'rb') as in_file:
    population_dict = pickle.load(in_file)

# Keep only the (area, age) pairs present in the network
population_dict = {key: size for key, size in population_dict.items() if key in relevant_stat_area_age}

# Total population size over the remaining pairs
total_pop_size = int(sum(population_dict.values()))
total_pop_size

3606696

### Population proportion by subdist and age

In [18]:
# Load prep data (source of the population and clinic tables used below)
prep_data = model.data_and_network_prep()

In [8]:
# Work on a copy so the table inside prep_data is left untouched
population_by_clinic_age = prep_data['population_by_clinic_age'].copy()

# Clinic lookup table; it is queried by clinic code (first element of each row label)
clinic_lookup = prep_data['clinics_stat_areas']
clinic_codes = [row_label[0] for row_label in population_by_clinic_age.index]

# Add stat area and subdist of each row's clinic
population_by_clinic_age['stat_area'] = [clinic_lookup.loc[code].stat_area_id for code in clinic_codes]
population_by_clinic_age['subdist'] = [clinic_lookup.loc[code].subdist for code in clinic_codes]

In [9]:
# Share of each (clinic, age) row within its (subdist, age) group,
# once by data population and once by network population
prop_data_pop = {}
prop_net_pop = {}

for subdist, age in prep_data['relevant_subdists_age']:
    # Rows of the current subdist, restricted to the current age level of the index
    in_subdist = population_by_clinic_age[population_by_clinic_age.subdist == subdist]
    cur_df = in_subdist.loc[pd.IndexSlice[:, age], :]

    data_share = cur_df.data_population / cur_df.data_population.sum()
    network_share = cur_df.network_population / cur_df.network_population.sum()

    prop_data_pop.update(dict(data_share))
    prop_net_pop.update(dict(network_share))

## Centrality measures

### PageRank

Page Rank by subdist

In [35]:
# Read the subdist-level contact matrix with age
contact_matrix = pd.read_pickle('../data/matrix/contact_matrix_final_with_age_subdist.pickle')

# Directed graph built from the raw values, so graph nodes are matrix positions
# NOTE(review): from_numpy_matrix and pagerank_numpy were removed in networkx 3.0 - needs networkx 2.x
adjacency = contact_matrix.values
G = nx.from_numpy_matrix(adjacency, create_using=nx.DiGraph())

# PageRank with alpha=1 (no damping)
pagerank_dict = nx.pagerank_numpy(G, alpha=1)

# Re-key from matrix position to the matrix index label (presumably (subdist, age) - confirm)
labels = contact_matrix.index
pagerank_by_subdist_age = {labels[pos]: score for pos, score in pagerank_dict.items()}

# Save dict
with open('../../Data/vaccination_data/pagerank_by_subdist_age.pickle', 'wb') as out_file:
    pickle.dump(pagerank_by_subdist_age, out_file)

Page Rank by yeshuv

In [73]:
# Read the yeshuv-level contact matrix with age
contact_matrix = pd.read_pickle('../data/matrix/contact_matrix_final_with_age_yeshuv.pickle')

# Directed graph built from the raw values, so graph nodes are matrix positions
# NOTE(review): from_numpy_matrix and pagerank_numpy were removed in networkx 3.0 - needs networkx 2.x
adjacency = contact_matrix.values
G = nx.from_numpy_matrix(adjacency, create_using=nx.DiGraph())

# PageRank with alpha=1 (no damping)
pagerank_dict = nx.pagerank_numpy(G, alpha=1)

# Re-key from matrix position to the matrix index label (presumably (yeshuv, age) - confirm)
labels = contact_matrix.index
pagerank_by_yeshuv_age = {labels[pos]: score for pos, score in pagerank_dict.items()}

# Save dict
with open('../../Data/vaccination_data/pagerank_by_yeshuv_age.pickle', 'wb') as out_file:
    pickle.dump(pagerank_by_yeshuv_age, out_file)

Page Rank by area

In [44]:
# Load the contact matrix and display its dimensions
contact_matrix = pd.read_pickle(model.contact_matrix_path)
contact_matrix.shape

(2049, 2049)

In [93]:
# Read contact matrix with age; missing entries count as no contact
contact_matrix = pd.read_pickle(model.contact_matrix_path)
contact_matrix.fillna(0, inplace=True)

# Directed graph built from the raw values, so graph nodes are matrix positions
# NOTE(review): from_numpy_matrix and pagerank_numpy were removed in networkx 3.0 - needs networkx 2.x
adjacency = contact_matrix.values
G = nx.from_numpy_matrix(adjacency, create_using=nx.DiGraph())

# PageRank with alpha=1 (no damping)
pagerank_dict = nx.pagerank_numpy(G, alpha=1)

# Re-key from matrix position to the (area, age) index label
labels = contact_matrix.index
pagerank_by_area_age = {labels[pos]: score for pos, score in pagerank_dict.items()}

# Keep only the (area, age) pairs that exist in the network
pagerank_by_area_age = {key: score for key, score in pagerank_by_area_age.items()
                        if key in relevant_stat_area_age}

# Save dict
with open('../../Data/vaccination_data/pagerank_by_area_age.pickle', 'wb') as out_file:
    pickle.dump(pagerank_by_area_age, out_file)

Page Rank by node

In [15]:
# PageRank of every node of the full network, with alpha=1 (no damping)
network = prep_data['network']
pagerank_by_node_dict = nx.pagerank(network, alpha=1)

# Save dict
with open('../../Data/vaccination_data/pagerank_by_node.pickle', 'wb') as out_file:
    pickle.dump(pagerank_by_node_dict, out_file)

For each subdist separately - by areas

In [79]:
# Read contact matrix with age; missing entries count as no contact
contact_matrix = pd.read_pickle(model.contact_matrix_path)
contact_matrix.fillna(0, inplace=True)

# Get only relevant stat area and age.
# .loc needs a list-like indexer: passing a set was deprecated in pandas 1.5 and fails in pandas 2.0
relevant_labels = list(relevant_stat_area_age)
contact_matrix = contact_matrix.loc[relevant_labels, relevant_labels].copy()

# Create a directed graph whose nodes are the (area, age) labels of the matrix
G = nx.from_pandas_adjacency(contact_matrix, create_using=nx.DiGraph())

# Create a list of nodes (areas) for each subdist and age
sub_graphs_areas_nodes = {key: [] for key in prep_data['relevant_subdists_age']}

for n in G.nodes:
    # Subdist comes from the area's row in nafot.gdf; age is the second label element
    subdist = nafot.gdf.loc[n[0]].SubDistrictCode
    age = n[1]

    # Add the node to the relevant list (the key must be one of the relevant (subdist, age) pairs)
    sub_graphs_areas_nodes[(subdist, age)].append(n)

# Create a subgraph for each subdist and age group
sub_graphs_areas = {key: nx.subgraph(G, nodes) for key, nodes in sub_graphs_areas_nodes.items()}

# Calculate PageRank (alpha=1, no damping) within each subgraph
# NOTE(review): pagerank_numpy was removed in networkx 3.0 - needs networkx 2.x
sub_graphs_areas_pageranks = {key: nx.pagerank_numpy(subgraph, alpha=1) for key, subgraph in sub_graphs_areas.items()}

# Save
with open('../../Data/vaccination_data/sub_graphs_areas_pageranks.pickle', 'wb') as pickle_out:
    pickle.dump(sub_graphs_areas_pageranks, pickle_out)

For each subdist separately - by nodes

In [74]:
# Collect the network nodes of each relevant (subdist, age) pair
sub_graphs_nodes = {key: [] for key in prep_data['relevant_subdists_age']}

network = prep_data['network']
for n in network.nodes:
    attrs = network.nodes[n]
    # The node's (subdist, age) must be one of the relevant pairs
    sub_graphs_nodes[(attrs['subdist'], attrs['age'])].append(n)

# Subgraph of the network for each subdist and age group
sub_graphs = {}
for key, members in sub_graphs_nodes.items():
    sub_graphs[key] = nx.subgraph(network, members)

# PageRank (alpha=1, no damping) within each subgraph
# NOTE(review): pagerank_numpy was removed in networkx 3.0 - needs networkx 2.x
sub_graphs_pageranks = {}
for key, subgraph in sub_graphs.items():
    sub_graphs_pageranks[key] = nx.pagerank_numpy(subgraph, alpha=1)

# Save
with open('../../Data/vaccination_data/sub_graphs_pageranks.pickle', 'wb') as out_file:
    pickle.dump(sub_graphs_pageranks, out_file)

### Eigenvector centrality

By area

In [19]:
# Read contact matrix with age; missing entries count as no contact
contact_matrix = pd.read_pickle(model.contact_matrix_path)
contact_matrix.fillna(0, inplace=True)

# Directed graph built from the raw values, so graph nodes are matrix positions
# NOTE(review): from_numpy_matrix was removed in networkx 3.0 - needs networkx 2.x
adjacency = contact_matrix.values
G = nx.from_numpy_matrix(adjacency, create_using=nx.DiGraph())

# Calculate eigenvector centrality using the edge weights
eigenvector_dict = nx.eigenvector_centrality_numpy(G, weight='weight')

# Re-key from matrix position to the (area, age) index label
labels = contact_matrix.index
eigenvector_by_area_age = {labels[pos]: score for pos, score in eigenvector_dict.items()}

# Save dict
with open('../../Data/vaccination_data/eigenvector_by_area_age.pickle', 'wb') as out_file:
    pickle.dump(eigenvector_by_area_age, out_file)

By node

In [21]:
# Eigenvector centrality of every node of the full network
network = prep_data['network']
eigenvector_by_node_dict = nx.eigenvector_centrality_numpy(network)

# Save dict
with open('../../Data/vaccination_data/eigenvector_by_node_dict.pickle', 'wb') as out_file:
    pickle.dump(eigenvector_by_node_dict, out_file)

### Betweenness

By area

In [24]:
# Read contact matrix with age; missing entries count as no contact
contact_matrix = pd.read_pickle(model.contact_matrix_path)
contact_matrix.fillna(0, inplace=True)

# Directed graph built from the raw values, so graph nodes are matrix positions
# NOTE(review): from_numpy_matrix was removed in networkx 3.0 - needs networkx 2.x
adjacency = contact_matrix.values
G = nx.from_numpy_matrix(adjacency, create_using=nx.DiGraph())

# Calculate betweenness centrality
# NOTE(review): networkx interprets `weight` as a distance for shortest paths, so strong
# contacts count as long edges here - confirm this is the intended meaning
betweenness_dict = nx.betweenness_centrality(G, weight='weight')

# Re-key from matrix position to the (area, age) index label
labels = contact_matrix.index
betweenness_by_area_age = {labels[pos]: score for pos, score in betweenness_dict.items()}

# Save dict
with open('../../Data/vaccination_data/betweenness_by_area_age.pickle', 'wb') as out_file:
    pickle.dump(betweenness_by_area_age, out_file)

KeyboardInterrupt: 

By node

In [21]:
# Betweenness centrality of every node of the full network
network = prep_data['network']
betweenness_by_node_dict = nx.betweenness_centrality(network)

# Save dict
with open('../../Data/vaccination_data/betweenness_by_node_dict.pickle', 'wb') as out_file:
    pickle.dump(betweenness_by_node_dict, out_file)

## Population by clinic and age

In [2]:
# Load demographic data
dem_data = pd.read_csv('L:/Dor/Data/vaccination_data/sample_dem_data.txt')

# Age group flag: 1 when (2017 - birth year) is above 18, otherwise 0
# NOTE(review): `> 18` puts 18-year-olds in group 0, while the contact calculation in this
# notebook splits at 0-17 / 18+ - confirm which boundary is intended
dem_data['age'] = ((2017 - dem_data.birth_year) > 18).astype(int)

# Keep the three columns needed; the 'stat_code' column is used as the clinic code
dem_data = dem_data[['random_ID', 'stat_code', 'age']].rename(columns={'stat_code': 'clinic_code'})

# Number of random_ID entries per clinic and age group
population_by_clinic_age = (dem_data.groupby(['clinic_code', 'age'])
                            .count()
                            .rename(columns={'random_ID': 'data_population'}))

In [60]:
# Persist the per-clinic, per-age population counts
# Alternative target: population_by_clinic_age.to_pickle('./model/population_by_clinic_age.pickle')
population_by_clinic_age.to_pickle('L:/Dor/Data/vaccination_data/population_by_clinic_age.pickle')

### Population proportion by subdist and age

In [3]:
# Load prep data (source of the population table used below)
prep_data = model.data_and_network_prep()

In [7]:
# Work on a copy so the table inside prep_data is left untouched
population_by_clinic_age = prep_data['population_by_clinic_age'].copy()

# Add stat area and subdist of each row's clinic (first element of the row label).
# The clinic table is read from prep_data, as in the equivalent cell earlier in the notebook;
# the bare `clinics_stat_areas` variable is only created further down
population_by_clinic_age['stat_area'] = population_by_clinic_age.apply(lambda row: prep_data['clinics_stat_areas'].loc[row.name[0]].stat_area_id, axis=1)
population_by_clinic_age['subdist'] = population_by_clinic_age.apply(lambda row: prep_data['clinics_stat_areas'].loc[row.name[0]].subdist, axis=1)

In [8]:
# Share of each (clinic, age) row within its (subdist, age) group,
# once by data population and once by network population
prop_data_pop = {}
prop_net_pop = {}

for subdist, age in prep_data['relevant_subdists_age']:
    # Rows of the current subdist, restricted to the current age level of the index
    in_subdist = population_by_clinic_age[population_by_clinic_age.subdist == subdist]
    cur_df = in_subdist.loc[pd.IndexSlice[:, age], :]

    data_share = cur_df.data_population / cur_df.data_population.sum()
    network_share = cur_df.network_population / cur_df.network_population.sum()

    prop_data_pop.update(dict(data_share))
    prop_net_pop.update(dict(network_share))

## Vaccination data

In [9]:
# Load vaccination data and parse the vaccination dates
vaccination_data = pd.read_csv(model.vaccination_data_path)
vaccination_data['vac_date'] = pd.to_datetime(vaccination_data['vac_date'])

# Remove incomplete seasons (2007 and 2018)
vaccination_data = vaccination_data[~vaccination_data.vac_season.isin([2007, 2018])].copy()

# Get only last 5 years
vaccination_data = vaccination_data[vaccination_data.vac_season.isin(np.arange(2008, 2017+1)[-5:])]

# Short list of dates (1.9-28.2): 181 consecutive days starting on 1 September 2016
dates_2017_short = [pd.Timestamp(2016, 9, 1) + pd.Timedelta(days=1) * i for i in range(181)]

# Day in season = days elapsed since the most recent 1 June (months up to May belong to the
# season that started in the previous calendar year).
# pd.Timestamp replaces the deprecated pd.datetime alias, which pandas 2.0 removed
day_in_season_short = [(date - pd.Timestamp(date.year if date.month > 5 else date.year - 1, 6, 1)).days
                       for date in dates_2017_short]

### Vaccination coverage data

In [12]:
# Load the pickled per-clinic, per-age population table
population_by_clinic_age = pd.read_pickle('L:/Dor/Data/vaccination_data/population_by_clinic_age.pickle')

# Clinic -> stat area table, indexed by clinic code
clinics_stat_areas = pd.read_csv('../../data/vaccination_data/clinics_with_stat_area.csv').set_index('clinic_code')

# Subdist of each clinic = SubDistrictCode of its stat area's row in nafot.gdf
clinics_stat_areas['subdist'] = [nafot.gdf.loc[area].SubDistrictCode
                                 for area in clinics_stat_areas.stat_area_id]

# Optional export:
# clinics_stat_areas.to_pickle('../../data/vaccination_data/clinics_with_stat_area.pickle')

In [13]:
# Age group flag: 1 when (season - birth year) is above 18, otherwise 0
vaccination_data['age'] = ((vaccination_data.vac_season - vaccination_data.birth_year) > 18).astype(int)

# Keep only vaccinations given on a short-season day
short_season_days = set(day_in_season_short)
vaccination_data = vaccination_data[vaccination_data.vac_day_of_season.isin(short_season_days)].copy()

# Count per clinic, age and season, then average the seasonal counts per clinic and age
vaccination_data_by_age_clinic_season = (
    vaccination_data.groupby(['clinic_code', 'age', 'vac_season']).count()[['random_ID']])
vaccination_data_by_age_clinic = vaccination_data_by_age_clinic_season.groupby(['clinic_code', 'age']).mean()


def _coverage(row):
    # Mean seasonal count divided by the first value of the matching population row
    return row.random_ID / population_by_clinic_age.loc[(row.name)].values[0]


# Divide by the population of each clinic and age
vaccination_data_by_age_clinic['random_ID'] = vaccination_data_by_age_clinic.apply(_coverage, axis=1)
vaccination_data_by_age_clinic.columns = ['data_coverage']

# Add stat area and subdist of each row's clinic (first element of the row label)
row_labels = list(vaccination_data_by_age_clinic.index)
vaccination_data_by_age_clinic['stat_area'] = [clinics_stat_areas.loc[label[0]].stat_area_id
                                               for label in row_labels]
vaccination_data_by_age_clinic['subdist'] = [clinics_stat_areas.loc[label[0]].subdist
                                             for label in row_labels]

# Add prop out of subdist (if the clinic is not relevant - 0)
vaccination_data_by_age_clinic['prop_data_pop'] = [prop_data_pop.get(label, 0) for label in row_labels]
vaccination_data_by_age_clinic['prop_net_pop'] = [prop_net_pop.get(label, 0) for label in row_labels]

# Save (previous output name: vaccination_coverage_with_age.pickle)
vaccination_data_by_age_clinic.to_pickle('../../Data/vaccination_data/vaccination_coverage_with_age_updated.pickle')

In [16]:
# Preview the first rows of the coverage table
vaccination_data_by_age_clinic.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,data_coverage,stat_area,subdist,prop_data_pop,prop_net_pop
clinic_code,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1106,0,0.137318,4000722,31.0,0.0,0.0
1106,1,0.270681,4000722,31.0,0.0,0.0
1107,0,0.091028,960031,31.0,0.0,0.0
1107,1,0.265489,960031,31.0,0.0,0.0
1108,0,0.128704,760031,24.0,0.0,0.0


In [17]:
# Display the population table stored in the prep bundle
prep_data['population_by_clinic_age']

Unnamed: 0_level_0,Unnamed: 1_level_0,data_population,network_population,factor,subdist,prop_network
clinic_code,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7704,1,613,435,0.709625,61.0,0.004408
7714,0,439,240,0.546697,61.0,0.002432
7714,1,627,610,0.972887,61.0,0.006181
5534,0,303,135,0.445545,44.0,0.001368
5534,1,359,305,0.849582,44.0,0.003090
7704,0,347,193,0.556196,61.0,0.001956
5517,0,873,831,0.951890,43.0,0.008420
5517,1,1441,1868,1.296322,43.0,0.018928
5507,1,1246,531,0.426164,61.0,0.005380
5526,0,396,81,0.204545,61.0,0.000821


## Number of contacts by age

In [169]:
# Load Israeli population by age, indexed by the 'age' column
israeli_pop_all_ages = pd.read_excel('../data/population_all_age_groups.xlsx',
                                     sheet_name='israeli_pop').set_index('age')

# Load contact matrix from paper
contact_mat_paper = pd.read_excel('../data/contact_matrix_israel.xlsx')

In [212]:
# Number of contacts per paper age group  (5-year groups): row sums of the paper's matrix
contacts_by_age_group_paper = contact_mat_paper.sum(1).values

# Population column; it is sliced by row position below
# (assumes row i holds age i - TODO confirm against the spreadsheet)
pop_by_age = israeli_pop_all_ages.israeli_pop

# Proportion calculation for age group adjustments - children:
# share of 0-4, 5-9, 10-14 and 15-17 out of the 0-17 population
children_total = pop_by_age.iloc[0:17 + 1].sum()
children_bounds = [(0, 4), (5, 9), (10, 14), (15, 17)]
props_children = np.array([0.] * len(children_bounds))
for pos, (first, last) in enumerate(children_bounds):
    props_children[pos] = pop_by_age.iloc[first:last + 1].sum() / children_total

# Proportion calculation for age group adjustments - adults:
# share of 18-19, 20-24, 25-29, ..., 70-74 and 75+ out of the 18+ population
adults_total = pop_by_age.iloc[18:].sum()
adult_bounds = [(18, 19)] + [(start, start + 4) for start in range(20, 75, 5)] + [(75, None)]
props_adults = np.array([0.] * len(adult_bounds))
for pos, (first, last) in enumerate(adult_bounds):
    # The last group is open-ended (75 and above)
    stop = None if last is None else last + 1
    props_adults[pos] = pop_by_age.iloc[first:stop].sum() / adults_total

In [230]:
# Population-weighted number of contacts for children and adults.
# The paper's fourth group (position 3) enters both: last child group and first adult group
child_group_contacts = contacts_by_age_group_paper[:4]
adult_group_contacts = contacts_by_age_group_paper[3:]

children_contacts = (props_children * child_group_contacts).sum()
adults_contacts = (props_adults * adult_group_contacts).sum()

print(f'Number of contacts children: {children_contacts:.50f}')
print(f'Number of contacts adults: {adults_contacts:.50f}')

Number of contacts children: 16.83276766767968624094464757945388555526733398437500
Number of contacts adults: 12.14280414131447649594974791398271918296813964843750


In [None]:
# Display the per-age-group contact totals taken from the paper's matrix
contacts_by_age_group_paper

In [45]:
# NOTE(review): the bare name `contact_matrix_path` is not defined anywhere in this notebook as
# shown; other cells read the matrix via model.contact_matrix_path - confirm which path is meant
contact_matrix = pd.read_pickle(contact_matrix_path)

In [51]:
# NOTE(review): `stat_areas_clinics_path` is not defined in this notebook as shown - confirm its source
stat_areas_clinics = pd.read_csv(stat_areas_clinics_path)

# Keep only the stat areas that appear (as first label element) in the contact matrix index
matrix_areas = {label[0] for label in contact_matrix.index}
stat_areas_clinics = stat_areas_clinics[stat_areas_clinics.stat_area_id.isin(matrix_areas)]
stat_areas_clinics = stat_areas_clinics.set_index('stat_area_id')

In [60]:
# Zero-initialised counter for every (clinic, age group) combination, age groups 0 and 1
network_pop_by_clinic = {clinic_age: 0 for clinic_age in product(population_by_clinic.index, [0, 1])}
network_pop_by_clinic

{(1000, 0): 0,
 (1000, 1): 0,
 (1106, 0): 0,
 (1106, 1): 0,
 (1107, 0): 0,
 (1107, 1): 0,
 (1108, 0): 0,
 (1108, 1): 0,
 (1109, 0): 0,
 (1109, 1): 0,
 (1110, 0): 0,
 (1110, 1): 0,
 (1111, 0): 0,
 (1111, 1): 0,
 (1114, 0): 0,
 (1114, 1): 0,
 (1115, 0): 0,
 (1115, 1): 0,
 (1116, 0): 0,
 (1116, 1): 0,
 (1117, 0): 0,
 (1117, 1): 0,
 (1118, 0): 0,
 (1118, 1): 0,
 (1119, 0): 0,
 (1119, 1): 0,
 (1120, 0): 0,
 (1120, 1): 0,
 (1121, 0): 0,
 (1121, 1): 0,
 (1124, 0): 0,
 (1124, 1): 0,
 (1125, 0): 0,
 (1125, 1): 0,
 (1126, 0): 0,
 (1126, 1): 0,
 (1127, 0): 0,
 (1127, 1): 0,
 (1128, 0): 0,
 (1128, 1): 0,
 (1129, 0): 0,
 (1129, 1): 0,
 (1130, 0): 0,
 (1130, 1): 0,
 (1131, 0): 0,
 (1131, 1): 0,
 (1132, 0): 0,
 (1132, 1): 0,
 (1134, 0): 0,
 (1134, 1): 0,
 (1135, 0): 0,
 (1135, 1): 0,
 (1136, 0): 0,
 (1136, 1): 0,
 (1138, 0): 0,
 (1138, 1): 0,
 (1139, 0): 0,
 (1139, 1): 0,
 (1140, 0): 0,
 (1140, 1): 0,
 (2207, 0): 0,
 (2207, 1): 0,
 (2209, 0): 0,
 (2209, 1): 0,
 (2210, 0): 0,
 (2210, 1): 0,
 (2211, 0)

In [6]:
# Population per clinic, indexed by clinic code, with its columns renamed to 'data_population'
# NOTE(review): `population_by_clinic_path` is not defined in this notebook as shown - confirm its source
population_by_clinic = pd.read_csv(population_by_clinic_path).set_index('clinic_code')
population_by_clinic.columns = ['data_population']

In [73]:
# NOTE(review): `stat_areas_clinics_path` is not defined in this notebook as shown - confirm its source
stat_areas_clinics = pd.read_csv(stat_areas_clinics_path)

# Population of each row's clinic. The scalar is looked up explicitly: returning the whole
# one-element row made apply() build a DataFrame that was then assigned to a single column
stat_areas_clinics['popualtion'] = stat_areas_clinics.clinic_code.apply(
    lambda code: population_by_clinic.loc[code, 'data_population'])

# Share of ages 0-17, taken positionally from nafot.gdf
# (assumes nafot.gdf has the same rows in the same order as this table - TODO confirm)
stat_areas_clinics['age0_17_pcnt'] = nafot.gdf.age0_17_pcnt.values

# Split the clinic population into children and adults using that share
stat_areas_clinics['population_children'] = stat_areas_clinics.popualtion*stat_areas_clinics.age0_17_pcnt
stat_areas_clinics['population_adult'] = stat_areas_clinics.popualtion*(1-stat_areas_clinics.age0_17_pcnt)
stat_areas_clinics['population_total'] = stat_areas_clinics.population_children + stat_areas_clinics.population_adult

In [74]:
# Sum the child / adult / total population columns per clinic and save the result
age_columns = ['population_children', 'population_adult', 'population_total']
population_by_clinic_age = stat_areas_clinics.groupby('clinic_code')[age_columns].sum()
population_by_clinic_age.to_pickle('./model/population_by_clinic_age.pickle')
# Alternative target: 'L:/Dor/Data/vaccination_data/population_by_clinic_age.pickle'

# OLD

In [None]:
# net_contact_mat = contact_matrix.copy()*0
# for n in tqdm_notebook(prep_data['network'].nodes):
#     # Get node's area and age
#     node = prep_data['network'].nodes[n]
#     node_area = node['area']
#     node_age = node['age']
    
#     # Go over contacts and add to the matrix
#     for m in prep_data['network'][n]:
#         # Get contact area and age
#         contact = prep_data['network'].nodes[m]
#         contact_area = contact['area']
#         contact_age = contact['age']
#         net_contact_mat.loc[[(node_area, node_age)], [(contact_area, contact_age)]] += 1

In [30]:
# net_contact_mat2 = net_contact_mat.copy()

In [33]:
# net_contact_mat_norm = net_contact_mat.divide(net_contact_mat.sum(1), 'rows')

PageRank

In [34]:
# # Create a graph
# G = nx.from_numpy_matrix(net_contact_mat_norm.values, create_using=nx.DiGraph())

# # Calculate PageRank
# pagerank_dict = nx.pagerank_numpy(G, alpha=1)

# # Use (area, age) as keys
# pagerank_by_area_age = {net_contact_mat_norm.index[i]: rank for i, rank in pagerank_dict.items()}


# # Save dict
# with open('../../Data/vaccination_data/pagerank_by_area_age_net.pickle', 'wb') as pickle_out:
#     pickle.dump(pagerank_by_area_age, pickle_out)

In [22]:
# # Load contact matrix
# contact_matrix = pd.read_pickle(model.contact_matrix_path)
# contact_matrix.fillna(0, inplace=True)

# # Define a threshold
# thresh = 1e-8
# prop_below_thresh = ((contact_matrix < thresh).sum(1) / contact_matrix.shape[0]).mean()
# print(f'Average proportion of edges with probability <= {thresh} per node: {prop_below_thresh:.3f}')
# print(f'Number of edges removed {(contact_matrix < thresh).sum().sum():,d} ({((contact_matrix < thresh).sum().sum() / 2049**2)*100:.2f}%)')

# # Update contact matrix
# contact_matrix_thresh = contact_matrix*(contact_matrix > thresh)

# # Normlize
# contact_matrix_thresh = contact_matrix_thresh.divide(contact_matrix_thresh.sum(1), 'rows')

PageRank

In [95]:
# # Create a graph
# G = nx.from_numpy_matrix(contact_matrix_thresh.values, create_using=nx.DiGraph())

# # Calculate PageRank
# pagerank_dict = nx.pagerank_numpy(G, alpha=1)

# # Use (area, age) as keys
# pagerank_by_area_age = {contact_matrix_thresh.index[i]: rank for i, rank in pagerank_dict.items()}

# # Remove irrelevant areas
# for (area, age) in set(pagerank_by_area_age.keys()) - relevant_stat_area_age:
#     pagerank_by_area_age.pop((area, age))

# # Save dict
# with open('../../Data/vaccination_data/pagerank_by_area_age_thresh.pickle', 'wb') as pickle_out:
#     pickle.dump(pagerank_by_area_age, pickle_out)