In [None]:
import pandas as pd
import networkx as nx
from functions import *

from collections import defaultdict, Counter
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

In [None]:
# Load the tab-separated data tables.
academic = pd.read_csv("data/academic.tsv", sep="\t")
advises = pd.read_csv("data/advises.tsv", sep="\t")
country = pd.read_csv("data/country.tsv", sep="\t")
degree_grant = pd.read_csv("data/degree_grant.tsv", sep="\t")
school = pd.read_csv("data/school.tsv", sep="\t")
degree1 = pd.read_csv("data/new_degree.tsv", sep="\t", keep_default_na=True)

# Display name used as the lookup key for people ("<given> <family>").
academic["Full_Name"] = academic["given_name"] + " " + academic["family_name"]

# Directed advisor -> advisee graph, plus an undirected copy for path searches.
adj_list = list(zip(advises["advisor"], advises["advisee"]))
G = nx.DiGraph()
G.add_edges_from(adj_list)
G_undirect = G.to_undirected()

# Lookup tables. When an index value repeats, the last row wins.
Fullname2ID = academic.set_index("Full_Name")["academic_id"].to_dict()
ID2Name = academic.set_index("academic_id")["Full_Name"].to_dict()
country2CN = country.set_index("country_id")["country_name"].to_dict()

# Attach the display name to every graph node that has one.
nx.set_node_attributes(G, ID2Name, "full_name")

In [None]:
# Load Fields Medalists and resolve each winner to a node ID in G.
medalists = pd.read_csv("new_medalists.csv", sep=",")

# Hand-picked corrections for winners whose name lookup gives the wrong ID.
# A name-based correction takes precedence over an ID-based one.
NAME_OVERRIDES = {
    "Paul Cohen": 6479,
    "Michael Freedman": 1365,
    "Alan Baker": 22765,
}
ID_OVERRIDES = {
    230591: 93772,
    245820: 15779,
    211588: 6488,
}

medalist_IDs = []
for winner in medalists["Winner"].values:
    # The lookup happens first, so every winner must exist in Fullname2ID.
    looked_up = Fullname2ID[winner]
    if winner in NAME_OVERRIDES:
        ID = NAME_OVERRIDES[winner]
    else:
        ID = ID_OVERRIDES.get(looked_up, looked_up)
    # Flag the node as a medalist; raises KeyError if the ID is not in G.
    G.nodes[ID]["medalist"] = 1
    medalist_IDs.append(ID)

In [None]:
# Build the "network of elites": every medalist plus every node lying on a
# shortest undirected path between any pair of medalists.
subgraph = defaultdict(int)
for i, a in enumerate(medalist_IDs):
    subgraph[a] = 1
    for b in medalist_IDs[i + 1:]:
        # nx.has_path is itself implemented as a shortest_path call, so
        # checking it first ran the same search twice per pair.  Search once
        # and skip pairs that lie in different connected components.
        try:
            path_nodes = nx.shortest_path(G_undirect, a, b)
        except nx.NetworkXNoPath:
            continue
        for p in path_nodes:
            subgraph[p] = 1
subgraph_nodes = list(subgraph.keys())
K = G.subgraph(subgraph_nodes) # Network of Elites

# Meso Analysis by Country

In [None]:
# Lookup chain: academic -> degree -> school -> country id.
# When an index value repeats, the last row wins.
academic2degree = degree1.set_index("academic")["degree_id"].to_dict()
degree2school = degree_grant.set_index("degree")["school"].to_dict()
school2country = school.set_index("school_id")["country"].to_dict()
def ID2country(n1):
    """Return the country name of the school that granted an academic's degree.

    Follows academic2degree -> degree2school -> school2country -> country2CN.

    Parameters
    ----------
    n1 : hashable
        Academic ID (a node of the advisor graph).

    Returns
    -------
    str or None
        The country name, or None when any link of the chain is missing.
        Academic 56371 is hard-coded to "Germany" when the lookup fails.
    """
    try:
        degree_id = academic2degree[n1]
        school_id = degree2school[degree_id]
        country_id = school2country[school_id]
        return country2CN[country_id]
    except KeyError:
        # Only a missing key means "country unknown"; any other error is a
        # real bug and must not be silently swallowed.
        if n1 == 56371:
            return "Germany"
        return None
        
# Collapse the elite network K to country level:
#   node_size[country]     -> number of K nodes mapped to that country
#   edges_c[(src, dst)]    -> number of K edges going from src to dst country
edges_c = defaultdict(int)
node_size = defaultdict(int)
for n in K.nodes():
    src_country = ID2country(n)
    node_size[src_country] += 1
    for nei in K.neighbors(n):
        edges_c[(src_country, ID2country(nei))] += 1

# C keeps direction and edge counts as weights; CC is the unweighted,
# undirected skeleton of the same country pairs.
C = nx.DiGraph()
CC = nx.Graph()
for (src, dst), count in edges_c.items():
    C.add_edge(src, dst, weight=count)
    CC.add_edge(src, dst)

In [None]:
# Extract self-flow, inflow, outflow in dictionary form.
# nx.adj_matrix was a deprecated alias that NetworkX 3.0 removed;
# nx.adjacency_matrix is the supported name and uses the 'weight' edge
# attribute by default.
ethni_adj = nx.adjacency_matrix(C).toarray()
M = ethni_adj
eth2SF = {} # self-flow
eth2IF = {} # in-flow
eth2OF = {} # out-flow

# Rows/columns of M follow the node order of C, so enumerate the nodes in
# that same order to pair each matrix index with its country.
for i, n in enumerate(C.nodes()):
    Self_Flow = M[i, i]
    In_Flow = np.sum(M[:, i]) - Self_Flow   # column sum without the diagonal
    Out_Flow = np.sum(M[i, :]) - Self_Flow  # row sum without the diagonal

    eth2SF[n] = Self_Flow
    eth2IF[n] = In_Flow
    eth2OF[n] = Out_Flow

# Format into Dirichlet Format: each country becomes a point
# (in, out, self) / total, coloured by which component is smallest.
points = {}
color_dict = {}
for k in eth2SF.keys():
    NORM = eth2SF[k] + eth2IF[k] + eth2OF[k]
    points[k] = np.array([eth2IF[k], eth2OF[k], eth2SF[k]]) / NORM
    smallest = np.argmin(points[k])
    if smallest == 0:
        color_dict[k] = "r"
    elif smallest == 1:
        color_dict[k] = "b"
        # Diagnostic output: list countries whose flow is entirely in-flow.
        if points[k][0] == 1:
            print(k)
    else:
        color_dict[k] = "g"
        
# Plot every country on the (in-flow, out-flow, self-flow) simplex.
plt.figure(figsize=(12,8))

# Corners of the simplex, projected to 2D.
# NOTE(review): SimplexTo2D comes from `functions`; it is used here as if it
# maps an (n, 3) array to an (n, 2) array of x/y coordinates - confirm.
F = np.array([[1,0,0],
              [0,1,0],
              [0,0,1],
             ])
F = SimplexTo2D(F)

plt.fill(F.T[0],F.T[1],zorder=1,color="peachpuff")
label_list = ["In-Flow", "Out-Flow", "Self-Flow"]
loc_adjustment = np.array([[-0.10, -0.06],
                           [-0.1, 0.03],
                           [-0.07,-0.06]
                            ])

# Name each corner, nudged so the text sits outside the triangle.
for corner, nudge, corner_name in zip(F, loc_adjustment, label_list):
    new_loc = corner + nudge
    plt.text(*new_loc, corner_name, size=22, zorder=1, color='r')


# Project every country; B holds the same points as one array (it is not
# drawn here but is kept available as a variable).
points_2D = {}
B = np.zeros((len(points.keys()),3))

for i,k in enumerate(points.keys()):
    p = np.copy(points[k])
    points_2D[k] = SimplexTo2D(np.atleast_2d(p))
    B[i,:] = np.copy(points[k])

B = SimplexTo2D(B)
B = B.T

# Hand-tuned label placement.  The original if/elif chain contained
# statements after `continue` and a second "Norway" branch, none of which
# could ever run; these tables keep only the effective behaviour.
LABEL_OFFSETS = {
    "Democratic Republic of the Congo": [0., -0.01],
    "Belgium": [-0.07, 0.010],
    "Argentina": [0, -0.020],
    "Hungary": [-0.095, -0.03],
    "Germany": [-0.115, -0.03],
    "France": [-0.02, 0.01],
    "United Kingdom": [-0.005, -0.035],
    "Poland": [-0.005, 0.01],
    "Greece": [-0.03, 0.016],
    "Spain": [-0.03, 0.016],
    "HongKong": [-0.13, 0.016],
    "Tunisia": [0.0, -0.016],
    "Iran": [-0.01, 0.024],
}
LABEL_RENAMES = {
    "Democratic Republic of the Congo": "Congo",
    "Iran": "Iran, etc",
}
# Countries whose marker is drawn but whose name is not printed.
UNLABELED = {"Norway", "Croatia", "Uruguay", "Kenya", "Turkey", "Colombia"}

for k in points.keys():
    # Marker area grows with the number of elite-network members.
    plt.scatter(*points_2D[k][0], color= color_dict[k],zorder=3, s = 10+ node_size[k], alpha = 0.4)
    if k in UNLABELED:
        continue
    location = points_2D[k][0] + np.array([0.008,0.001])
    if k in LABEL_OFFSETS:
        location = location + np.array(LABEL_OFFSETS[k])
    label = LABEL_RENAMES.get(k, k)

    plt.text(*location, '%s' % (label), size=14, zorder=1, color='k')

plt.axis('off')
plt.xlim(-0.2,1.2)

## By Lingo-Ethnicity

In [None]:
# Per-academic ethnicity labels, split into a general group and a specific one.
# NOTE(review): read_pickle executes arbitrary code on load - only use a
# trusted wiki_fullname.pkl.
WIKI_FULL = pd.read_pickle("wiki_fullname.pkl")
Wiki_General = defaultdict(int)
Wiki_Specific = defaultdict(int)

# Specific labels that are folded into another name.
SPECIFIC_ALIASES = {
    "Muslim": "Arabic",
    "Hispanic": "Spanish",
    "British": "Anglo",
}

for idx, row in WIKI_FULL.iterrows():
    race_parts = row.race.split(",")
    general_region = race_parts[0]
    specif_region = race_parts[-1]
    if specif_region == "Jewish":
        # Fall back to the second-ranked column label.
        # NOTE(review): assumes columns from position 4 onward are
        # per-category scores - confirm against the pickle's schema.
        runner_up = row.iloc[4:].sort_values(ascending=False).index[1]
        specif_region = runner_up.split(",")[-1]

    specif_region = SPECIFIC_ALIASES.get(specif_region, specif_region)

    Wiki_General[row.academic_id] = general_region
    Wiki_Specific[row.academic_id] = specif_region

In [None]:
## Switched: aggregate the elite network by ethnicity label instead of country.
# *_GEN variables use the general group, the others the specific label.
# Nodes missing from the Wiki lookups fall into the defaultdict's 0 bucket.
edges_c = defaultdict(int)
node_size = defaultdict(int)
edges_GEN = defaultdict(int)
node_size_GEN = defaultdict(int)
for n in K.nodes():
    src_specific = Wiki_Specific[n]
    src_general = Wiki_General[n]
    node_size[src_specific] += 1
    node_size_GEN[src_general] += 1
    for nei in K.neighbors(n):
        edges_c[(src_specific, Wiki_Specific[nei])] += 1
        edges_GEN[(src_general, Wiki_General[nei])] += 1

In [None]:
# Weighted directed graphs of the specific (C) and general (GEN) labels,
# plus the undirected, unweighted skeleton (CC) of the specific one.
C = nx.DiGraph()
GEN = nx.DiGraph()
CC = nx.Graph()
C.add_weighted_edges_from((src, dst, count) for (src, dst), count in edges_c.items())
CC.add_edges_from(edges_c.keys())
GEN.add_weighted_edges_from((src, dst, count) for (src, dst), count in edges_GEN.items())

In [None]:
# Extract self-flow, inflow, outflow in dictionary form.
# nx.adj_matrix was a deprecated alias that NetworkX 3.0 removed;
# nx.adjacency_matrix is the supported name and uses the 'weight' edge
# attribute by default.
ethni_adj = nx.adjacency_matrix(C).toarray()
M = ethni_adj
eth2SF = {} # self-flow
eth2IF = {} # in-flow
eth2OF = {} # out-flow

# Rows/columns of M follow the node order of C, so enumerate the nodes in
# that same order to pair each matrix index with its label.
for i, n in enumerate(C.nodes()):
    Self_Flow = M[i, i]
    In_Flow = np.sum(M[:, i]) - Self_Flow   # column sum without the diagonal
    Out_Flow = np.sum(M[i, :]) - Self_Flow  # row sum without the diagonal

    eth2SF[n] = Self_Flow
    eth2IF[n] = In_Flow
    eth2OF[n] = Out_Flow

# Format into Dirichlet Format: each label becomes a point
# (in, out, self) / total, coloured by which component is smallest.
points = {}
color_dict = {}
for k in eth2SF.keys():
    NORM = eth2SF[k] + eth2IF[k] + eth2OF[k]
    points[k] = np.array([eth2IF[k], eth2OF[k], eth2SF[k]]) / NORM
    smallest = np.argmin(points[k])
    if smallest == 0:
        color_dict[k] = "r"
    elif smallest == 1:
        color_dict[k] = "b"
    else:
        color_dict[k] = "g"

In [None]:
# Plot every specific ethnicity label on the (in, out, self)-flow simplex.
plt.figure(figsize=(12,8))

# Corners of the simplex, projected to 2D.
# NOTE(review): SimplexTo2D comes from `functions`; it is used here as if it
# maps an (n, 3) array to an (n, 2) array of x/y coordinates - confirm.
F = SimplexTo2D(np.array([[1,0,0],
                          [0,1,0],
                          [0,0,1],
                         ]))

plt.fill(F.T[0],F.T[1],zorder=1, color = "peachpuff")
label_list = ["In-Flow", "Out-Flow", "Self-Flow"]
loc_adjustment = np.array([[-0.05, -0.05],
                           [-0.05, 0.05],
                           [-0.05,-0.05]
                            ])

# Name each corner, nudged away from the vertex.
for corner, nudge, corner_name in zip(F, loc_adjustment, label_list):
    new_loc = corner + nudge
    plt.text(*new_loc, corner_name, size=22, zorder=1, color='r')

# Project every label; B holds the same points as one array (not drawn here).
points_2D = {}
B = np.zeros((len(points.keys()),3))
for i,k in enumerate(points.keys()):
    p = np.copy(points[k])
    points_2D[k] = SimplexTo2D(np.atleast_2d(p))
    B[i,:] = np.copy(points[k])

B = SimplexTo2D(B)
B = B.T

# Hand-tuned label text and placement.
SPECIFIC_RENAMES = {
    "IndianSubContinent": "Indian/Asia",
    "Africans": "African",
}
SPECIFIC_OFFSETS = {
    "EastEuropean": [0, -0.02],
    "Anglo": [-0.01, 0.01],
    "Japanese": [-0.06, 0.03],
    "Germanic": [-0.06, 0.03],
}

for k in points.keys():
    # Marker area is half the number of elite-network members.
    plt.scatter(*points_2D[k][0], color=color_dict[k],zorder=3 ,s = node_size[k]/2, alpha = 0.5)
    label = SPECIFIC_RENAMES.get(k, k)
    location = points_2D[k][0] + np.array([0.008,0.001])
    if k in SPECIFIC_OFFSETS:
        location = location + np.array(SPECIFIC_OFFSETS[k])
    plt.text(*location, '%s' % (label), size=16, zorder=1, color='k')

    
# Extract self-flow, inflow, outflow in dictionary form (general groups).
# nx.adj_matrix was a deprecated alias that NetworkX 3.0 removed;
# nx.adjacency_matrix is the supported name and uses the 'weight' edge
# attribute by default.
ethni_adj = nx.adjacency_matrix(GEN).toarray()
M = ethni_adj
eth2SF = {} # self-flow
eth2IF = {} # in-flow
eth2OF = {} # out-flow

# Rows/columns of M follow the node order of GEN, so enumerate the nodes in
# that same order to pair each matrix index with its group.
for i, n in enumerate(GEN.nodes()):
    Self_Flow = M[i, i]
    In_Flow = np.sum(M[:, i]) - Self_Flow   # column sum without the diagonal
    Out_Flow = np.sum(M[i, :]) - Self_Flow  # row sum without the diagonal

    eth2SF[n] = Self_Flow
    eth2IF[n] = In_Flow
    eth2OF[n] = Out_Flow

# Format into Dirichlet Format: each group becomes a point
# (in, out, self) / total, coloured by which component is smallest.
points_GEN = {}
color_dict_GEN = {}
for k in eth2SF.keys():
    NORM = eth2SF[k] + eth2IF[k] + eth2OF[k]
    points_GEN[k] = np.array([eth2IF[k], eth2OF[k], eth2SF[k]]) / NORM
    smallest = np.argmin(points_GEN[k])
    if smallest == 0:
        color_dict_GEN[k] = "r"
    elif smallest == 1:
        color_dict_GEN[k] = "b"
    else:
        color_dict_GEN[k] = "g"
    
# Overlay the general groups on the current figure as large opaque markers.
# NOTE(review): SimplexTo2D is used as if a 1-row input yields a 1-row
# (x, y) result - confirm against `functions`.
points_2D_GEN = {}
for i,k in enumerate(points_GEN.keys()):
    p = np.copy(points_GEN[k])
    points_2D_GEN[k] = SimplexTo2D(np.atleast_2d(p))

# Legend text per colour: the colour encodes which flow is the smallest,
# so the text names the two remaining (dominant) flows.
LEGEND_TEXT = {
    "r": "Out-/Self-Flow",
    "g": "In-/Out-Flow",
    "b": "In-/Self-Flow",
}

labeled = defaultdict(int)
for k in points_GEN.keys():
    colour = color_dict_GEN[k]
    # Give only the first marker of each colour a legend label.  Previously
    # every scatter call was labelled and `labeled` was never consulted, so
    # the legend repeated one entry per group.
    legend_label = None if labeled[colour] else LEGEND_TEXT[colour]
    plt.scatter(*points_2D_GEN[k][0], color=colour,zorder=3,s = 300 , alpha=1.0, label = legend_label)
    labeled[colour] = 1

    label = k
    location = points_2D_GEN[k][0] + np.array([0.02,-0.02])
    # Hand-tuned nudges to keep these two names clear of their markers.
    if k == "Asian":
        location += np.array([-0.06,-0.035])
    if k == "GreaterAfrican":
        location += np.array([-0.02,0.035])
    plt.text(*location, '%s' % (label), size=20, zorder=1, color='k')
    # Diagnostic output: list the groups that were drawn.
    print(k)

plt.legend(fontsize=20)

plt.axis('off')
plt.xlim(-0.2,1.2)