Load data

In [1]:
import pandas as pd
from igraph import *
import cairo
import numpy as np
import numpy.random as random
import math
from matplotlib import pyplot as plt
from scipy.stats import bernoulli

In [2]:
# Read the CSV a single time; `matrix` keeps the whole table for the edge list.
matrix = pd.read_csv('HTTLPR.csv')

# Attribute list: the first 6 columns of the file are the per-study attributes.
# PaperID is a copy of the row index, used later as the integer id of each study.
attr_list = matrix.iloc[:, :6].assign(PaperID=lambda frame: frame.index)
attr_list

Unnamed: 0,Study,Year,YearOnline,Outcome,OutcomeSmallestPvalue,Abstract,PaperID
0,Mossner,2001,2001,Positive,Positive,Positive,0
1,Caspi,2003,2003,Positive,Positive,Positive,1
2,Eley,2004,2004,Negative,Negative,Positive,2
3,Grabe_a,2005,2004,Unclear,Unclear,Partially supportive,3
4,Kaufman_a,2004,2004,Positive,Positive,Positive,4
...,...,...,...,...,...,...,...
68,Grabe_b,2012,2012,Negative,Negative,Partially supportive,68
69,Petersen,2012,2012,Positive,Positive,Positive,69
70,Beaver,2012,2012,Positive,Positive,Partially supportive,70
71,Brown,2013,2012,Negative,Positive,Positive,71


In [3]:
# Select only the adjacency matrix by removing the attribute columns.
# Dropping by column name (instead of `iloc[:, 6:]`) keeps this cell idempotent:
# re-running it no longer strips six more columns from an already-trimmed frame.
# `errors='ignore'` skips names that are not present (e.g. the derived PaperID
# column, or the attribute columns themselves on a second run).
matrix = matrix.drop(columns=attr_list.columns, errors='ignore')
matrix

Unnamed: 0,Mossner,Caspi,Eley,Grabe_a,Kaufman_a,Gillespie,Kendler,Surtees,Sjoberg,Nakatani,...,Comasco_a,Cicchetti_b,Jenness,Scheid_b,Quinn,Grabe_b,Petersen,Beaver,Brown,Wilhelm_b
0,,,,,,,,,,,...,,,,,,,,,,
1,-,,,,,,,,,,...,,,,,,,,,,
2,-,X,,,,,,,,,...,,,,,,,,,,
3,-,X,-,,,,,,,,...,,,,,,,,,,
4,-,X,-,-,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,-,X,-,X,-,-,X,-,X,-,...,-,-,-,-,-,,,,,
69,-,X,X,-,-,-,X,X,X,-,...,-,-,-,-,-,-,,,,
70,-,X,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,,,
71,-,X,X,-,-,-,-,-,X,-,...,-,-,-,-,-,-,-,-,,


In [4]:
# Lookup tables between study name and integer PaperID.
# search_dict:         Study   -> PaperID
# search_dict_reverse: PaperID -> Study
search_dict = pd.Series(attr_list['PaperID'].values, index=attr_list['Study']).to_dict()
search_dict_reverse = pd.Series(attr_list['Study'].values, index=attr_list['PaperID']).to_dict()

# Relabel the adjacency-matrix columns from study names to PaperIDs.
# A column is acceptable if it is a known study name (first run) or already a
# known PaperID (cell re-run); anything else is still reported as a KeyError,
# as the original dictionary lookup did.
unmapped = [
    col for col in matrix.columns
    if col not in search_dict and col not in search_dict_reverse
]
if unmapped:
    raise KeyError(f"matrix columns with no matching Study: {unmapped}")

# `rename` leaves labels that are not keys of the mapping untouched, so running
# this cell twice is a no-op instead of failing on the integer labels.
matrix = matrix.rename(columns=search_dict)
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,72
0,,,,,,,,,,,...,,,,,,,,,,
1,-,,,,,,,,,,...,,,,,,,,,,
2,-,X,,,,,,,,,...,,,,,,,,,,
3,-,X,-,,,,,,,,...,,,,,,,,,,
4,-,X,-,-,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,-,X,-,X,-,-,X,-,X,-,...,-,-,-,-,-,,,,,
69,-,X,X,-,-,-,X,X,X,-,...,-,-,-,-,-,-,,,,
70,-,X,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,-,,,
71,-,X,X,-,-,-,-,-,X,-,...,-,-,-,-,-,-,-,-,,


Reconstruct claim-specific citation network g

In [5]:
# Vertex attributes taken from the attribute list:
# study name -> name/label, study outcome -> outcome, YearOnline -> Year
study = attr_list['Study'].to_list()
outcome = attr_list['Outcome'].to_list()

# initialize the graph object with one vertex per study (no hard-coded count)
g = Graph(directed=True)
g.add_vertices(len(study))
g.vs['name'] = study
g.vs["label"] = g.vs["name"]
g.vs['outcome'] = outcome
# Pass a plain list so the values are assigned by position rather than through
# the Series' index labels.
g.vs['Year'] = attr_list['YearOnline'].to_list()
g.vs['color'] = 'light blue'

# Create edges for the real graph from the matrix: a cell whose text is "X"
# (ignoring spaces) at row i / column j becomes the directed edge i -> j.
# Non-string cells (e.g. NaN) and other markers such as "-" produce no edge.
# NOTE(review): i and j are positions, so this assumes rows and columns are both
# ordered by PaperID - confirm against the CSV layout.
cells = matrix.to_numpy()
edge_list = [
    (i, j)
    for i in range(cells.shape[0])
    for j in range(cells.shape[1])
    if isinstance(cells[i, j], str) and cells[i, j].replace(" ", "") == "X"
]

# Insert all edges in one call; edge order (and therefore edge IDs) matches the
# row-major scan above.
g.add_edges(edge_list)
len(g.es)

488

Visualize the network

In [6]:
# Drawing options for the observed network ("kk" = Kamada-Kawai layout).
visual_style = {
    "layout": "kk",
    "margin": 50,
    "bbox": (600, 600),
    "vertex_label_size": 10,
    "edge_arrow_size": 0.5,
    "edge_arrow_width": 1.5,
}
vis = plot(g, **visual_style)
vis.show()

Calculate the citation probability for each year (actual edges / potential edges)

In [7]:
# Potential graph: every edge that could exist given the year constraint.
# Minimum difference in Year between the source and the target of an edge.
year_gap = 1

g_full = Graph(directed=True)
g_full.add_vertices(len(study))
g_full.vs['name'] = study
g_full.vs["label"] = g_full.vs["name"]
g_full.vs['outcome'] = outcome
# Plain list so values are assigned by position, not via the Series' labels.
g_full.vs['Year'] = attr_list['YearOnline'].to_list()

# An edge i -> j is possible when j precedes i in vertex order (j < i) and
# vertex i's Year is at least `year_gap` later than vertex j's. Years are read
# from g_full itself, so this cell does not depend on the observed graph `g`.
years = g_full.vs['Year']
full_edge_list = [
    (i, j)
    for i in range(len(years))
    for j in range(i)
    if years[i] - years[j] >= year_gap
]

# One bulk insertion; edges keep the (i ascending, j ascending) order, so edge
# IDs are the same as when they were added one at a time.
g_full.add_edges(full_edge_list)
g_full.get_edge_dataframe()

Unnamed: 0_level_0,source,target
edge ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1,0
1,2,0
2,2,1
3,3,0
4,3,1
...,...,...
2343,72,62
2344,72,63
2345,72,64
2346,72,65


In [8]:
def _edges_from_year(graph, year):
    """Return the number of edges in `graph` whose source vertex has Year == year."""
    sources = graph.vs.select(Year=year).indices
    return len(graph.es.select(_source_in=sources))


# Distinct years of the vertices that act as a source in the potential graph.
Year_source = np.unique([g.vs[edge.source]['Year'] for edge in g_full.es])

# Potential edges per source year (denominator of the probability).
n = [_edges_from_year(g_full, year) for year in Year_source]

# Observed edges per source year divided by the potential ones.
prob = [_edges_from_year(g, year) / potential for year, potential in zip(Year_source, n)]
# replace the first 0 with 0.5 (the first entry is overwritten unconditionally)
prob[0] = 0.5
prob = pd.Series(prob, index=Year_source)

num = pd.Series(n, index=Year_source)
num

2003      1
2004      8
2005     24
2006     80
2007    198
2008    232
2009    407
2010    336
2011    660
2012    402
dtype: int64

Create the simulated network

In [9]:
# Simulated network g_sim: same 73 vertices and vertex attributes as the other
# graphs, created without any edges.
g_sim = Graph(n=73, directed=True)

# Attributes are set in the same order as before; 'label' mirrors 'name' and
# the colour string is applied to every vertex.
for attribute, values in (
    ('name', study),
    ('label', study),
    ('outcome', outcome),
    ('color', 'light blue'),
    ('Year', attr_list['YearOnline']),
):
    g_sim.vs[attribute] = values

# Filled with (source, target) pairs when the simulated edges are drawn.
sim_edge_list = []

In [10]:
# construct graph
# `random` is numpy.random here; bernoulli.rvs is called without a
# random_state, so seeding NumPy's global generator makes the draws repeatable.
random.seed(1)

# For every source year, draw one Bernoulli trial per potential edge with that
# year's probability and store the outcome on g_full as edge attribute 'Citation'.
for year in Year_source:
    year_edges = g_full.es.select(_source_in=g_full.vs.select(Year=year).indices)
    year_edges['Citation'] = bernoulli.rvs(prob[year], size=len(year_edges))

# Keep only the potential edges whose trial came out as 1.
edge_dataframe = g_full.get_edge_dataframe()
sim_edge_dataframe = edge_dataframe.loc[edge_dataframe['Citation'] == 1]

# Rebuild the edge list and the graph's edges from scratch so that re-running
# this cell does not append the same edges a second time.
sim_edge_list = list(zip(sim_edge_dataframe['source'].tolist(),
                         sim_edge_dataframe['target'].tolist()))
g_sim.delete_edges()  # called without arguments: removes every existing edge
g_sim.add_edges(sim_edge_list)

vis = plot(g_sim, layout="kk", margin=50, bbox=(600, 600), vertex_label_size=10,
           edge_arrow_size=0.5, edge_arrow_width=1.5)
vis.show()

In [11]:
# Number of edges in the simulated network.
g_sim.ecount()

498