In [1]:
# Import Packages
import numpy as np
import pandas as pd
import networkx as nx
from sklearn import preprocessing as prep
from scipy import stats
import scipy.sparse as spr
from netivreg import G3SLS, GMM
import sys
sys.version

'3.9.12 (main, Apr  5 2022, 01:52:34) \n[Clang 12.0.0 ]'

In [2]:
# !pip install --upgrade numexpr==2.8.4
# !pip install --upgrade bottleneck==1.3.6

In [3]:
# # Upgrade all packages
# !pip install --upgrade pip
# !pip install --upgrade numpy
# !pip install --upgrade pandas
# !pip install --upgrade networkx
# !pip install --upgrade scikit-learn
# !pip install --upgrade scipy

In [4]:
# Print Versions
import scipy as sp
import sklearn as sk
print(f'Numpy: {np.__version__}')
print(f'Pandas: {pd.__version__}')
print(f'NetworkX: {nx.__version__}')
print(f'Scikit-learn: {sk.__version__}')
print(f'Scipy: {sp.__version__}')

Numpy: 1.26.4
Pandas: 2.2.2
NetworkX: 3.2.1
Scikit-learn: 1.5.0
Scipy: 1.13.1


In [5]:
# Simulated data
data = pd.read_stata("data/data_sim.dta")
# Read endogenous network
df_edges = pd.read_stata("data/W_sim.dta")
G = nx.from_pandas_edgelist(df_edges, 'source', 'target',
                            create_using=nx.DiGraph())
# Add nodes without edges
G.add_nodes_from(data["id"].unique())
W = nx.adjacency_matrix(G, nodelist=data["id"].unique())
W = prep.normalize(W, norm='l1', axis=1)
# Read exogenous network
df_edges0 = pd.read_stata("data/W0_sim.dta")
G0 = nx.from_pandas_edgelist(df_edges0, 'source', 'target',
                             create_using=nx.DiGraph())
# Add nodes without edges
G0.add_nodes_from(data["id"].unique())
W0 = nx.adjacency_matrix(G0, nodelist=data["id"].unique())
W0 = prep.normalize(W0, norm='l1', axis=1)
# Replicate results
name_y = 'y_endo'
name_x = ['x1', 'x2', 'x3', 'x4']
res_g3sls = G3SLS(name_y, name_x, W, W0, data)
res_gmm = GMM(name_y, name_x, W, W0, data)
with np.printoptions(precision=3, suppress=True):
    print(np.hstack([res_g3sls.params, res_gmm.params]))
with np.printoptions(precision=3, suppress=True):
    print(res_g3sls.cov)
    print(res_gmm.cov)

[[0.706 0.704]
 [0.346 0.349]
 [0.328 0.325]
 [0.362 0.361]
 [0.05  0.062]
 [0.378 0.388]
 [0.329 0.386]
 [0.344 0.329]
 [0.09  0.028]
 [1.036 1.026]]
[[ 0.009 -0.005 -0.005 -0.004 -0.003 -0.003 -0.001 -0.001 -0.    -0.029]
 [-0.005  0.016  0.004 -0.    -0.002  0.     0.001 -0.    -0.     0.013]
 [-0.005  0.004  0.008  0.001 -0.001  0.002  0.001  0.    -0.     0.015]
 [-0.004 -0.     0.001  0.009  0.003  0.001  0.001  0.     0.     0.013]
 [-0.003 -0.002 -0.001  0.003  0.022  0.001  0.001  0.001 -0.002  0.008]
 [-0.003  0.     0.002  0.001  0.001  0.003  0.     0.     0.001  0.009]
 [-0.001  0.001  0.001  0.001  0.001  0.     0.002 -0.    -0.     0.002]
 [-0.001 -0.     0.     0.     0.001  0.    -0.     0.002 -0.001  0.005]
 [-0.    -0.    -0.     0.    -0.002  0.001 -0.    -0.001  0.006  0.001]
 [-0.029  0.013  0.015  0.013  0.008  0.009  0.002  0.005  0.001  0.102]]
[[ 0.006 -0.003 -0.003 -0.004 -0.001 -0.001 -0.001 -0.    -0.002 -0.019]
 [-0.003  0.006  0.002  0.002  0.     0.     

In [6]:
# Check that covariance is positive definite
b_wald = res_gmm.params[:-1]
V_wald = res_gmm.cov[:-1, :-1]
print(np.all(np.linalg.eigvals(V_wald) > 0))
print((b_wald.T @ np.linalg.inv(V_wald) @ b_wald)[0][0])
np.linalg.cholesky(V_wald)
np.allclose(V_wald, V_wald.T)

True
5414.0879180316


True

In [7]:
###############
# Articles data

# Read data of scholars
data = pd.read_stata("data/articles.dta")

# Read coauthors network
df_edges = pd.read_stata("data/edges.dta")

# Read phd network
df_edges0 = pd.read_stata("data/edges0.dta")


In [8]:
data['id'] = 'node_' + data['id'].astype(str)
df_edges['source'] = 'node_' + df_edges['source'].astype(str)
df_edges['target'] = 'node_' + df_edges['target'].astype(str)
df_edges0['source'] = 'node_' + df_edges0['source'].astype(str)
df_edges0['target'] = 'node_' + df_edges0['target'].astype(str)

In [9]:
G = nx.from_pandas_edgelist(df_edges, 'source', 'target',
                            create_using=nx.DiGraph())
G.add_nodes_from(data["id"].unique())
W = nx.adjacency_matrix(G, nodelist=data["id"].unique())
W = prep.normalize(W, norm='l1', axis=1)

G0 = nx.from_pandas_edgelist(df_edges0, 'source', 'target',
                             create_using=nx.DiGraph())
G0.add_nodes_from(data["id"].unique())
W0 = nx.adjacency_matrix(G0, nodelist=data["id"].unique())
W0 = prep.normalize(W0, norm='l1', axis=1)

data = pd.get_dummies(data, columns=["journal", "year"], drop_first=True) * 1

In [10]:
data

Unnamed: 0,id,lcitations,editor,diff_gender,isolated,n_pages,n_authors,n_references,c_alumni,c_coauthor,journal_eca,journal_jpe,journal_qje,year_2001,year_2002
0,node_1,3.663562,0,0,1,14,1,40,1,0,0,0,0,0,0
1,node_12,3.044523,0,1,1,7,3,35,4,7,0,0,0,0,0
2,node_17,0.000000,0,0,0,7,1,10,4,30,0,0,0,0,0
3,node_21,2.302585,0,0,1,15,2,27,4,55,0,0,0,0,0
4,node_31,3.806663,0,0,1,21,1,39,4,78,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,node_655,4.997212,0,0,0,36,2,28,4,408,0,0,1,0,1
725,node_665,4.418840,0,0,0,45,2,61,4,547,0,0,1,0,1
726,node_687,3.332205,0,0,0,45,2,62,4,434,0,0,1,0,1
727,node_708,3.258096,0,1,0,33,2,43,4,562,0,0,1,0,1


In [11]:
# Replicate results
name_y = 'lcitations'
name_x = ['editor', 'diff_gender', 'n_pages', 'n_authors', 'n_references',
          'isolated', 'journal_eca', 'journal_jpe', 'journal_qje',
          'year_2001', 'year_2002']
name_xs = ['editor', 'diff_gender']
name_xins=['editor', 'diff_gender', 'n_pages', 'n_authors', 'n_references', 'isolated']

res_g3sls = G3SLS(name_y, name_x, W, W0, data,
                  name_xs=name_xs, cluster='c_coauthor')
res_gmm = GMM(name_y, name_x, W, W0, data,
              name_xs=name_xs, maxp=4)
res_gmm_xs = GMM(name_y, name_x, W, W0, data,
              name_xs=name_xs, name_xins=name_xins, maxp=4)

with np.printoptions(precision=3, suppress=True):
    print(np.hstack([res_g3sls.params, res_gmm.params, res_gmm_xs.params]))

with np.printoptions(precision=3, suppress=True):
    print(res_g3sls.cov)
    print(res_gmm.cov)

stats.chi2.pdf(res_g3sls.wald, res_g3sls.df_m)[0][0]
stats.chi2.pdf(res_gmm.wald, res_gmm.df_m)[0][0]

[[ 0.269  1.742  0.682]
 [ 4.309  0.092  0.297]
 [-2.589 -3.035 -2.766]
 [ 0.217 -0.497 -0.254]
 [ 0.216  0.6    0.559]
 [ 0.029  0.024  0.029]
 [ 0.055  0.043  0.012]
 [ 0.012  0.002  0.007]
 [-0.215  5.514  1.746]
 [-0.354 -0.111 -0.266]
 [-0.419 -0.216 -0.274]
 [-0.205 -0.436 -0.287]
 [ 0.04   0.224  0.203]
 [ 0.107  0.103  0.165]
 [ 1.974 -3.075  0.487]]
[[ 0.018 -0.246 -0.06  -0.     0.002  0.    -0.001 -0.     0.001  0.001
   0.001 -0.003 -0.001 -0.001 -0.014]
 [-0.246 22.099 -8.811 -0.041 -0.046 -0.003  0.023  0.001 -0.019  0.028
  -0.019  0.048  0.031  0.021  0.11 ]
 [-0.06  -8.811  7.013  0.014  0.004  0.001 -0.005 -0.     0.01  -0.033
  -0.004 -0.015  0.006  0.011 -0.012]
 [-0.    -0.041  0.014  0.011 -0.002 -0.     0.     0.     0.002  0.
   0.001  0.001  0.    -0.001 -0.001]
 [ 0.002 -0.046  0.004 -0.002  0.014  0.    -0.003 -0.    -0.002  0.
  -0.    -0.001 -0.    -0.001  0.002]
 [ 0.    -0.003  0.001 -0.     0.     0.     0.    -0.     0.    -0.
  -0.    -0.    -0.     0.

4.039580894221834e-21