In [None]:
import glob
import os
import re
import shutil
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
import scipy.sparse
import scipy.sparse.csgraph
from tqdm import tqdm

In [None]:
# Resolve the notebook directory.  Note that '__file__' is a plain string here
# (not the module attribute), so abspath() anchors it at the current working
# directory and dirname() strips the dummy file name again.
current_note_path = os.path.dirname(os.path.abspath('__file__'))

# input directory (Pajek .net files) and output directory (generated CSVs)
INPUT_DIR = os.path.join(current_note_path, 'data')
OUTPUT_DIR = os.path.join(current_note_path, 'outputs')

# create whichever of the two directories does not exist yet
for _directory in (INPUT_DIR, OUTPUT_DIR):
    if not os.path.isdir(_directory):
        os.mkdir(_directory)

In [None]:
# Legacy (disabled): list the .net files straight from an absolute scratch path.
# source_path = '/scratch/bell/sido/constraints'
# file_location = os.path.join(source_path, '*.net')
# filenames = sorted(glob.glob(file_location))
# print(filenames)

In [None]:
# Once you run this code, comment it out
# Move every Pajek .net file found below the notebook directory into the
# `data` directory(=folder).
input_dir = Path(INPUT_DIR).resolve()

# Files that already sit directly inside `data` are excluded: moving a file
# onto itself makes shutil.move raise, which broke every re-run of this cell.
# (The former second search relied on `file_location`, which is only defined
# in a commented-out cell, and has been dropped.)
unique_dir_names = sorted(
    {
        f
        for f in Path(current_note_path).rglob('*.net')
        if f.resolve().parent != input_dir
    }
)

for file in unique_dir_names:
    target = input_dir / file.name
    if target.exists():
        # same file name already present in `data`; do not overwrite it
        print(f'skipped (already in data): {file}')
        continue
    shutil.move(str(file), str(target))
    print(f'moved file: {file}')

In [None]:
# Machine-specific working directories used in the past (kept for reference):
# os.chdir('/Users/yuliyu/Dropbox/Network_VC_2023/apr2023/quarterlyversion/Pajek')
# os.chdir('programming/RA/vc_syn_5yr1980q1.net')
# os.chdir('/scratch/bell/hu244/vxpert/quarterlyversion')

# show where relative paths will be resolved from
working_dir = os.getcwd()
print("Current working directory: {0}".format(working_dir))

In [None]:
# Collect the Pajek files from the `data` directory.  Sorting keeps the index
# of each file stable between runs, which the index-based loops rely on.
filenames = sorted(glob.glob(os.path.join(INPUT_DIR, '*.net')))
if not filenames:
    raise FileNotFoundError(f'no .net files found in {INPUT_DIR}')

#  print the whole path name
first_file = filenames[0]
print(first_file)

In [None]:
# Strip the directory part and keep only the file name
file_name = os.path.split(first_file)[1]
print(file_name)

Multiprocessing loop that computes the closeness centrality of every input network file and writes each result to its own CSV file

In [None]:
# directory that holds the Pajek input files
path = INPUT_DIR

# keep only the directory entries whose name ends in .net
files = [entry for entry in os.listdir(path) if entry.endswith(".net")]

# define a function to process a single file
def process_file(filename):
    """Compute closeness centrality for one Pajek file and save it as CSV.

    The network is read from ``path``/``filename``, converted to an undirected
    graph, and its all-pairs shortest-path matrix is computed with
    Floyd-Warshall.  For every row ``r`` of that matrix, with ``k`` the number
    of other nodes at finite distance and ``total`` the sum of those
    distances, the score is ``(k / total) * (k / (N - 1))``; rows with
    ``total == 0`` score 0.0.

    Parameters
    ----------
    filename : str
        Name of a ``.net`` file located in the module-level ``path``.

    Returns
    -------
    None
        The result is written to ``OUTPUT_DIR/closeness_<name>.csv`` with the
        columns ``nodes`` (row position in the adjacency matrix) and
        ``closeness``.
    """
    # "...5yr<tag>.net" -> "<tag>"; other file names fall back to their stem
    # instead of failing with an AttributeError on a missing match.
    match = re.search(r"5yr(.+?)\.net", filename)
    if match:
        name = match.group(1)
    else:
        name = os.path.splitext(os.path.basename(filename))[0]

    # read the graph and drop edge direction
    G = nx.read_pajek(os.path.join(path, filename)).to_undirected()

    # all-pairs shortest paths; unreachable pairs are reported as inf
    # NOTE(review): Floyd-Warshall is O(N^3); scipy.sparse.csgraph.shortest_path
    # with method="auto" may be much faster on sparse graphs - confirm first.
    A = nx.adjacency_matrix(G).tolil()
    D = scipy.sparse.csgraph.floyd_warshall(A, directed=False, unweighted=False)
    N = D.shape[0]  # N: number of all nodes

    closeness = []
    for r in range(N):
        row = D[r, :]
        reachable = row[row != np.inf]  # finite distances, incl. the 0 to itself
        total = reachable.sum()
        n_shortest_paths = len(reachable) - 1.0
        cc = 0.0
        if total > 0.0 and N > 1:
            s = n_shortest_paths / (N - 1)
            cc = (n_shortest_paths / total) * s
        closeness.append(cc)

    # Keep the node column in the file: the previous version wrote the frame
    # with index=False, which silently dropped the node identifiers.
    df = pd.DataFrame({"nodes": range(N), "closeness": closeness})
    df.to_csv(os.path.join(OUTPUT_DIR, f"closeness_{name}.csv"), index=False)

# run process_file over all files with a pool of worker processes
def main():
    """Process every entry of ``files`` in parallel and show a progress bar.

    One ``process_file`` task is submitted per file to a process pool that
    uses half of the available CPUs (at least one worker).  The call blocks
    until all tasks have finished; an exception raised inside a worker is
    re-raised here by ``future.result()``.

    NOTE(review): on platforms that start workers with "spawn", functions
    defined in a notebook cell may not be importable by the workers - confirm
    on the target machine.
    """
    # os.cpu_count() may be None or 1; the pool needs at least one worker
    max_workers = max(1, (os.cpu_count() or 1) // 2)

    # When you want to know the progress in tqdm, you need to specify the total number of files.
    with tqdm(total=len(files)) as progress:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for filename in files:
                # add_done_callback() returns None, so the future itself must
                # be stored before the callback is attached.
                future = executor.submit(process_file, filename)
                future.add_done_callback(lambda _done: progress.update())
                futures.append(future)

            # wait for all the tasks to complete (and surface worker errors)
            for future in futures:
                future.result()

if __name__ == "__main__":
    main()

print("Files are processed.")

# Cathy's work

In [None]:
# Sequential closeness export: one CSV per entry of `filenames`.
# myoutputpath = "/scratch/bell/hu244/vxpert/quarterlyversion/PajekOutput/"
myoutputpath = OUTPUT_DIR

# Upper bound on the number of files to process, clamped so the loop can never
# index past the end of `filenames` (which must be defined by an earlier cell).
end = min(174, len(filenames))
print(end)

for i in range(end):
    # group(0) is the whole "5yr....net" match, group(1) the tag in between
    name = re.search("5yr(.+?).net", filenames[i]).group(1)
    G = nx.read_pajek(filenames[i])
    print("Processing #", i , "file")
    print(name)
    print(G)  # check that the .net file was read

    # Adjacency matrix of G, then all-pairs shortest paths (inf = unreachable).
    # nx.adjacency_matrix() has no `format` keyword, so none is passed.
    A = nx.adjacency_matrix(G).tolil()
    D = scipy.sparse.csgraph.floyd_warshall(
            A, directed=False, unweighted=False)

    N = D.shape[0] # N: number of all nodes
    closeness_centrality = {}
    for r in range(N):
        row = D[r, :]
        reachable = row[row != np.inf]  # finite distances, incl. the 0 to itself
        total = reachable.sum()
        n_shortest_paths = len(reachable) - 1.0
        cc = 0.0
        if total > 0.0 and N > 1:
            s = n_shortest_paths / (N - 1)
            cc = (n_shortest_paths / total) * s
        closeness_centrality[r] = cc

    closeness_df = pd.DataFrame({
        "nodes": list(closeness_centrality.keys()),
        "closeness": list(closeness_centrality.values()),
    })

    # os.path.join inserts the separator that plain concatenation with
    # OUTPUT_DIR was missing (files used to land next to the output folder).
    newfile_closeness = os.path.join(myoutputpath, "closeness_" + name + ".csv")
    closeness_df.to_csv(newfile_closeness, index=False)

print("Files are processed.")

In [None]:
# Sequential closeness export (variant that keeps the disabled constraint and
# networkx-closeness alternatives for reference).
# myoutputpath = "/scratch/bell/hu244/vxpert/quarterlyversion/PajekOutput/"
myoutputpath = OUTPUT_DIR

# Upper bound on the number of files to process, clamped so the loop can never
# index past the end of `filenames` (which must be defined by an earlier cell).
end = min(174, len(filenames))
print(end)

for i in range(end):
    # group(0) is the whole "5yr....net" match, group(1) the tag in between
    name = re.search("5yr(.+?).net", filenames[i]).group(1)
    G = nx.read_pajek(filenames[i])
    print("Processing #", i , "file")
    print(name)
    print(G)  # check that the .net file was read

    # Disabled: Burt's constraint export.
#     constraint = nx.constraint(G, nodes=None, weight=None)
#     cons_df = pd.DataFrame()
#     cons_df["node"] = constraint.keys()
#     cons_df["constraint"] = constraint.values()
#     newfile="".join([myoutputpath,"constraint_", name,".csv"])
#     print(type(newfile))
#     cons_df.to_csv(newfile, index=False)

    # Adjacency matrix of G, then all-pairs shortest paths (inf = unreachable).
    # nx.adjacency_matrix() has no `format` keyword, so none is passed.
    A = nx.adjacency_matrix(G).tolil()
    D = scipy.sparse.csgraph.floyd_warshall(
            A, directed=False, unweighted=False)

    N = D.shape[0] # N: number of all nodes
    closeness_centrality = {}
    for r in range(N):
        row = D[r, :]
        reachable = row[row != np.inf]  # finite distances, incl. the 0 to itself
        total = reachable.sum()
        n_shortest_paths = len(reachable) - 1.0
        cc = 0.0
        if total > 0.0 and N > 1:
            s = n_shortest_paths / (N - 1)
            cc = (n_shortest_paths / total) * s
        closeness_centrality[r] = cc

    closeness_df = pd.DataFrame({
        "nodes": list(closeness_centrality.keys()),
        "closeness": list(closeness_centrality.values()),
    })

    # Disabled: the networkx implementation of the same measure.
#     closeness = nx.closeness_centrality(G, u=None, distance='weight', wf_improved=True)
#     closeness_df = pd.DataFrame()
#     closeness_df["nodes"] = closeness.keys()
#     closeness_df["closeness"] = closeness.values()

    # os.path.join inserts the separator that plain concatenation with
    # OUTPUT_DIR was missing (files used to land next to the output folder).
    newfile_closeness = os.path.join(myoutputpath, "closeness_" + name + ".csv")
    closeness_df.to_csv(newfile_closeness, index=False)

print("Files are processed.")

In [None]:
# 10:01PM 12 665 nodes
# 10:14pm 13 707 nodes
# 10: 46pm 15 808 nodes
# 12:46 18 988 nodes
# 1:39 19 1038 nodes

# requested time for jupyter ran out.

# 2:23am 19 1038 nodes
# cannot continue after the computer screen turns off
# 5/4 12:48pm 25 1248 nodes

#5/4 6:38pm #91 2002q4
#    6:49pm 99  2004q4  5477 nodes

In [None]:
# Smoke test on the first file: read it, build the simple-graph view and check
# what type nx.constraint returns.
G = nx.read_pajek(filenames[0])
print(G)  # check that the .net file was read
G1 = nx.Graph(G)
n = G1.number_of_nodes()
constraint = nx.constraint(G)  # nodes and weight are left at their None defaults
print(type(constraint))

## Additional Network Measures

In [None]:
# Export effective size and average neighbor degree for every quarter in
# [year_start, year_end].  Input files are looked up relative to the current
# working directory; quarters without a file are collected in `missing_list`.
year_start = 1980
year_end = 1980

missing_list = []

# NOTE(review): other cells spell this directory "StataOutput" - confirm which
# spelling exists on the (case-sensitive) target file system.
myoutputpath = "/scratch/bell/hu244/vxpert/quarterlyversion/Stataoutput/PajekOutput"

for year in range(year_start, year_end + 1):
    for quarter in range(1, 5):
        filename = "vc_syn_5yr" + str(year) + "q" + str(quarter) + ".net"
        name = str(year) + "q" + str(quarter)
        print(name)

        # skip (and remember) quarters whose file is not present
        if not os.path.isfile(filename):
            print(filename,"doesn't exist, skipped")
            missing_list.append(filename)
            continue
        print("File exist, processing")

        G = nx.read_pajek(filename)
        # NOTE(review): read_pajek returns a multigraph; confirm that the two
        # measures below treat parallel edges and `weight` as intended, or
        # collapse the graph with nx.Graph(G) first.
        # (Constraint and closeness exports used to be sketched here as
        # commented-out code; closeness is handled by the dedicated cells.)

        effective_size = nx.effective_size(G, nodes=None, weight='weight')
        effectivesize_df = pd.DataFrame({
            "node": list(effective_size.keys()),
            "effective_size": list(effective_size.values()),
        })
        # os.path.join supplies the separator that `myoutputpath` lacks
        newfile_effsize = os.path.join(myoutputpath, "effectivesize_" + name + ".csv")
        effectivesize_df.to_csv(newfile_effsize, index=False)

        avg_neighdegree = nx.average_neighbor_degree(G, nodes=None, weight='weight')
        avgneidegree_df = pd.DataFrame({
            "node": list(avg_neighdegree.keys()),
            "average_neighbor_degree": list(avg_neighdegree.values()),
        })
        # file name kept as before (no underscore between prefix and quarter)
        newfile_avgneidegree = os.path.join(myoutputpath, "avgneidegree" + name + ".csv")
        avgneidegree_df.to_csv(newfile_avgneidegree, index=False)

print("*********Loop is over*********")
print("The missing files:")
print(missing_list)

In [None]:
# constraint - Cathy's loop
# Timed closeness export: for each quarter in [year_start, year_end] the raw
# time.time() stamps of the main stages are printed next to the results.
import time


year_start = 1986
year_end = 1986

missing_list = []

myoutputpath = "/Users/kuanchensu/Documents/RA/network/"

year = year_start
while year <= year_end:
    for i in range(1, 5):  # quarters 1..4
        start_time1 = time.time()
        print(start_time1)
        name = "vc_syn_5yr" + str(year) + "q" + str(i)
        filename = name + ".net"
        print(filename)

        # guard clause: remember and skip quarters without an input file
        if not os.path.isfile(filename):
            print(filename,"doesn't exist, skipped")
            missing_list.append(filename)
            continue
        print("File exist, processing")

        start_time2 = time.time()
        print("time2", start_time2)

        # read the network; G1 / n describe its simple-graph view
        G = nx.read_pajek(filename)
        G1 = nx.Graph(G)
        n = nx.number_of_nodes(G1)
        start_time3 = time.time()
        print("time3", start_time3)

        # (The constraint export and the nx.closeness_centrality alternative
        # were disabled in this cell; closeness is computed by hand below.)
        A = nx.adjacency_matrix(G).tolil()
        D = scipy.sparse.csgraph.floyd_warshall(
            A, directed=False, unweighted=False)

        # from here on n is the size of the distance matrix
        n = D.shape[0]
        closeness_centrality = {}
        for r in range(n):
            # distances from node r that are not inf (includes the 0 to itself)
            finite_distances = [d for d in D[r, :] if d != np.inf]
            total = sum(finite_distances)
            n_shortest_paths = len(finite_distances) - 1.0
            if total > 0.0 and n > 1:
                cc = (n_shortest_paths / total) * (n_shortest_paths / (n - 1))
            else:
                cc = 0.0
            closeness_centrality[r] = cc

        closeness_df = pd.DataFrame({
            "nodes": list(closeness_centrality.keys()),
            "closeness": list(closeness_centrality.values()),
        })
        newfile_closeness = myoutputpath + "closeness_" + name + ".csv"
        start_time6 = time.time()
        print("time6", start_time6)
        closeness_df.to_csv(newfile_closeness, index=False)
        start_time7 = time.time()
        print("time7", start_time7)
    year += 1

# the loop has no break, so this summary always runs
print("*********Loop is over*********")
print("The missing files:")
print(missing_list)


## Convert Pajek .txt output to CSV

In [None]:
import os
import time
import networkx as nx
import pandas as pd

# Switch to the directory that holds the Pajek .txt exports; the conversion
# cells below use file names relative to it.
pajek_output_dir = '/scratch/bell/hu244/vxpert/quarterlyversion/StataOutput/PajekOutput'
os.chdir(pajek_output_dir)
print("Current working directory: {0}".format(os.getcwd()))

In [None]:
# convert txt to csv
# Loop by time: every quarter in [year_start, year_end] is converted from
# constraint_<year>q<n>.txt to constraint_<year>q<n>.csv in the working
# directory; quarters without a .txt file are collected in `missing_list`.

year_start = 1991
year_end = 1991

missing_list = []

myoutputpath = "/scratch/bell/hu244/vxpert/quarterlyversion/StataOutput/PajekOutput/"

year = year_start
while year <= year_end:
    for i in range(1, 5):  # quarters 1..4
        start_time1 = time.time()
        print(start_time1)
        stem = "constraint_" + str(year) + "q" + str(i)
        filename = stem + ".txt"
        name = stem + ".csv"
        print(filename)

        # guard clause: remember and skip quarters without an input file
        if not os.path.isfile(filename):
            print(filename,"doesn't exist, skipped")
            missing_list.append(filename)
            continue
        print("File exist, processing")

        # tab-separated export: first row skipped, no header row
        data = pd.read_csv(
            filename, skiprows=1, sep='\t', header=None, engine='python')
        # keep the 2nd and 3rd columns and give them readable names
        new = data.iloc[:, [1, 2]].rename(columns={1: "nodes", 2: "constraints"})
        new.to_csv(name, index=False)

    year += 1

# the loop has no break, so this summary always runs
print("*********Loop is over*********")
print("The missing files:")
print(missing_list)

In [None]:
# Loop by file name: convert an explicit list of constraint .txt exports
# (looked up in the current working directory) to CSV files of the same name.

# A dedicated name is used so that the list of .net files stored in `files`
# (read by main()) is not overwritten by this cell.
constraint_txt_files = ["constraint_1988q3.txt", "constraint_1989q1.txt"]

for filename in constraint_txt_files:
    # tab-separated export: first row skipped, no header row
    data = pd.read_csv(filename,
                       skiprows=1,
                       sep='\t',
                       header=None,
                       engine='python')
    # keep the 2nd and 3rd columns and give them readable names
    new = data.iloc[:, [1, 2]].rename(columns={1: "nodes", 2: "constraints"})
    # swap the extension: constraint_1988q3.txt -> constraint_1988q3.csv
    name = os.path.splitext(filename)[0] + ".csv"
    print(name)
    new.to_csv(name, index=False)