In [1]:
import numpy as np
import pandas as pd
import time

# 1

### 1.1 Data Input

### 1.2 Creating an Adjacency Matrix

In [2]:
Z = np.array([[1.0,0,2,0,4,3],[3,0,1,1,0,0],[2,0,4,0,1,0],
              [0,0,1,0,0,1],[8,0,3,0,5,2],[0,0,0,0,0,0]])

### 1.3 Modifying the Adjacency Matrix

In [3]:
# Set Zii equal to 0
for i in range(len(Z)):
    Z[i,i] = 0

In [4]:
Z

array([[0., 0., 2., 0., 4., 3.],
       [3., 0., 1., 1., 0., 0.],
       [2., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1.],
       [8., 0., 3., 0., 0., 2.],
       [0., 0., 0., 0., 0., 0.]])

In [5]:
# Normalize
H = Z.copy()
for i in range(len(H[0])):
    col_sum = 0
    for j in range(len(H)):
        col_sum = col_sum + H[j,i]
    if col_sum != 0:
        for k in range(len(H)):
            H[k,i] = H[k,i]/col_sum

In [6]:
H

array([[0.        , 0.        , 0.28571429, 0.        , 0.8       ,
        0.5       ],
       [0.23076923, 0.        , 0.14285714, 1.        , 0.        ,
        0.        ],
       [0.15384615, 0.        , 0.        , 0.        , 0.2       ,
        0.        ],
       [0.        , 0.        , 0.14285714, 0.        , 0.        ,
        0.16666667],
       [0.61538462, 0.        , 0.42857143, 0.        , 0.        ,
        0.33333333],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

### 1.4 Identifying the Dangling Nodes

In [7]:
d = np.zeros(len(H[0]))
for i in range(len(H[0])):
    col_sum = 0
    for j in range(len(H)):
        col_sum = col_sum + H[j,i]
    if col_sum == 0:
        d[i] = 1

In [8]:
d

array([0., 1., 0., 0., 0., 0.])

### 1.5 Calculating the Influence Vector

In [9]:
a = np.array([3.0,2,5,1,2,1])
A_tot = np.sum(a)
a = a/A_tot
a = a.reshape(6,1)
a

array([[0.21428571],
       [0.14285714],
       [0.35714286],
       [0.07142857],
       [0.14285714],
       [0.07142857]])

In [10]:
pi_0 = np.full((6, 1), 1/6)
pi_0

array([[0.16666667],
       [0.16666667],
       [0.16666667],
       [0.16666667],
       [0.16666667],
       [0.16666667]])

In [11]:
alp = 0.85
eps = 0.00001

In [12]:
eT = np.full((1, 6), 1)
eT

array([[1, 1, 1, 1, 1, 1]])

In [13]:
H_prime = H.copy()
for i in range(len(d)):
    if d[i] == 1:
        H_prime[:,i] = a.reshape(6,)
H_prime

array([[0.        , 0.21428571, 0.28571429, 0.        , 0.8       ,
        0.5       ],
       [0.23076923, 0.14285714, 0.14285714, 1.        , 0.        ,
        0.        ],
       [0.15384615, 0.35714286, 0.        , 0.        , 0.2       ,
        0.        ],
       [0.        , 0.07142857, 0.14285714, 0.        , 0.        ,
        0.16666667],
       [0.61538462, 0.14285714, 0.42857143, 0.        , 0.        ,
        0.33333333],
       [0.        , 0.07142857, 0.        , 0.        , 0.        ,
        0.        ]])

In [14]:
#P = alp*H_prime + (1-alp)*a@eT
#P

In [15]:
# Compute the influence vector
pi_star_k1 = pi_0
for i in range(100):
    pi_star_k2 = alp*H@pi_star_k1+(alp*d@pi_star_k1+(1-alp))*a
    if sum(np.abs(pi_star_k2-pi_star_k1)) < eps:
        print(i)
        break
    pi_star_k1 = pi_star_k2
pi_star_k2

17


array([[0.30402138],
       [0.16360304],
       [0.18979616],
       [0.04661906],
       [0.27531309],
       [0.02064727]])

### 1.6 Calculating Eigenfactor (EFi)

In [16]:
#  Calculating Eigenfactor
EF = 100*H@pi_star_k2/sum(H@pi_star_k2)
EF

array([[34.05100649],
       [17.20374224],
       [12.17545523],
       [ 3.6531636 ],
       [32.91663244],
       [ 0.        ]])

### Optimizing your code:

In [17]:
#calculating Eigenfactor Scores function
def cal_es(Z):    
    # Set diagonal equals to zero
    for i in range(len(Z)):
        Z[i,i] = 0
        
    # nomalize
    H = Z.copy()
    col_sums = []
    for i in range(len(H[0])):
        col_sum = 0
        for j in range(len(H)):
            col_sum = col_sum + H[j,i]
        col_sums = col_sums + [col_sum]
        if col_sum != 0:
            for k in range(len(H)):
                H[k,i] = H[k,i]/col_sum
            
    # Create d(N times)
    d = [0 if x != 0 else 1 for x in col_sums]
    d = np.array(d)
    
    # Calculating the influecne factor
    # A_tot
    a = np.full((len(H[0]), 1), 1)
    a = a/len(H[0])
    a = a.reshape(len(H[0]),1)
    
    #pi_0
    pi_0 = np.full((len(H[0]), 1), 1/len(H[0]))
    
    alp = 0.85
    eps = 0.00001
    
    #eT
    eT = np.full((1,len(H[0])), 1)
    
    # Compute the influence vector
    pi_star_k = pi_0
    for i in range(100):
        pi_star_k1 = alp*H@pi_star_k+(alp*(d@pi_star_k)+(1-alp))*a
        diff = np.sum(np.abs(pi_star_k1-pi_star_k))
        if diff < eps:
            break
        pi_star_k = pi_star_k1
    
    #  Calculating Eigenfactor
    EF = 100*H@pi_star_k1/(H@pi_star_k1).sum()
    journal_index = list(range(len(Z)))
    EF = pd.DataFrame(data=EF, index = journal_index)
    
    # Retrun EF and the number of iterations.
    return EF,i

In [18]:
# import files
with open('links.txt', 'r') as f:
    data = [[int(num) for num in line.split(',')] for line in f]

start = time.time()    
# change list to numpy array
data = np.array(data)

# combine all journals
citing_cited = np.concatenate([data[:,0], data[:,1]])

# make adjacency matrix
Z = np.empty([len(np.unique(citing_cited)), len(np.unique(citing_cited))])
for i in range(len(data)):
    col = data[i,0]
    row = data[i,1]
    times = data[i,2]
    Z[row,col] = times

# calculating Eigenfactor
result = cal_es(Z)

# print results
print(result[1])
EF = result[0]
stop = time.time()-start
print(stop)
EF.sort_values(0,ascending=False).iloc[0:20,:]

31
110.72124361991882


Unnamed: 0,0
4408,1.448119
4801,1.412719
6610,1.235035
2056,0.679502
6919,0.664879
6667,0.634635
4024,0.577233
6523,0.480815
8930,0.477773
6857,0.439735


### (a) The scores of the top 20 journals are on the above chart. 

### (b) The time of this alogrithm for this dataset is around 2 minutes(from constrcuting the adjacency matrix to calculate the result).

### (c)  It tooks 31 iterations to get to my answer.

### Below are my scratch code

In [None]:
Z = np.array([[1.0,0,2,0,4,3],[3,0,1,1,0,0],[2,0,4,0,1,0],
              [0,0,1,0,0,1],[8,0,3,0,5,2],[0,0,0,0,0,0]])

In [None]:
np.concatenate((Z,Z),axis = 1)

In [None]:
six = pd.DataFrame(data=Z)
result = cal_es(six)

In [None]:
result[0]

In [None]:
result[2]

In [None]:
#calculating Eigenfactor Scores function, df contains a N*N matrix
def cal_es(df):
    start = time.time()
    Z = df.copy()
    
    # Set diagonal equals to zero(N times)
    for i in range(len(Z)):
        Z.iloc[i,i] = 0
        
    # nomalize(N times)
    H = Z.copy()
    col_sums = H.sum(axis=0)
    for i in range(len(H[0])):
        if(col_sums.iloc[i]!=0):
            H.iloc[:,i] = H.iloc[:,i]/col_sums.iloc[i]
            
    # Create d(N times)
    d = np.zeros(len(H[0]))
    for i in range(len(H[0])):
        if(col_sums.iloc[i]==0):
            d[i] = 1
            
    # Calculating the influecne factor
    # A_tot
    a = np.full((len(H[0]), 1), 1)
    a = a/len(H[0])
    a = a.reshape(len(H[0]),1)
    
    #pi_0
    pi_0 = np.full((len(H[0]), 1), 1/len(H[0]))
    
    alp = 0.85
    eps = 0.00001
    
    #eT
    eT = np.full((1,len(H[0])), 1)
    
    # Compute the influence vector
    pi_star_k = pi_0
    for i in range(100):
        pi_star_k1 = alp*H@pi_star_k+(alp*(d@pi_star_k)+(1-alp))[0]*a
        diff = np.sum(np.abs(pi_star_k1-pi_star_k)).iloc[0]
        if diff < eps:
            break
        pi_star_k = pi_star_k1
    
    #  Calculating Eigenfactor
    EF = 100*H@pi_star_k1/(H@pi_star_k1).sum()
    stop = time.time()-start
    
    # Retrun EF, time and the number of iterations.
    return EF,stop,i

In [None]:
# Run using function
data = pd.read_csv('links.txt', sep=",", header=None)
data.columns = ["citing", "cited", "times"]

# Find all unique journals
all_journal = pd.concat([data.citing, data.cited], axis=0)
all_journal.nunique()

# Creating dataframes with the number of journal as name of the index and column
Z = np.empty([10748,10748])
df = pd.DataFrame(data=Z, columns = all_journal.unique(), index = all_journal.unique())

In [None]:
# Creating dataframes with the number of journal as name of the index and column
start = time.time()
for i in range(len(data)):
    df.loc[data.loc[i,"cited"],data.loc[i,"citing"]] = data.loc[i,"times"]
stop = time.time()-start
print(stop)

In [None]:
data = pd.read_csv('links.txt', sep=",", header=None)
data.columns = ["citing", "cited", "times"]

In [None]:
start = time.time()
for i in range(len(data)):
    df.loc[data.loc[i,"cited"],data.loc[i,"citing"]] = data.loc[i,"times"]
stop = time.time()-start
print(stop)

In [None]:
result = cal_es(df)

In [None]:
print(result[1])
EF = result[0]
EF = EF[0]
descending_EF = EF.sort_values(ascending=False)
print(descending_EF.iloc[0:20])

In [None]:
result[2]

In [None]:
# Find all unique journals
all_journal = pd.concat([data.citing, data.cited], axis=0)
all_journal.nunique()

In [None]:
# Creating dataframes with the number of journal as name of the index and column
Z = np.empty([10748,10748])
df = pd.DataFrame(data=Z, columns = all_journal.unique(), index = all_journal.unique())
df

In [None]:
# enter txt files to the dataframe
for i in range(len(data)):
    df.loc[data.loc[i,"cited"],data.loc[i,"citing"]] = data.loc[i,"times"]

In [None]:
Z = df.copy()
Z

In [None]:
# Set diagonal equal to 0
for i in range(len(Z)):
    Z.iloc[i,i] = 0

In [None]:
Z

In [None]:
# Nomalize
H = Z.copy()
col_sums = H.sum(axis=0)
for i in range(len(H[0])):
    if(col_sums.iloc[i]!=0):
        H.iloc[:,i] = H.iloc[:,i]/col_sums.iloc[i]

In [None]:
H

In [None]:
# Create d
d = np.zeros(len(H[0]))
for i in range(len(H[0])):
    if(col_sums.iloc[i]==0):
        d[i] = 1

In [None]:
# Calculating the influecne factor
# A_tot
A_tot = np.full((len(H[0]), 1), 1)
A_tot = A_tot/len(H[0])
A_tot = A_tot.reshape(len(H[0]),1)
A_tot

In [None]:
#pi_0
pi_0 = np.full((len(H[0]), 1), 1/len(H[0]))
pi_0

In [None]:
alp = 0.85
eps = 0.00001

In [None]:
#eT
eT = np.full((1,len(H[0])), 1)
eT

In [None]:
# Making H_prime
H_prime = H.copy()
for i in range(len(d)):
    if d[i] == 1:
        H_prime.iloc[:,i] = A_tot.reshape(len(H[0]),)
H_prime

In [None]:
#P
P = alp*H_prime + (1-alp)*A_tot@eT
P

In [None]:
# compute leading eigenvector
pi_star = pi_0
for i in range(100):
    pi_star = alp*H@pi_star+(alp*(d@pi_star)[0]+(1-alp))*A_tot
pi_star

In [None]:
EF = 100*H@pi_star/(H@pi_star).sum()
EF