# In [None]:
import pickle
import os
import re
import numpy as np
import networkx as nx
from gensim.models import word2vec
import networkx as nx 
from sklearn.cluster import AgglomerativeClustering
import xml.dom.minidom
import xml.etree.ElementTree as ET
from GCN import *
import community
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import *
import csv
from scipy.sparse.csgraph import connected_components
import random
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
import copy


class AliasSampling:
    """Discrete sampler built with the alias method.

    After construction:
      * ``self.n`` is the number of outcomes,
      * ``self.U`` holds, per column, the threshold below which the column's
        own index is returned,
      * ``self.K`` holds, per column, the alias index returned otherwise.
    """

    def __init__(self, prob):
        """Build the alias tables from ``prob``, a sequence of probabilities."""
        size = len(prob)
        self.n = size
        self.U = np.array(prob) * size
        self.K = list(range(size))

        # Columns holding more than / less than one unit of scaled mass.
        heavy = [col for col, mass in enumerate(self.U) if mass > 1]
        light = [col for col, mass in enumerate(self.U) if mass < 1]

        while heavy and light:
            donor = heavy.pop()
            receiver = light.pop()
            # The light column is topped up by the donor, which becomes its alias.
            self.K[receiver] = donor
            self.U[donor] = self.U[donor] - (1 - self.U[receiver])
            leftover = self.U[donor]
            if leftover > 1:
                heavy.append(donor)
            elif leftover < 1:
                light.append(donor)

    def sampling(self, n=1):
        """Draw ``n`` outcomes; return a single index when ``n == 1``, else a list."""
        scaled = self.n * np.random.rand(n)
        columns = np.floor(scaled)
        fractions = scaled - columns          # position inside the chosen column
        columns = columns.astype(np.int32)

        picks = []
        for column, fraction in zip(columns, fractions):
            if fraction < self.U[column]:
                picks.append(column)
            else:
                picks.append(self.K[column])

        return picks[0] if n == 1 else picks
        
        
def GHAC(mlist,G,idx_pid,n_clusters=-1):
    """Cluster embeddings with average-linkage agglomerative clustering.

    A symmetric similarity matrix is built first: for every index pair whose
    nodes ``idx_pid[i]`` / ``idx_pid[j]`` are connected in ``G`` the entry is
    the sigmoid of the dot product of their embeddings; every other entry
    (including the diagonal) is 0.  The negated matrix is handed to the
    clustering model as a precomputed "distance".

    Parameters
    ----------
    mlist : indexable collection of embedding vectors, one per node index.
    G : graph object exposing ``has_edge(u, v)``.
    idx_pid : mapping from node index to the node identifier used in ``G``.
    n_clusters : number of clusters; ``-1`` selects it automatically by
        maximising modularity over a range of candidate values.

    Returns
    -------
    The ``labels_`` array of the selected clustering.

    NOTE(review): ``nx.from_numpy_matrix`` and the ``affinity`` keyword look
    like legacy networkx / scikit-learn spellings -- confirm the pinned
    library versions before upgrading.
    """
    count = len(mlist)

    # Fill row by row; the lower triangle mirrors values already computed.
    sim_rows = []
    for a in range(count):
        current = []
        for b in range(count):
            if a > b:
                current.append(sim_rows[b][a])
            elif a < b and G.has_edge(idx_pid[a], idx_pid[b]):
                current.append(1/(1+np.exp(-np.dot(mlist[a],mlist[b]))))
            else:
                current.append(0)
        sim_rows.append(current)

    graph = np.array(sim_rows)
    distance = graph * -1

    if n_clusters != -1:
        # Caller fixed the number of clusters: a single fit is enough.
        fixed_model = AgglomerativeClustering(linkage="average",affinity='precomputed',n_clusters=n_clusters)
        fixed_model.fit(distance)
        return fixed_model.labels_

    top_score = -10000000

    # Lower bound for k: components of the full similarity graph.
    n_components, _ = connected_components(graph)
    Gr = nx.from_numpy_matrix(graph)

    # Edge pre-clustering: keep only strong similarities (> 0.9); the
    # resulting component count is the upper bound for k.
    graph[graph <= 0.9] = 0
    n_components1, _ = connected_components(graph)

    for k in range(n_components1, n_components - 1, -1):
        candidate_model = AgglomerativeClustering(linkage="average",affinity='precomputed',n_clusters=k)
        candidate_model.fit(distance)
        candidate = candidate_model.labels_

        membership = dict(enumerate(candidate))
        score = community.modularity(membership, Gr)
        # ">=" means a later (smaller) k wins ties.
        if score >= top_score:
            top_score = score
            best_labels = candidate

    return best_labels
  

def pairwise_evaluate(correct_labels,pred_labels):
    """Compute pairwise precision, recall and F1 of a clustering.

    Every unordered pair of items is considered: a pair is "predicted same"
    when both items share a predicted label and "truly same" when both share
    a correct label.

    Instead of enumerating all O(n^2) pairs, the pair counts are derived from
    label frequencies: a group of ``c`` items contributes ``c * (c - 1) / 2``
    pairs.  Labels therefore have to be hashable.

    Parameters
    ----------
    correct_labels : sequence of ground-truth labels.
    pred_labels : sequence of predicted labels, aligned with
        ``correct_labels``; entries beyond ``len(correct_labels)`` are ignored.

    Returns
    -------
    (precision, recall, f1) -- all ``0`` when no pair is correctly predicted.

    Raises
    ------
    IndexError : if ``pred_labels`` is shorter than ``correct_labels``.
    """
    from collections import Counter

    def count_pairs(groups):
        # Number of unordered pairs that fall inside the same group.
        return float(sum(size * (size - 1) // 2 for size in groups.values()))

    truth = list(correct_labels)
    guess = list(pred_labels)
    if len(guess) < len(truth):
        raise IndexError("pred_labels is shorter than correct_labels")
    guess = guess[:len(truth)]

    TP_FN = count_pairs(Counter(truth))            # Total Pairs To SameAuthor
    TP_FP = count_pairs(Counter(guess))            # Total Pairs Predicted To SameAuthor
    # A pair is a true positive when both the correct and the predicted
    # label agree, i.e. both items share the same (correct, predicted) tuple.
    TP = count_pairs(Counter(zip(truth, guess)))   # Pairs Correctly Predicted To SameAuthor

    if TP == 0:
        return 0, 0, 0

    pairwise_precision = TP / TP_FP
    pairwise_recall = TP / TP_FN
    pairwise_f1 = (2 * pairwise_precision * pairwise_recall) / (pairwise_precision + pairwise_recall)
    return pairwise_precision, pairwise_recall, pairwise_f1


# Word2vec model loaded from disk; it supplies the word vectors that are
# averaged into per-publication attributes further down.
save_model_name = "gene/word2vec.model"
model_w = word2vec.Word2Vec.load(save_model_name)

# Character class of punctuation (ASCII plus a few full-width/typographic
# marks); every run of these characters in a title is replaced by a space.
r = '[!“”"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~—～]+'

# Stop words are stored as Porter stems because title tokens are stemmed
# before being compared against this list.
# NOTE(review): 'algrithom' looks like a misspelling of 'algorithm', so
# "algorithm" is currently NOT filtered -- confirm before changing, since a
# fix alters the title-overlap graph.
stopword = [
    porter_stemmer.stem(w)
    for w in (
        'at', 'based', 'in', 'of', 'for', 'on', 'and', 'to',
        'an', 'using', 'with', 'the', 'method', 'algrithom', 'by', 'model',
    )
]

def _load_pair_graph(pair_file, idx_pid, accumulate):
    """Build a weighted graph from a tab-separated file of index pairs.

    Each non-blank line of ``pair_file`` starts with two integer publication
    indices separated by a tab.  They are translated to publication ids via
    ``idx_pid`` and connected by an edge of weight 1.  When ``accumulate`` is
    true a repeated pair increases the weight by 1; otherwise the weight
    stays 1.  Every id in ``idx_pid`` is added as a node, so isolated
    publications are kept.
    """
    graph = nx.Graph()
    graph.add_nodes_from(idx_pid[k] for k in range(len(idx_pid)))
    with open(pair_file, 'r', encoding='utf-8') as handle:
        for line in handle:
            fields = line.split('\t')
            if len(fields) < 2:
                # Blank or truncated line (e.g. trailing newline): nothing to add.
                continue
            keyi = idx_pid[int(fields[0].strip())]
            keyj = idx_pid[int(fields[1].strip())]
            if accumulate and graph.has_edge(keyi, keyj):
                graph[keyi][keyj]['weight'] += 1
            else:
                graph.add_edge(keyi, keyj, weight=1)
    return graph


def _append_negatives(sampler, source, count, u_i, u_j, label):
    """Append ``count`` negative training triples ``(source, sampled, -1)``.

    ``sampler`` is the negative sampler of ``source`` or ``None`` when the
    node has no negative candidates, in which case nothing is appended.
    """
    if sampler is None:
        return
    for _ in range(count):
        u_i.append(source)
        u_j.append(sampler.sampling())
        label.append(-1)


result=[]

path = "raw-data/"
file_names = os.listdir(path)
for entry in file_names:
    # Data processing
    fname, extension = os.path.splitext(entry)
    if extension.lower() != '.xml':
        # Only the XML files of the raw-data directory describe a name.
        continue
    with open(path + entry,'r',encoding = 'utf-8') as xml_file:
        f = xml_file.read()
    # Bare ampersands would make the XML parser fail, so they are blanked.
    text = f.replace("&", " ")
    root = ET.fromstring(text)

    correct_labels=[]
    p_to={} #Original title
    p_t={} #processed to word stem

    for publication in root.findall('publication'):
        pid = publication.find('id').text

        # Keep every publication as its own node: extend the id until it is
        # unique (a single '1' suffix collides from the third duplicate on,
        # which would leave fewer nodes than labels).
        while pid in p_t:
            pid = pid+'1'

        correct_labels.append(int(publication.find('label').text))

        line = publication.find('title').text or ''
        line = re.sub(r, ' ', line)
        line = line.replace('\t',' ')
        line = line.lower()
        split_cut = line.split(' ')

        p_t[pid]=[]
        p_to[pid]=[]
        for token in split_cut:
            if len(token)>1:
                p_to[pid].append(token)
                stem = porter_stemmer.stem(token)
                if stem not in stopword:
                    p_t[pid].append(stem)

    # Construct PHNet
    pid_idx={}
    idx_pid={}
    G = nx.Graph()
    for idx, pid in enumerate(p_t):
        G.add_node(pid)
        pid_idx[pid]=idx
        idx_pid[idx]=pid
    # Explicit row/column order for every adjacency matrix below, so that
    # matrix index k always refers to publication idx_pid[k].
    node_order = [idx_pid[k] for k in range(len(idx_pid))]
    n_nodes = len(node_order)

    ## CoAuthor (repeated pairs accumulate weight)
    Ga = _load_pair_graph("experimental-results/authors/" + fname + "_authorlist.txt", idx_pid, True)

    ## CoVenue (weight is always 1)
    Gv = _load_pair_graph("experimental-results/" + fname + "_jconfpair.txt", idx_pid, False)

    ## CoTitle: connect publications sharing at least two title stems
    Gt = nx.Graph()
    Gt.add_nodes_from(node_order)
    title_sets = [(pid, set(stems)) for pid, stems in p_t.items()]
    for first, (keyi, stems_i) in enumerate(title_sets):
        for keyj, stems_j in title_sets[first + 1:]:
            weights = len(stems_i & stems_j)
            if weights>=2:
                Gt.add_edge(keyi, keyj, weight=weights)

    Glist=[Ga, Gt, Gv]

    # Merge the three relation graphs into G by summing edge weights.
    for layer in Glist:
        for u, v, attrs in layer.edges(data=True):
            if G.has_edge(u,v):
                G[u][v]['weight'] = G[u][v]['weight'] + attrs['weight']
            else:
                G.add_edge(u, v, weight=attrs['weight'])
    Glist.append(G)

    # Sampling
    all_neighbor_samplings=[]
    all_neg_sampling=[]

    for Gi in Glist:
        adj_matrix = nx.adjacency_matrix(Gi, nodelist=node_order).toarray()

        # Hop distance between every reachable pair; 0 means "self or unreachable".
        hop_matrix = np.zeros((n_nodes, n_nodes))
        for src, reach in dict(nx.all_pairs_shortest_path_length(Gi)).items():
            for dst, hops in reach.items():
                hop_matrix[pid_idx[src], pid_idx[dst]] = hops

        neighbor_samplings = []
        neg_samplings=[]
        for node in range(n_nodes):
            # Positive sampler: neighbours weighted by edge weight.
            node_weights = adj_matrix[node]
            total_weight = np.sum(node_weights)
            if total_weight==0:
                neighbor_samplings.append(None)
            else:
                neighbor_samplings.append(AliasSampling(node_weights / total_weight))

            # Negative sampler: weighted by hop distance; unreachable nodes
            # count as distance 6, the node itself and its direct neighbours
            # (distance <= 1) are excluded.
            node_i_degrees = hop_matrix[node]
            node_i_degrees[node_i_degrees==0] = 6
            node_i_degrees[node]=0
            node_i_degrees[node_i_degrees<=1] = 0

            total_distance = np.sum(node_i_degrees)
            if total_distance==0:
                neg_samplings.append(None)
            else:
                neg_samplings.append(AliasSampling(node_i_degrees / total_distance))

        all_neighbor_samplings.append(neighbor_samplings)
        all_neg_sampling.append(neg_samplings)

    numwalks=4
    walklength=10
    negative_num=3

    u_i=[]
    u_j=[]
    label=[]
    # Indices into Glist: 0 = CoAuthor, 1 = CoTitle, 2 = CoVenue.
    metapath=[0,1,0,2]
    # Negatives are always drawn from the merged graph (last entry of Glist).
    merged_neg = all_neg_sampling[-1]

    for node_index in range(n_nodes):
        for walk in range(0, numwalks):
            node_start=node_index
            # Each walk starts at a different metapath position.
            g_index=walk % len(metapath)
            gi=metapath[g_index]
            for _ in range(0, walklength):
                step_sampler = all_neighbor_samplings[gi][node_start]
                if step_sampler is not None:
                    node_p = step_sampler.sampling()
                    u_i.append(node_start)
                    u_j.append(node_p)
                    label.append(1)
                    _append_negatives(merged_neg[node_start], node_start, negative_num, u_i, u_j, label)

                    g_index=(g_index+1)%len(metapath)
                    gi=metapath[g_index]

                    # Second-order positive: a neighbour of node_p in the
                    # next relation is also paired with node_start.
                    next_sampler = all_neighbor_samplings[gi][node_p]
                    if next_sampler is not None:
                        node_p1 = next_sampler.sampling()
                        u_i.append(node_start)
                        u_j.append(node_p1)
                        label.append(1)
                        _append_negatives(merged_neg[node_start], node_start, negative_num, u_i, u_j, label)

                    node_start = node_p

                else:
                    # Dead end in this relation: emit negatives only (if the
                    # node has any candidates) and move on to the next relation.
                    _append_negatives(merged_neg[node_start], node_start, negative_num, u_i, u_j, label)
                    g_index=(g_index+1)%len(metapath)
                    gi=metapath[g_index]

    # Training
    # Use the keyed-vectors object when the loaded model exposes one,
    # otherwise fall back to indexing the model itself.
    word_vectors = getattr(model_w, 'wv', model_w)
    # NOTE(review): 100 is the fallback dimension of the original script;
    # it is only used when the vectors do not expose `vector_size`.
    vector_dim = getattr(word_vectors, 'vector_size', 100)
    node_attr = []
    for pid in p_to:
        words_vec = [word_vectors[word] for word in p_to[pid] if word in word_vectors]
        if len(words_vec)==0:
            # No known word in the title: use a random vector in [-1, 1).
            words_vec.append(2*np.random.random(vector_dim)-1)
        node_attr.append(np.mean(words_vec,0))
    node_attr=np.array(node_attr)

    batch_size = 64
    total_batch = 3*int(len(u_i)/batch_size)
    display_batch = 100

    model = GCN(Glist, node_attr, batch_size=batch_size)

    avg_loss = 0.
    for i in range(total_batch):
        # Batches cycle through the sample lists, wrapping around at the end.
        sdx=(i*batch_size)%len(u_i)
        edx=((i+1)*batch_size)%len(u_i)
        if edx>sdx:
            u_ii = u_i[sdx:edx]
            u_jj = u_j[sdx:edx]
            labeli = label[sdx:edx]
        else:
            u_ii = u_i[sdx:]+u_i[0:edx]
            u_jj = u_j[sdx:]+u_j[0:edx]
            labeli = label[sdx:]+label[0:edx]
        loss= model.train_line(u_ii, u_jj, labeli)
        avg_loss += loss / display_batch
        if i % display_batch == 0 and i > 0:
            print ('%d/%d loss %8.6f' %(i,total_batch,avg_loss))
            avg_loss = 0.

    # Evaluating
    embed_matrix = model.cal_embed()
    labels = GHAC(embed_matrix,Glist[-1],idx_pid,len(set(correct_labels)))
    pairwise_precision, pairwise_recall, pairwise_f1 = pairwise_evaluate(correct_labels,labels)
    result.append([fname,pairwise_precision, pairwise_recall, pairwise_f1])
    print (correct_labels,len(set(correct_labels)))
    print (list(labels),len(set(list(labels))))
    print (fname,pairwise_precision, pairwise_recall, pairwise_f1)

    

# Macro-F1: unweighted mean of the per-file precision / recall / F1 rows
# collected in `result` (each row is [name, precision, recall, f1]).
Prec = 0
Rec = 0
F1 = 0
if result:
    # With no processed files the averages stay 0 instead of dividing by zero.
    Prec = sum(row[1] for row in result) / len(result)
    Rec = sum(row[2] for row in result) / len(result)
    F1 = sum(row[3] for row in result) / len(result)

save_csvpath = 'result/'
# Make sure the output directory exists before opening the report file.
os.makedirs(save_csvpath, exist_ok=True)
with open(save_csvpath+'AM_nok.csv','w',newline='',encoding = 'utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["name","Prec","Rec","F1"])
    # The average row comes first, followed by one row per processed file.
    writer.writerow(["Avg",Prec,Rec,F1])
    for row in result:
        writer.writerow(row[0:4])

print ("Avg",Prec,Rec,F1)