# GDELT Automorphisms via TSIR-GN

In [1]:
import pandas as pd 
import numpy as np
class loader:
    """Build a per-node temporal adjacency structure from a timestamped edge list.

    Node labels are mapped to consecutive integer ids (0, 1, 2, ...) in order
    of first appearance. After ``read`` finishes, ``G[node_id]`` is a list of
    ``(t, in_neighbours, out_neighbours)`` tuples sorted by ``t`` in
    descending order, where the neighbour entries are NumPy arrays of ids.

    NOTE: ``read`` finalises ``G`` through ``fixG`` (dicts become lists), and
    ``addEdge`` expects the dict form, so ``read`` can only be called once per
    instance.
    """

    def __init__(self):
        self.countID = 0  # next unused internal node id
        self.G = {}       # node id -> {t: {'in': set, 'out': set}} (list form after fixG)
        self.co = {}      # original label -> internal id
        self.revco = {}   # internal id -> original label

    def nodeID(self, x):
        """Return the internal id of label ``x``, allocating a new id on first use."""
        if x not in self.co:
            self.co[x] = self.countID
            self.revco[self.countID] = x
            self.countID += 1
        return self.co[x]

    def read(self, file, mint, maxt):
        """Load every edge whose timestamp ``t`` satisfies ``mint < t <= maxt``.

        Args:
            file: despite the name, an object exposing ``.values`` as a 2-D
                array (e.g. a pandas DataFrame); columns 0, 1 and 2 are read
                positionally as source, destination and time.
            mint: exclusive lower bound on the timestamp.
            maxt: inclusive upper bound on the timestamp.
        """
        x = file.values
        for a in range(x.shape[0]):
            # Convert the timestamp once per row instead of up to three times.
            t = float(x[a, 2])
            if mint < t <= maxt:
                i = self.nodeID(x[a, 0])
                j = self.nodeID(x[a, 1])
                self.addEdge((i, j, t))
        self.fixG()

    def storeEmb(self, file, data):
        """Write one line per row of ``data``: original node label, then the row values.

        Fields are separated by single spaces. The label is written as
        ``int(label)``, so labels must be convertible to ``int``.

        Args:
            file: path of the output file (overwritten).
            data: 2-D array whose row ``a`` belongs to internal node id ``a``.
        """
        # The context manager closes the handle even if a write raises.
        with open(file, 'w') as out:
            for a in range(data.shape[0]):
                fields = [str(int(self.revco[a]))]
                fields.extend(str(data[a, b]) for b in range(data.shape[1]))
                out.write(' '.join(fields) + "\n")

    def fixG(self):
        """Replace each node's ``{t: {'in', 'out'}}`` dict by a time-descending list.

        Every entry becomes ``(t, in_array, out_array)``. Relies on node ids
        being exactly ``0 .. len(G) - 1``, which ``nodeID`` guarantees.
        """
        for g in range(len(self.G)):
            by_time = self.G[g]
            self.G[g] = [
                (t, np.array(list(by_time[t]['in'])), np.array(list(by_time[t]['out'])))
                for t in sorted(by_time, reverse=True)
            ]

    def addEdge(self, s):
        """Record the directed edge ``s = (src_id, dst_id, t)`` on both endpoints."""
        (l1, l2, t) = s
        src_slot = self.G.setdefault(l1, {}).setdefault(t, {'out': set(), 'in': set()})
        dst_slot = self.G.setdefault(l2, {}).setdefault(t, {'out': set(), 'in': set()})
        src_slot['out'].add(l2)
        dst_slot['in'].add(l1)

In [2]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.metrics import silhouette_score,calinski_harabasz_score,davies_bouldin_score
from sklearn.decomposition import PCA,IncrementalPCA,TruncatedSVD



def dirtemporalAggregation1(embd,G,v,alpha):
    """Aggregate the neighbour embeddings of node ``v`` over its timestamped events.

    ``G[v]`` is a sequence of ``(t, in_neighbours, out_neighbours)`` entries.
    For each entry the rows of ``embd`` for the in- and out-neighbours are
    summed separately and concatenated into a vector of length ``2k``
    (``k = embd.shape[1]``).

    Returns a ``(1, 4*k*k + 2*k)`` array: the flattened ``2k x 2k`` matrix of
    decayed pairwise products between each entry's vector and the running,
    exponentially weighted sum of the preceding entries' vectors, followed by
    the plain sum of all per-entry vectors.
    """
    dim = embd.shape[1]
    width = 2 * dim
    events = G[v]

    # One concatenated [in-sum, out-sum] vector per event.
    per_event = []
    for _, in_nbrs, out_nbrs in events:
        in_sum = np.zeros((dim,))
        for node in in_nbrs:
            in_sum += embd[node, :]
        out_sum = np.zeros((dim,))
        for node in out_nbrs:
            out_sum += embd[node, :]
        per_event.append(np.hstack([in_sum, out_sum]))

    # Plain sum over all events.
    total = np.zeros((1, width))
    for vec in per_event:
        total += vec

    # Running carry of earlier events, rescaled by exp(dt / alpha) at each step,
    # multiplied (outer product) with the current event's vector.
    pair_acc = np.zeros((width, width))
    carry = np.zeros((1, width))
    for idx in range(1, len(events)):
        dt = events[idx][0] - events[idx - 1][0]
        carry = np.exp(dt / alpha) * (per_event[idx - 1].reshape((1, width)) + carry)
        pair_acc += per_event[idx].reshape((width, 1)) * carry

    return np.hstack([pair_acc.reshape((1, width * width)), total])

def temporalAggregation2(embd,G,v,alpha):
    """Sum the embedding rows of every neighbour of ``v`` across all its events.

    Each entry of ``G[v]`` is unpacked as a 2-tuple ``(t, neighbours)``; note
    that this differs from the 3-tuples produced by ``loader.fixG``. ``alpha``
    is accepted but not used. Returns a ``(1, k)`` array with
    ``k = embd.shape[1]``.
    """
    total = np.zeros((1, embd.shape[1]))
    for _, neighbours in G[v]:
        for node in neighbours:
            total += embd[node, :]
    return total
    
def dirtemporalAggregation(embd,G,alpha):
    """Stack the ``dirtemporalAggregation1`` row of every node ``0 .. len(G)-1``.

    Returns an array with one row per node, in node-id order.
    """
    rows = [dirtemporalAggregation1(embd, G, node, alpha) for node in range(len(G))]
    return np.vstack(rows)

def getnumber(emb):
    """Return the number of distinct rows of the 2-D array ``emb``.

    Rows are compared through the string form of their elements, so two rows
    count as equal exactly when every element prints identically.
    """
    n_rows = emb.shape[0]
    n_cols = emb.shape[1]
    signatures = {
        ''.join(',' + str(emb[r, c]) for c in range(n_cols))
        for r in range(n_rows)
    }
    return len(signatures)

def dirtemporalSirGN(G,n,alpha,iter=10):
    """Run a fixed number of cluster-and-aggregate refinement rounds.

    Starts from a uniform ``1/n`` embedding for each of the ``len(G)`` nodes
    and aggregates it with ``dirtemporalAggregation``. Each round then:
    min-max scales the current representation, fits ``KMeans`` with ``n``
    clusters, converts the per-cluster values from ``KMeans.transform`` into
    row-normalised weights (largest value -> 0, smallest -> largest weight),
    and aggregates those weights again.

    Args:
        G: per-node temporal adjacency (as produced by ``loader``).
        n: number of clusters, and width of the intermediate embedding.
        alpha: time-decay scale passed to the aggregation.
        iter: number of rounds to run.

    Returns:
        The aggregated representation after the last round.
    """
    nv = len(G)
    start = np.array([[1/n for _ in range(n)] for _ in range(nv)])
    emb = dirtemporalAggregation(start, G, alpha)
    for step in range(iter):
        print(step)
        scaled = MinMaxScaler().fit_transform(emb)
        dist = KMeans(n_clusters=n, random_state=1).fit(scaled).transform(scaled)
        row_max = dist.max(axis=1)
        row_min = dist.min(axis=1)
        # Rescale each row so the largest value maps to 0 and the smallest to 1 ...
        weights = (row_max.reshape(nv, 1) - dist) / (row_max - row_min).reshape(nv, 1)
        # ... then normalise each row to sum to 1.
        weights = weights / weights.sum(axis=1).reshape(nv, 1)
        emb = dirtemporalAggregation(weights, G, alpha)
    return emb

def dirtemporalSirGNStop(G,n,alpha,iter=100):
    """Refine like ``dirtemporalSirGN`` but stop once rounds stop separating nodes.

    After every round the number of distinct rows (``getnumber``) of the new
    representation is compared with the best count so far. If it did not
    strictly increase, the loop ends and the previously accepted
    representation is returned; otherwise the new one is accepted.

    Args:
        G: per-node temporal adjacency (as produced by ``loader``).
        n: number of clusters, and width of the intermediate embedding.
        alpha: time-decay scale passed to the aggregation.
        iter: maximum number of rounds.

    Returns:
        The last accepted aggregated representation.
    """
    nv = len(G)
    start = np.array([[1/n for _ in range(n)] for _ in range(nv)])
    emb = dirtemporalAggregation(start, G, alpha)
    count = getnumber(emb)
    print('count',count)
    for step in range(iter):
        print(step)
        scaled = MinMaxScaler().fit_transform(emb)
        dist = KMeans(n_clusters=n, random_state=1).fit(scaled).transform(scaled)
        row_max = dist.max(axis=1)
        row_min = dist.min(axis=1)
        # Rescale each row so the largest value maps to 0 and the smallest to 1 ...
        weights = (row_max.reshape(nv, 1) - dist) / (row_max - row_min).reshape(nv, 1)
        # ... then normalise each row to sum to 1.
        weights = weights / weights.sum(axis=1).reshape(nv, 1)
        candidate = dirtemporalAggregation(weights, G, alpha)
        candidate_count = getnumber(candidate)
        print('count',candidate_count)
        if candidate_count <= count:
            # No additional distinct rows: keep the previous representation.
            break
        emb = candidate
        count = candidate_count
    return emb


In [3]:
def load_graph(file):
    """Build an undirected graph from the edges whose time lies in ``(mint, maxt]``.

    Args:
        file: object exposing ``.values`` as a 2-D array (e.g. a pandas
            DataFrame); columns 0, 1 and 2 are read positionally as source,
            destination and time.

    Returns:
        An ``nx.Graph`` containing one edge per selected row.

    NOTE(review): ``nx``, ``mint`` and ``maxt`` are not defined in this
    function; they must exist as module-level names when it is called
    (``mint``/``maxt`` are assigned in a later cell, and no import that binds
    ``nx`` is visible here -- presumably ``import networkx as nx``; confirm).
    """
    data = file.values
    G = nx.Graph()

    for row in data:
        # The original indexed the time column with an undefined name ``a``
        # instead of the loop index, raising NameError on the first row.
        t = float(row[2])
        if mint < t <= maxt:
            G.add_edge(row[0], row[1])
    return G

In [4]:
# Load the GDELT edge list into a DataFrame. The path is relative to the
# notebook's working directory.
gdelt = pd.read_csv('../Data/edges.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'edges.csv'

In [6]:
# Drop the columns that are not needed, in place. loader.read indexes columns
# positionally (0, 1, 2), so only source, destination and time may remain.
gdelt.drop(columns=['Unnamed: 0', 'int_roll', 'ext_roll', 'Unnamed: 0.1'], inplace=True)

In [7]:
# Print the column dtypes, row count and memory usage of the trimmed edge list.
gdelt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191290882 entries, 0 to 191290881
Data columns (total 3 columns):
 #   Column  Dtype  
---  ------  -----  
 0   src     int64  
 1   dst     int64  
 2   time    float64
dtypes: float64(1), int64(2)
memory usage: 4.3 GB


In [8]:
def year(i):
    """Return ``i`` years as an integer count of ``365 * 24 * 4`` steps per year.

    That is 365 days x 24 hours x 4 steps per hour (presumably 15-minute
    intervals -- confirm against the time column's unit). The result is
    truncated towards zero by ``int``.
    """
    steps = i * 365 * 24 * 4
    return int(steps)

In [9]:
# Restrict the analysis to edges with year(2) < time <= year(4) and build the
# per-node temporal adjacency structure for that window.
mint = year(2)
maxt = year(4)
l=loader()
print("running loader gdelt")
# read() walks the rows one by one in a Python loop; the recorded run of this
# cell ended in a KeyboardInterrupt.
l.read(gdelt, mint, maxt)

running loader gdelt


KeyboardInterrupt: 

In [None]:
print("running embedding gdelt")
# n=10 clusters, alpha=1 as the time-decay scale; iter keeps its default of
# 100 rounds, with early stopping when the distinct-row count stops growing.
emb=dirtemporalSirGNStop(l.G,10,1)

In [None]:
# Group nodes by identical embedding row and count the groups of size one.
print("converting embedding to dataframe" )
embdf = pd.DataFrame(emb)

print("extracting value counts")
# One entry per distinct row: the number of nodes sharing that row.
counts = np.array(embdf.value_counts())

print("extracting single embeddings")
# Number of rows that occur exactly once.
singles = int(np.count_nonzero(counts == 1))
print("singles: ", singles)

In [None]:
import matplotlib.pyplot as plt

# Broken-y-axis histogram of group sizes: the same histogram is drawn on two
# vertically stacked axes that share the x axis but show different y ranges.
f, (ax, ax2) = plt.subplots(2, 1, sharex=True)

# plot the same data on both axes
ax.hist(counts, bins=18, edgecolor='black')
ax2.hist(counts, bins=18, edgecolor='black')

# zoom-in / limit the view to different portions of the data:
# the upper axes show the tall bars, the lower axes the short ones
ax.set_ylim(100, 15000)  
ax2.set_ylim(0, 50)  

# hide the spines between ax and ax2 so the two panels read as one plot
ax.spines['bottom'].set_visible(False)
ax2.spines['top'].set_visible(False)
ax.xaxis.tick_top()
ax.tick_params(labeltop=False)  # don't put tick labels at the top
ax2.xaxis.tick_bottom()


# half-length of the diagonal break marks, in axes-fraction coordinates
d = .015  

# draw the break marks in the upper axes' coordinate system, outside the
# axes area (clip_on=False)
kwargs = dict(transform=ax.transAxes, color='k', clip_on=False)
ax.plot((-d, +d), (-d, +d), **kwargs)        # top-left diagonal
ax.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # top-right diagonal

kwargs.update(transform=ax2.transAxes)  # switch to the bottom axes
ax2.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # bottom-left diagonal
ax2.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # bottom-right diagonal


#ax.set_title('Distribution of Sizes of Automorphic Groups')
# pyplot-level labels apply to the current axes
plt.xlabel('Size of Automorphic Group (# nodes)')
plt.ylabel('Count')
plt.show()
#plt.savefig('GDELTDistg.pdf', bbox_inches='tight')