In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
from scipy.sparse import csgraph
import networkx as nx

In [6]:
df = pd.read_csv('data/train.csv', names=['row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text', 'text_b_text', 'target'])
targetsss = df.target

### Method to choose k according to the paper in the section 4.1.4- Other Methods for Graph Similarity

In [7]:
def choose_k(lbda_list):
    lbda_sum = np.sum(lbda_list)
    n = len(lbda_list)
    k_best = n
    
    for k in range(n):
        val = np.sum(lbda_list[:k]) / lbda_sum
        
        if(val > 0.9):
            k_best = k
            return k_best
    
    return k_best

### Build the adjency graph

In [8]:
def build_adjency_graph(g, sent):
    
    w = 2
    le = len(sent)
    
    for idx, val in enumerate(sent):
        if idx < le - w:
            source = mapIdxWord[val]
            target1 = mapIdxWord[sent[idx + 1]]
            g.add_edge(source, target1)
    
    return g

### Run the code to get the similatiries

In [9]:
similarities = []
for val in df.values:

    wt1 = nltk.word_tokenize(val[3])
    wt2 = nltk.word_tokenize(val[4])
    
    mergeLst = list(set(wt1 + wt2))

    mapIdxWord = {}
    for idx, val in enumerate(mergeLst):
        mapIdxWord[val] = idx

    g1 = nx.DiGraph()
    g2 = nx.DiGraph()
    H = nx.path_graph(len(mergeLst))
    
    g1.add_nodes_from(H)
    g2.add_nodes_from(H)

    g1 = build_adjency_graph(g1, wt1)
    g2 = build_adjency_graph(g2, wt2)
    
    ls1 = nx.directed_laplacian_matrix(g1)
    ls2 = nx.directed_laplacian_matrix(g2)
    
    lbda1, _ = np.linalg.eig(ls1)
    lbda2, _ = np.linalg.eig(ls2)
   
    lbda1 = np.sort(lbda1)[::-1]
    lbda2 = np.sort(lbda2)[::-1]
    
    similarities.append(np.sum((lbda1 - lbda2)**2))

### Display the similarities

In [10]:
for a, b in zip(targetsss.values, similarities):
    print(a, "   ", b)

0     0.6516433079966009
0     0.132137478179229
1     0.4590518275337406
1     1.5062312909063694e-29
0     0.05062379305417762
1     6.005819938577156e-29
0     0.02762384849592552
0     0.026003288296439695
1     9.83919089988551e-30
1     0.04330417640408943
0     0.29244375464744904
1     1.2671078290112502e-29
0     0.03299612220402279
0     0.10510814479308946
0     0.2132882655025808
0     1.342296134040128e-29
0     (0.5929128989129447-4.930380657631325e-32j)
1     0.07182564760735326
1     0.04566932996006655
1     0.6104738958540931
1     3.286406857102379e-29
1     0.8483683330878308
1     1.5066187735551815
1     0.6693444466295548
1     0.4249577828149503
0     (0.2696330782245958-7.741927174952217e-18j)
1     0.029985876357585994
1     0.08217933120157578
1     7.005146468120805e-29
1     0.03528132886709741
0     0.17500493128813013
1     0.08833522127739411
0     0.7355994770824724
1     0.3683849004858942
1     1.0661555082206249
1     0.1664341403125947
1     0.04860