### GNN

In [None]:
import json
import os

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import plotly.express as px
import torch
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GAE, VGAE

from GNN.dataset import GraphData
from GNN.autoencoder import VariationalLinearEncoder, VariationalGCNEncoder, LinearEncoder, GCNEncoder

In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_TEST_GRAPHS = 9999   # how many test graphs are requested from the dataset
NODES_PER_GRAPH = 71     # middle dimension used when reshaping the encoder output

dataset = GraphData(root="./data")
in_channels, out_channels = dataset.num_features, 128

# The checkpoint is loaded as a complete model object (it is used directly via
# .eval() and .encoder), so no model needs to be constructed beforehand.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source. Recent PyTorch releases may require weights_only=False to
# load a fully pickled model -- confirm against the installed version.
model = torch.load('checkpoint/model.pt', map_location=device)
model.eval()
encoder = model.encoder

graph_embeddings = []
files = []

# Inference only: no autograd graph is needed.
with torch.no_grad():
    for i in range(NUM_TEST_GRAPHS):
        batch, file_name = dataset.get_test(i)
        batch = batch.to(device)
        try:
            out = encoder(batch.x, batch.edge_index)
        except Exception as exc:
            # Report which graph failed and why, then move on to the next one.
            print(f"{file_name}: skipped ({exc!r})")
            continue

        d = out.view(-1, NODES_PER_GRAPH, out_channels)

        # Both rows of edge_index flattened into one list of node indices.
        edge_indexes = batch.edge_index.flatten()

        # Sum the embeddings gathered at every edge endpoint. A node therefore
        # contributes once per occurrence in edge_index, not once per node.
        # NOTE(review): confirm this degree-weighted sum is the intended pooling.
        x = torch.sum(d[:, edge_indexes, :], dim=1)

        graph_embeddings.append(x)
        # Record the file name only for graphs that produced an embedding so
        # that `files` and `graph_embeddings` stay index-aligned.
        files.append(file_name)

graph_embeddings = torch.stack(graph_embeddings).detach().cpu().squeeze()

In [None]:
N_CLUSTERS = 6   # number of agglomerative clusters
TSNE_SEED = 42   # fixed seed so the 2-D projection is the same on every run

# Cluster the graph embeddings in their original space.
clustering = AgglomerativeClustering(n_clusters=N_CLUSTERS).fit(graph_embeddings)
clustering_labels = clustering.labels_

# Project the embeddings to two dimensions for plotting only; the cluster
# labels above are computed on the full embeddings, not on this projection.
dim_reduced = TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(graph_embeddings)

prj = pd.DataFrame(dim_reduced, columns=['X-axis', 'Y-axis'])

# One point per embedding, coloured by cluster label, with the entry of
# `files` at the same position shown on hover.
fig = px.scatter(
    prj, x='X-axis', y='Y-axis',
    hover_name=files,
    color=clustering_labels,
    labels={'color': 'class'},
    height=2000,
    width=2000,
)

fig.update_traces(marker_size=10)
fig.show()
fig.write_html(f"Sentences_clustered_agglo_{N_CLUSTERS}.html")


In [None]:
# Three internal cluster-validation metrics, all computed on the embeddings
# and the labels assigned to them.
db_score = davies_bouldin_score(graph_embeddings, clustering_labels)
sil = silhouette_score(graph_embeddings, clustering_labels)
cal_har = calinski_harabasz_score(graph_embeddings, clustering_labels)

report_lines = [
    "Cluster Validation score",
    f" Calinksi Harabasz Score : {cal_har}",
    f" Silhoutte score : {sil}",
    f" davies_bouldin_score : {db_score}",
]
print("\n".join(report_lines))

In [None]:
# Map each cluster id (0..5) to the positions of the embeddings assigned to it.
clusters = {cluster_id: [] for cluster_id in range(6)}

for position, label in enumerate(clustering_labels):
    clusters[label].append(position)


In [None]:
# Read all sentences once; each list entry keeps its trailing newline.
with open('./sentences.txt', 'r') as fp:
    sentences_ = fp.readlines()

# Create the output directory up front so the writes below do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs('./clustered_sentences', exist_ok=True)

# Write one text file per cluster containing the sentences at the positions
# stored in `clusters`.
# NOTE(review): this assumes position i in `clusters` refers to line i of
# sentences.txt. That holds only if no graph was skipped while the embeddings
# were being computed -- confirm before relying on these files.
for k in clusters.keys():
    with open('./clustered_sentences/cluster_'+str(k)+'.txt', 'w') as dp:
        dp.writelines(sentences_[vals] for vals in clusters[k])


In [None]:
# Group the entries of `files` by the cluster label found at the same position.
clustered_sentences_indices = {}

for position, label in enumerate(clustering_labels):
  clustered_sentences_indices.setdefault(label, []).append(files[position])

In [None]:
# Load the mapping stored in POS_TO_ID.json into `pos_to_id`.
pos_json_path = '/content/GraphEmbeddings/POS_TO_ID.json'
with open(pos_json_path) as pos_file:
  pos_to_id = json.load(pos_file)

In [None]:
# Invert `pos_to_id` so that its values become the lookup keys.
id_to_pos = {}
for pos_name, pos_id in pos_to_id.items():
  id_to_pos[pos_id] = pos_name

In [None]:
def draw_graphs(f):
  """Draw the directed graph described by the raw edge-list file for ``f``.

  Everything from the first ``'.pt'`` onwards is stripped from ``f`` and the
  remainder is appended to ``/content/GraphEmbeddings/data/raw/`` to locate
  the file. Each non-blank line must start with two whitespace-separated
  integers, read as one directed edge ``first -> second``. Repeated edges are
  drawn once. Every node is labelled with ``id_to_pos[node_id]``.

  The graph is drawn once, after the whole file has been read, through
  ``nx.draw``; create the target figure before calling this function.

  Args:
    f: File name of the graph, with or without a ``.pt`` suffix.

  Raises:
    FileNotFoundError: If the raw edge-list file does not exist.
    ValueError: If a line does not start with two integers.
    KeyError: If a node id has no entry in ``id_to_pos``.
  """
  f = f.split('.pt')[0]

  data = []
  multi_edge_detector = set()
  label_dict = {}

  # The context manager closes the file even if parsing raises.
  with open('/content/GraphEmbeddings/data/raw/' + f) as fp:
    for edges in fp:
      parts = edges.split()
      if not parts:
        continue  # skip blank lines instead of crashing on them

      # Unpacking raises ValueError when fewer than two tokens are present.
      first, second = parts[:2]
      e_1, e_2 = int(first), int(second)

      # De-duplicate on the parsed edge so that differences in whitespace or
      # a missing final newline cannot hide a repeated edge.
      if (e_1, e_2) in multi_edge_detector:
        continue
      multi_edge_detector.add((e_1, e_2))

      label_dict[e_1] = id_to_pos[e_1]
      label_dict[e_2] = id_to_pos[e_2]

      data.append((e_1, e_2))

  # Draw exactly once, after all edges are collected. Drawing inside the loop
  # would paint a new partial graph over the same figure for every edge.
  nx.draw(nx.DiGraph(data), with_labels=True, labels=label_dict)
    

In [None]:
cluster_n = 4
c = 0

# Render and save a figure for at most the first five graphs of the cluster;
# `c` ends up holding the number of figures produced.
for c, i in enumerate(clustered_sentences_indices[cluster_n][:5], start=1):
  plt.figure(figsize=(10, 10))
  draw_graphs(i)
  plt.savefig(f'cluster_{cluster_n}_fig_{i}.jpg')