In [None]:
# conda install -c conda-forge spacy


In [None]:
# !python -m spacy download en

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm
import pickle
from collections import Counter
import spacy
from sklearn.metrics import silhouette_score
from spacy import displacy
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')

In [2]:
# Load the training pairs (triple string in `input_text`, reference text in
# `target_text`) and preview the first rows.
train_triplet_to_text = pd.read_csv(filepath_or_buffer='train_triplet_to_text_2.csv')
train_triplet_to_text.head(n=5)

Unnamed: 0,input_text,target_text
0,"Allen_Forrest | birthPlace | ""Fort Campbell, K...","Allen Forrest was born in Fort Campbell, KY an..."
1,Akron_Summit_Assault | ground | St._Vincent–St...,The ground of Akron Summit Assault is in St Vi...
2,Addis_Ababa_City_Hall | buildingStartDate | 19...,The Addis Ababa City Hall was built in 1961 an...
3,ALCO_RS-3 | builder | Montreal_Locomotive_Work...,The ALCO RS-3 has a V12 engine and is 17068.8 ...
4,Atlantic_City_International_Airport | operatin...,"The Port Authority of New York and New Jersey,..."


In [3]:
# Reference texts only.
train_triplet_to_text.loc[:, "target_text"]

0        Allen Forrest was born in Fort Campbell, KY an...
1        The ground of Akron Summit Assault is in St Vi...
2        The Addis Ababa City Hall was built in 1961 an...
3        The ALCO RS-3 has a V12 engine and is 17068.8 ...
4        The Port Authority of New York and New Jersey,...
                               ...                        
28381    Antwerp International Airport serves the city ...
28382    Aaron Hunt has played for, Viktor Skrypnyk man...
28383    There are 600 students at the Accademia di Arc...
28384                  Alberto Teisaire is a Rear Admiral.
28385    Hüseyin Bütüner and Hilmi Güner designed the B...
Name: target_text, Length: 28386, dtype: object

In [4]:
# Map every reference text to the list of triples it verbalises; "&&" is the
# separator between triples inside `input_text`. A text that occurs more than
# once keeps the triples of its last occurrence.
text_to_tuples_dict = {
    target: source.split("&&")
    for source, target in zip(train_triplet_to_text['input_text'],
                              train_triplet_to_text['target_text'])
}

In [None]:
# Regex fragments used by split_into_sentences. Raw strings keep regex escapes
# such as \s from being interpreted as (invalid) Python string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split ``text`` into sentences with a rule-based heuristic.

    Periods that belong to titles, initials, acronyms, company suffixes and
    web domains are temporarily replaced by ``<prd>`` so that only genuine
    sentence terminators (``.``, ``?``, ``!``) produce a split.

    Args:
        text: Input text; it is converted with ``str()`` first.

    Returns:
        list[str]: Stripped sentences in order. Text after the last
        terminator is kept as a final sentence when it is non-empty, so input
        without terminal punctuation is not lost.
    """
    text = str(text)
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that do not end a sentence.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym or suffix directly followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [piece.strip() for piece in text.split("<stop>")]
    # The piece after the last <stop> is only padding when the text ends with a
    # terminator, but it is a real sentence when the text has none; drop it
    # only when it is empty.
    if not sentences[-1]:
        sentences.pop()
    return sentences

In [None]:
# master_dict layout -- {text : {'split_sentences': [...], 'tuples': [...]}}
master_dict = {}
for text, tuples in tqdm(list(text_to_tuples_dict.items()), position = 0, desc = "Progress"):
    master_dict[text] = {
        'split_sentences': split_into_sentences(text),
        'tuples': tuples,
    }

In [None]:
# Inspect the entry stored for the first text.
list(master_dict.values())[0]

In [None]:
from pathlib import Path

def show_dependency_graph(doc, save_file = False):
    """Print a dependency table for ``doc`` and draw its dependency parse inline.

    Args:
        doc: Parsed document whose tokens expose ``text``, ``dep_``, ``head``
            and ``children`` (the notebook passes spaCy ``Doc`` objects).
        save_file: When True, additionally write the rendered parse to
            ``dependency_plot.svg`` in the current working directory.
    """
    row_format = "{:<15} | {:<8} | {:<15} | {:<20}"
    print (row_format.format('Token','Relation','Head', 'Children'))
    print ("-" * 70)

    for token in doc:
        # Print the token, dependency nature, head and all dependents of the token
        print (row_format.format(str(token.text), str(token.dep_),
                                 str(token.head.text), str(list(token.children))))

    render_options = {'distance': 100}
    # Use displaCy to visualize the dependency parse inline. In jupyter mode the
    # figure is displayed directly and no markup is handed back to the caller.
    displacy.render(doc, style='dep', jupyter=True, options=render_options)
    if save_file:
        # Render a second time with jupyter=False to obtain the SVG markup as a
        # string; write_text opens, writes and closes the file.
        svg_markup = displacy.render(doc, style='dep', jupyter=False, options=render_options)
        Path("dependency_plot.svg").write_text(svg_markup, encoding="utf-8")
    
    




In [None]:
# Parse every split sentence and store, per sentence, a list whose first
# element is the parsed Doc followed by one record per token.
nlp = spacy.load("en_core_web_sm")

for key in tqdm(list(master_dict.keys()), position = 0, desc = "Progress : "):
    dependency_relations = []
    for sentence in master_dict[key]['split_sentences']:
        doc = nlp(sentence)
        token_records = [
            {
                'text': token.text,
                'Relation': token.dep_,
                'Head': token.head.text,
                'Children': list(token.children),
            }
            for token in doc
        ]
        dependency_relations.append([doc] + token_records)
    master_dict[key]['dependency_relations'] = dependency_relations
        
#         show_dependency_graph(doc)
        
    
        

In [None]:
# Pick one text by position and look at its stored dependency relations.
sample_idx = 1
sample_text = list(master_dict)[sample_idx]
master_dict[sample_text]['dependency_relations']

In [None]:
# Visualise the first sentence of the sample text; element [0] of each
# sentence entry is the parsed Doc.
show_dependency_graph(
    master_dict[list(master_dict)[sample_idx]]['dependency_relations'][0][0],
    save_file=False,
)

# Clustering Test on random sentences

In [None]:
# conda install -c conda-forge sentence-transformers 

In [None]:
# !pip install sentence-transformers

In [5]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dakshthapar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def cosine(u, v):
    """Return the cosine similarity of ``u`` and ``v``: dot product over the product of their norms."""
    inner_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return inner_product / norm_product

In [None]:
# Small hand-written sample used to sanity-check the clustering pipeline.
sentences = [
    'A man is eating food.',
    'A man is eating a piece of bread.',
    'A man is eating pasta.',
    'The girl is carrying a baby.',
    'The baby is carried by the woman',
    'A man is riding a horse.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.',
    'Someone in a gorilla costume is playing a set of drums.',
    'A cheetah is running behind its prey.',
    'A cheetah chases prey on across a field.',
]

# Lower-case and tokenize each sentence.
tokenized_sent = [word_tokenize(sentence.lower()) for sentence in sentences]
tokenized_sent

In [7]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
# Sentence-embedding model shared by all clustering cells below.
sbert_model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

In [None]:
# Embed the demo sentences and show the first vector as a sample.
sentence_embeddings = sbert_model.encode(sentences)
first_embedding = sentence_embeddings[0]
print('Sample BERT embedding vector - length', len(first_embedding))
print('Sample BERT embedding vector - note includes negative values', first_embedding)

In [None]:
# Number of embedded sentences alongside the full embedding shape.
sentence_embeddings.shape[0], sentence_embeddings.shape

In [None]:
# Perform k-means clustering on the demo sentence embeddings.
num_clusters = 5
# Fixed seed: KMeans initialisation is random, so without it the printed
# clusters change from run to run. Matches the random_state=0 used by the
# other KMeans cells in this notebook.
clustering_model = KMeans(n_clusters=num_clusters, random_state=0)
clustering_model.fit(sentence_embeddings)
cluster_assignment = clustering_model.labels_

# Group the sentences by their assigned cluster label.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence, cluster_id in zip(sentences, cluster_assignment):
    clustered_sentences[cluster_id].append(sentence)

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

# Clustering Test on WebNLG data


In [None]:
sentences = list(train_triplet_to_text["target_text"])

# Tokenization of each document
tokenized_sent = []
skipped_count = 0
for s in tqdm(sentences):
    try:
        tokenized_sent.append(word_tokenize(s.lower()))
    except AttributeError:
        # Entries without .lower() (presumably missing values read as NaN --
        # TODO confirm) are skipped. Any other failure, such as a missing
        # tokenizer resource or a keyboard interrupt, is no longer hidden.
        skipped_count += 1
if skipped_count:
    print("Skipped", skipped_count, "non-string entries during tokenization")

sentence_embeddings = sbert_model.encode(sentences)
print(len(sentence_embeddings), sentence_embeddings.shape)


100%|██████████████████████████████████| 28386/28386 [00:02<00:00, 11010.57it/s]


In [None]:
# Cluster the WebNLG sentence embeddings into 50 groups (seeded for repeatability).
num_clusters = 50
clustering_model = KMeans(n_clusters=num_clusters,random_state=0)
clustering_model.fit(sentence_embeddings)
cluster_assignment = clustering_model.labels_

# Bucket the sentences by assigned label.
clustered_sentences = [[] for _ in range(num_clusters)]
for member, label in zip(sentences, cluster_assignment):
    clustered_sentences[label].append(member)

for cluster_number, members in enumerate(clustered_sentences, start=1):
    print("Cluster ", cluster_number)
    print(members)
    print("")

In [None]:
# matplotlib is otherwise only imported further down the notebook; import it
# here so this cell does not raise NameError on a fresh top-to-bottom run.
import matplotlib.pyplot as plt

# 1-based cluster numbers and the number of sentences in each cluster.
clust_num = list(range(1, len(clustered_sentences) + 1))
clust_len = [len(cluster) for cluster in clustered_sentences]

plt.plot(clust_num,clust_len)
plt.xlabel('Cluster number')
plt.ylabel('Cluster size')
plt.show()

In [None]:
# Cluster label per sentence, taken from the most recent KMeans fit (labels_).
cluster_assignment

In [None]:
# matplotlib is otherwise only imported further down the notebook; import it
# here so this cell does not raise NameError on a fresh top-to-bottom run.
import matplotlib.pyplot as plt

# Candidate values of k: 300, 500, 700, 900.
range_n_clusters = list(range(300,1000,200))
silhouette_avg = []

for num_clusters in range_n_clusters:
    # Fit k-means for this k. Note that num_clusters, clustering_model and
    # cluster_assignment are overwritten on every pass, so after the loop they
    # hold the results for the last k; the scatter-plot cell reuses them.
    clustering_model = KMeans(n_clusters=num_clusters,random_state=0).fit(sentence_embeddings)
    cluster_assignment = clustering_model.labels_

    # Mean silhouette score over all sentences for this k.
    silhouette_avg.append(silhouette_score(sentence_embeddings, cluster_assignment))

plt.plot(range_n_clusters,silhouette_avg,'bx-')
plt.xlabel('Values of K')
plt.ylabel('Silhouette score')
plt.title('Silhouette analysis For Optimal k')
plt.show()

In [None]:
# Check the container type and shape of the embeddings before scaling/PCA.
type(sentence_embeddings),sentence_embeddings.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Standardise the embeddings, then project them onto two principal
# components for plotting.
scaler = StandardScaler()
S = scaler.fit_transform(sentence_embeddings)

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(S)
principalComponents.shape

In [None]:
# First five rows of the 2-component PCA projection.
principalComponents[:5]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
import numpy as np

In [None]:
# Scatter the 2-D PCA projection, coloured by the current cluster labels.
pc_first = principalComponents[:, 0]
pc_second = principalComponents[:, 1]
plt.scatter(pc_first, pc_second, c=cluster_assignment, s=50, cmap='rainbow')
# plt.legend()
centers = clustering_model.cluster_centers_

In [None]:
# Number of cluster centres in the current model.
centers.shape[0]