In [1]:
# Standard library
import re

# Third-party
import nltk
from nltk.metrics import jaccard_distance, edit_distance
from nltk.parse.corenlp import CoreNLPDependencyParser
from scipy.stats import pearsonr

# Client for the dependency parser; it is pointed at a CoreNLP server on
# localhost:9000, which must already be running for the parsing cell to work.
parser = CoreNLPDependencyParser(url='http://localhost:9000')

In [2]:
# Load the raw trial corpus as a single string. Each line has the form
# "id<N>\t<sentence 1>\t<sentence 2>".
# The handle is called `corpus_file` (not `input`) so the builtin `input` is
# not rebound for the rest of the kernel session, and the encoding is explicit
# so the read does not depend on the platform's default locale.
with open('./given/trial/STS.input.txt', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()
corpus

'id1\tThe bird is bathing in the sink.\tBirdie is washing itself in the water basin.\nid2\tIn May 2010, the troops attempted to invade Kabul.\tThe US army invaded Kabul on May 7th last year, 2010.\nid3\tJohn said he is considered a witness but not a suspect.\t"He is not a suspect anymore." John said.\nid4\tThey flew out of the nest in groups.\tThey flew into the nest together.\nid5\tThe woman is playing the violin.\tThe young lady enjoys listening to the guitar.\nid6\tJohn went horse back riding at dawn with a whole group of friends.\tSunrise at dawn is a magnificent view to take in if you wake up early enough for it.\n'

## Divide the text into pairs of sentences
We split the text on each pair's id, which yields pairs of sentences separated by a tab; we then split each pair on that tab. Finally, we parse each sentence and compute its dependency triples.

In [3]:
# Separator that introduces each sentence pair: "id<digits><TAB>" at the start
# of a line. A raw string avoids the invalid "\d" escape, "\d+" accepts ids
# with more than one digit (id10, id11, ...), and anchoring with "^" prevents
# accidental splits on text such as "covid19<TAB>" inside a sentence.
pair_separator = re.compile(r'^id\d+\t', flags=re.MULTILINE)

# For every pair, parse both sentences with the CoreNLP dependency parser and
# keep the list of (governor, relation, dependent) triples of the first parse.
# Result shape: [[triples_of_sentence_1, triples_of_sentence_2], ...].
sentences = [
    [
        list(next(parser.raw_parse(sentence)).triples())
        for sentence in sentence_pair.strip().split('\t')
    ]
    for sentence_pair in pair_separator.split(corpus)
    if sentence_pair.strip()  # skip the empty chunk before the first id
]
sentences

[[[(('bathing', 'NN'), 'nsubj', ('bird', 'NN')),
   (('bird', 'NN'), 'det', ('The', 'DT')),
   (('bathing', 'NN'), 'cop', ('is', 'VBZ')),
   (('bathing', 'NN'), 'nmod', ('sink', 'NN')),
   (('sink', 'NN'), 'case', ('in', 'IN')),
   (('sink', 'NN'), 'det', ('the', 'DT')),
   (('bathing', 'NN'), 'punct', ('.', '.'))],
  [(('washing', 'VBG'), 'nsubj', ('Birdie', 'NNP')),
   (('washing', 'VBG'), 'aux', ('is', 'VBZ')),
   (('washing', 'VBG'), 'dobj', ('itself', 'PRP')),
   (('washing', 'VBG'), 'nmod', ('basin', 'NN')),
   (('basin', 'NN'), 'case', ('in', 'IN')),
   (('basin', 'NN'), 'det', ('the', 'DT')),
   (('basin', 'NN'), 'compound', ('water', 'NN')),
   (('washing', 'VBG'), 'punct', ('.', '.'))]],
 [[(('attempted', 'VBN'), 'nmod', ('May', 'NNP')),
   (('May', 'NNP'), 'case', ('In', 'IN')),
   (('May', 'NNP'), 'nummod', ('2010', 'CD')),
   (('attempted', 'VBN'), 'punct', (',', ',')),
   (('attempted', 'VBN'), 'nsubj', ('troops', 'NNS')),
   (('troops', 'NNS'), 'det', ('the', 'DT')),
   

## Calculate the Jaccard distance among the pairs of sentences
We calculate the Jaccard distance between the two sentences of each pair as a measure of how different they are, and contrast it with the gold standard (called the "golden ratio" in the code and output below): a reference similarity score from 0 to 5, where 0 means the sentences are completely different and 5 means they mean exactly the same.

In [4]:
# Jaccard distance between the two sets of dependency triples of each pair
# (0.0 = identical sets, 1.0 = no triple in common).
jaccard_distances = [
    jaccard_distance(*map(set, sentence_pairs))
    for sentence_pairs in sentences
]

# Reference similarity scores for the trial pairs, hard-coded in pair order.
golden_ratios = [0, 1, 2, 3, 4, 5]

# zip() would silently drop pairs if the two lists got out of sync.
assert len(golden_ratios) == len(jaccard_distances), \
    'expected exactly one reference score per sentence pair'

# The loop variable is deliberately NOT named `jaccard_distance`: that would
# overwrite the imported function and make this cell fail when re-run.
for golden_ratio, distance in zip(golden_ratios, jaccard_distances):
    print(f'Jaccard distance is {distance} and its real golden ratio is {golden_ratio}')

Jaccard distance is 1.0 and its real golden ratio is 0
Jaccard distance is 1.0 and its real golden ratio is 1
Jaccard distance is 1.0 and its real golden ratio is 2
Jaccard distance is 0.6 and its real golden ratio is 3
Jaccard distance is 1.0 and its real golden ratio is 4
Jaccard distance is 0.9666666666666667 and its real golden ratio is 5


## Pearson Linear Correlation Coefficient
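Finally, we measure the linear correlation between the Jaccard distances and the gold-standard scores. Because the first is a distance and the second a similarity, a good metric would yield a strongly negative coefficient.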

In [5]:
# pearsonr returns (correlation coefficient, p-value); only the coefficient
# is displayed as the cell's output.
correlation, _p_value = pearsonr(jaccard_distances, golden_ratios)
correlation

-0.18798210894408277

## To recap
Dependency triples are useful, but they must be combined with other techniques in order to extract useful information. Using the triples alone, we only get a collection of relations; adding, for example, semantic matching and coreference resolution would give a better result.