# Mandatory exercise

#### First, we import the required modules and open the input file in reading mode.

In [1]:
import nltk
import string
from nltk import pos_tag
from nltk.wsd import lesk
from nltk.metrics import jaccard_distance
from scipy.stats.stats import pearsonr

txt = open('../trial/STS.input.txt', 'r')

#### Then, we read all the pairs of sentences of the trial set.

In [2]:
d = {}
for line in txt:
    # lowercase the sentences
    line = line.lower()
    fields = line.strip().split('\t')
    d[fields[0]] = fields[1:]

print(d)

{'id1': ['the bird is bathing in the sink.', 'birdie is washing itself in the water basin.'], 'id2': ['in may 2010, the troops attempted to invade kabul.', 'the us army invaded kabul on may 7th last year, 2010.'], 'id3': ['john said he is considered a witness but not a suspect.', '"he is not a suspect anymore." john said.'], 'id4': ['they flew out of the nest in groups.', 'they flew into the nest together.'], 'id5': ['the woman is playing the violin.', 'the young lady enjoys listening to the guitar.'], 'id6': ['john went horse back riding at dawn with a whole group of friends.', 'sunrise at dawn is a magnificent view to take in if you wake up early enough for it.']}


#### Now, we tokenize the sentences.

In [3]:
for key in d:
    d[key] = [nltk.word_tokenize(s) for s in d[key]]

print(d)

{'id1': [['the', 'bird', 'is', 'bathing', 'in', 'the', 'sink', '.'], ['birdie', 'is', 'washing', 'itself', 'in', 'the', 'water', 'basin', '.']], 'id2': [['in', 'may', '2010', ',', 'the', 'troops', 'attempted', 'to', 'invade', 'kabul', '.'], ['the', 'us', 'army', 'invaded', 'kabul', 'on', 'may', '7th', 'last', 'year', ',', '2010', '.']], 'id3': [['john', 'said', 'he', 'is', 'considered', 'a', 'witness', 'but', 'not', 'a', 'suspect', '.'], ['``', 'he', 'is', 'not', 'a', 'suspect', 'anymore', '.', "''", 'john', 'said', '.']], 'id4': [['they', 'flew', 'out', 'of', 'the', 'nest', 'in', 'groups', '.'], ['they', 'flew', 'into', 'the', 'nest', 'together', '.']], 'id5': [['the', 'woman', 'is', 'playing', 'the', 'violin', '.'], ['the', 'young', 'lady', 'enjoys', 'listening', 'to', 'the', 'guitar', '.']], 'id6': [['john', 'went', 'horse', 'back', 'riding', 'at', 'dawn', 'with', 'a', 'whole', 'group', 'of', 'friends', '.'], ['sunrise', 'at', 'dawn', 'is', 'a', 'magnificent', 'view', 'to', 'take', 

In [4]:
def get_valid_pos(tag):
    #if word is a noun
    if tag.startswith('N'):
        return 'n'
    #if word is a verb
    elif tag.startswith('V'):
        return 'v'
    #if word is an adjective
    elif tag.startswith('J'):
        return 'a'
    #if word is a adverb
    elif tag.startswith('R'):
        return 'b'

def compute_lesk(d):
    new_dict = {}
    for key in d:
        new_dict[key] = []
        for sentence in d[key]:
            new_sentence = []
            for word in sentence:
                pos = pos_tag([word])[0][1]
                if pos[0] in {'N', 'V', 'J', 'R'}:
                    synset = lesk(sentence, word, get_valid_pos(pos))
                    if(synset is not None):
                        split = synset.name().split('.')
                        new_sentence.append(word + '/' + pos + '/' + split[-1])
                    else:
                        new_sentence.append(word + '/' + pos)
                else:
                    new_sentence.append(word + '/' + pos)
            new_dict[key].append(new_sentence)
    return(new_dict)

def jaccard(d):
    similarities = {}
    print('Computed Jaccard similarity for each pair of sentences:')
    for key in d:
        sentences = d[key]
        distance = jaccard_distance(set(sentences[0]), set(sentences[1]))
        similarity = 1-distance
        similarities[key] = [similarity]
        print(key, ': ', similarity)
    return similarities

#### Finally, we apply Lesk algorithm to the sentences of the trial set.

In [5]:
new_d = compute_lesk(d)
print(new_d)

{'id1': [['the/DT', 'bird/NN/02', 'is/VBZ/12', 'bathing/NN/01', 'in/IN', 'the/DT', 'sink/NN/01', './.'], ['birdie/NN/01', 'is/VBZ/12', 'washing/VBG/09', 'itself/PRP', 'in/IN', 'the/DT', 'water/NN/01', 'basin/NN/01', './.']], 'id2': [['in/IN', 'may/MD', '2010/CD', ',/,', 'the/DT', 'troops/NNS/02', 'attempted/VBN/01', 'to/TO', 'invade/NN', 'kabul/NN/01', './.'], ['the/DT', 'us/PRP', 'army/NN/01', 'invaded/VBN/03', 'kabul/NN/01', 'on/IN', 'may/MD', '7th/NNS', 'last/JJ/02', 'year/NN/03', ',/,', '2010/CD', './.']], 'id3': [['john/NN/01', 'said/VBD/01', 'he/PRP', 'is/VBZ/02', 'considered/VBN/02', 'a/DT', 'witness/NN/05', 'but/CC', 'not/RB', 'a/DT', 'suspect/NN/01', './.'], ['``/``', 'he/PRP', 'is/VBZ/02', 'not/RB', 'a/DT', 'suspect/NN/01', 'anymore/RB', './.', "''/''", 'john/NN/01', 'said/VBD/01', './.']], 'id4': [['they/PRP', 'flew/NN', 'out/IN', 'of/IN', 'the/DT', 'nest/JJS', 'in/IN', 'groups/NNS/02', './.'], ['they/PRP', 'flew/NN', 'into/IN', 'the/DT', 'nest/JJS', 'together/RB', './.']], 

#### We can now compute the Jaccard distance.

In [6]:
similarities = jaccard(new_d)

Computed Jaccard similarity for each pair of sentences:
id1 :  0.33333333333333337
id2 :  0.33333333333333337
id3 :  0.5714285714285714
id4 :  0.4545454545454546
id5 :  0.16666666666666663
id6 :  0.09999999999999998


#### We can now compute the Pearson's correlation between the previously computed values and the values from the golden standard.

In [7]:
gs = open('../trial/STS.gs.txt', 'r')

for line in gs:
    fields = line.strip().split('\t')
    similarities[fields[0]].append(int(fields[1]))

jaccard_distances = []
golden_record = []
for key in similarities:
    jaccard_distances.append(similarities[key][0])
    golden_record.append(similarities[key][1])
    
print(pearsonr(golden_record, jaccard_distances)[0])

0.5444093003285272


## Compare the results
In the session 2 and 3 we got the following results:

### Session 2:
Computed Jaccard similarity for each pair of sentences:
* 'id1' :  0.3076923076923077
* 'id2' :  0.26315789473684215
* 'id3' :  0.4666666666666667
* 'id4' :  0.4545454545454546
* 'id5' :  0.23076923076923073
* 'id6' :  0.13793103448275867

Pearson's correlation: 0.3962389776119233

### Session 3:
Computed Jaccard similarity for each pair of sentences:
* 'id1': 0.33333333333333337
* 'id2': 0.4117647058823529
* 'id3': 0.5714285714285714
* 'id4': 0.4545454545454546
* 'id5': 0.16666666666666663
* 'id6': 0.13793103448275867

Pearson's correlation: 0.5790860088205633

# Optional Exercise 2

## Stopwords

#### First, we remove the stopwords from the sentences of the trial set.

In [8]:
from nltk.corpus import stopwords

sw=set(stopwords.words('english'))

filtered = {}
for key in d:
    filtered[key] = []
    for s in d[key]:
        new_s = [w for w in s if w not in sw]
        filtered[key].append(new_s)

print(filtered)

{'id1': [['bird', 'bathing', 'sink', '.'], ['birdie', 'washing', 'water', 'basin', '.']], 'id2': [['may', '2010', ',', 'troops', 'attempted', 'invade', 'kabul', '.'], ['us', 'army', 'invaded', 'kabul', 'may', '7th', 'last', 'year', ',', '2010', '.']], 'id3': [['john', 'said', 'considered', 'witness', 'suspect', '.'], ['``', 'suspect', 'anymore', '.', "''", 'john', 'said', '.']], 'id4': [['flew', 'nest', 'groups', '.'], ['flew', 'nest', 'together', '.']], 'id5': [['woman', 'playing', 'violin', '.'], ['young', 'lady', 'enjoys', 'listening', 'guitar', '.']], 'id6': [['john', 'went', 'horse', 'back', 'riding', 'dawn', 'whole', 'group', 'friends', '.'], ['sunrise', 'dawn', 'magnificent', 'view', 'take', 'wake', 'early', 'enough', '.']]}


#### Now we apply Lesk algorithm to the sentences of the trial set from which we removed the stopwords.

In [9]:
new_d = compute_lesk(filtered)
print(new_d)

{'id1': [['bird/NN/02', 'bathing/NN/01', 'sink/NN/01', './.'], ['birdie/NN/01', 'washing/VBG/09', 'water/NN/02', 'basin/NN/01', './.']], 'id2': [['may/MD', '2010/CD', ',/,', 'troops/NNS/04', 'attempted/VBN/01', 'invade/NN', 'kabul/NN/01', './.'], ['us/PRP', 'army/NN/01', 'invaded/VBN/04', 'kabul/NN/01', 'may/MD', '7th/NNS', 'last/JJ/02', 'year/NN/02', ',/,', '2010/CD', './.']], 'id3': [['john/NN/03', 'said/VBD/01', 'considered/VBN/02', 'witness/NN/05', 'suspect/NN/01', './.'], ['``/``', 'suspect/NN/01', 'anymore/RB', './.', "''/''", 'john/NN/03', 'said/VBD/01', './.']], 'id4': [['flew/NN', 'nest/JJS', 'groups/NNS/03', './.'], ['flew/NN', 'nest/JJS', 'together/RB', './.']], 'id5': [['woman/NN/02', 'playing/VBG/02', 'violin/NN/01', './.'], ['young/JJ/01', 'lady/NN/03', 'enjoys/NNS', 'listening/VBG/02', 'guitar/NN/01', './.']], 'id6': [['john/NN/01', 'went/VBD/04', 'horse/NN/01', 'back/RB', 'riding/VBG/01', 'dawn/NN/03', 'whole/JJ/02', 'group/NN/03', 'friends/NNS/01', './.'], ['sunrise/NN

#### We can now compute the Jaccard distance.

In [10]:
similarities_filtered = jaccard(new_d)

Computed Jaccard similarity for each pair of sentences:
id1 :  0.125
id2 :  0.3571428571428571
id3 :  0.4444444444444444
id4 :  0.6
id5 :  0.11111111111111116
id6 :  0.11764705882352944


#### We can now compute the Pearson's correlation between the previously computed values and the values from the golden standard.

In [11]:
jaccard_distances = []
for key in similarities_filtered:
    jaccard_distances.append(similarities_filtered[key][0])

print(pearsonr(golden_record, jaccard_distances)[0])

0.16025255903367078
