In [104]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn

In [2]:
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [3]:
wn.synset('dog.n.01')

Synset('dog.n.01')

In [4]:
wn.synset('dog.n.01').definition()

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [8]:
wn.synset('dog.n.01').examples()

['the dog barked all night']

* **`doc_to_synsets:`** returns a list of synsets in document. This function should first tokenize and part of speech tag the document using `nltk.word_tokenize` and `nltk.pos_tag`. Then it should find each tokens corresponding synset using `wn.synsets(token, wordnet_tag)`. The first synset match should be used. If there is no match, that token is skipped.

In [11]:
#doc = "Data science is probably the most popular concept nowadays. \
#        I believe that many people are looking for an entrance to get \
#        inside the industry, and I just happened to read an article that \
#        lists some great data science books that may be helpful for you. \
#        So I concluded it in this article and I’ve also given the books \
#        brief introductions, so you can choose the ones you’d like to read. \
#        Some of the data science books you can find it online, and I've \
#        given out the links. But most of them I think you may need to find \
#        them on Amazon."

In [136]:
doc = 'I like cats'

In [19]:
def convert_tag(tag):
    """Convert the tag given by nltk.pos_tag to the tag used by wordnet.synsets"""
    
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    try:
        return tag_dict[tag[0]]
    except KeyError:
        return None

In [165]:
doc1 = 'This is a function to test document_path_similarity.'
doc2 = 'Use this function to see if your code in doc_to_synsets \
    and similarity_score is correct!'

In [181]:
doc_tokenized = nltk.tokenize.word_tokenize(doc2)
doc_pos = nltk.pos_tag(doc_tokenized)
l = []
res = []
for i, j in doc_pos:
    l.append((i, convert_tag(j)))
for x, y in l:
    if wn.synsets(x,y):
        res.append(wn.synsets(x,y)[0])
print(res)

[Synset('use.v.01'), Synset('function.n.01'), Synset('see.v.01'), Synset('code.n.01'), Synset('inch.n.01'), Synset('be.v.01'), Synset('correct.a.01')]


In [147]:
wn.synsets('like','v')

[Synset('wish.v.02'),
 Synset('like.v.02'),
 Synset('like.v.03'),
 Synset('like.v.04'),
 Synset('like.v.05')]

In [129]:
def doc_to_synsets(doc):
    """
    Returns a list of synsets in document.

    Tokenizes and tags the words in the document doc.
    Then finds the first synset for each word/tag combination.
    If a synset is not found for that combination it is skipped.

    Args:
        doc: string to be converted

    Returns:
        list of synsets

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]
    """
    doc_tokenized = nltk.tokenize.word_tokenize(doc)
    doc_pos = nltk.pos_tag(doc_tokenized)
    l = []
    for x, y in doc_pos:
        try:
            l.append(wn.synsets(x)[0])
        except:
            None
    return(l)

In [90]:
doc_to_synsets('the dogs have terrible gas')

[Synset('dog.n.01'),
 Synset('rich_person.n.01'),
 Synset('awful.s.02'),
 Synset('gas.n.01')]

* **`similarity_score:`** returns the normalized similarity score of a list of synsets (s1) onto a second list of synsets (s2). For each synset in s1, find the synset in s2 with the largest similarity value. Sum all of the largest similarity values together and normalize this value by dividing it by the number of largest similarity values found. Be careful with data types, which should be floats. Missing values should be ignored.

In [59]:
wn.synsets('I')

[Synset('iodine.n.01'),
 Synset('one.n.01'),
 Synset('i.n.03'),
 Synset('one.s.01')]

In [51]:
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
hit = wn.synset('hit.v.01')
slap = wn.synset('slap.v.01')

In [52]:
dog.path_similarity(cat)

0.2

In [53]:
cat.path_similarity(dog)

0.2

In [66]:
wn.synset('dog.n.01').path_similarity(wn.synset('cat.n.01'))

0.2

In [91]:
#s1 = 'I like cats'
#s2 = 'I like dogs'
s1 = "i have bad farts"
s2 = "the dogs have terrible gas"

In [92]:
synset1 = doc_to_synsets(s1)
synset2 = doc_to_synsets(s2)

In [97]:
biggest = []
for x in synset1:
    l = []
    for y in synset2:
        ps = x.path_similarity(y)
        if ps is not None:
            l.append(ps)
    biggest.append(max(l))
sum(biggest)/len(biggest)  

0.3340277777777778

In [100]:
def similarity_score(s1, s2):
    """
    Calculate the normalized similarity score of s1 onto s2

    For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.

    Args:
        s1, s2: list of synsets from doc_to_synsets

    Returns:
        normalized similarity score of s1 onto s2

    Example:
        synsets1 = doc_to_synsets('I like cats')
        synsets2 = doc_to_synsets('I like dogs')
        similarity_score(synsets1, synsets2)
        Out: 0.73333333333333339
    """
    #synset1 = doc_to_synsets(s1)
    #synset2 = doc_to_synsets(s2)
    biggest = []
    for x in s1:
        l = []
        for y in s2:
            ps = x.path_similarity(y)
            if ps is not None:
                l.append(ps)
        biggest.append(max(l))
        
    return(sum(biggest)/len(biggest))

In [101]:
s1 = doc_to_synsets("i have bad farts")
s2 = doc_to_synsets("the dogs have terrible gas")
similarity_score(s1, s2)

0.3340277777777778

In [157]:
def mp_doc_to_synsets(doc):
    tokens = nltk.word_tokenize(doc)
    pos = nltk.pos_tag(tokens)
    tags = [tag[1] for tag in pos]
    wntag = [convert_tag(tag) for tag in tags]
    ans = list(zip(tokens,wntag))
    sets = [wn.synsets(x,y) for x,y in ans]
    print(sets)
    final = [val[0] for val in sets if len(val) > 0]
    
    return final

In [158]:
mp_doc_to_synsets('I like cats')

[[Synset('iodine.n.01'), Synset('one.n.01'), Synset('i.n.03'), Synset('one.s.01')], [Synset('wish.v.02'), Synset('like.v.02'), Synset('like.v.03'), Synset('like.v.04'), Synset('like.v.05')], [Synset('cat.n.01'), Synset('guy.n.01'), Synset('cat.n.03'), Synset('kat.n.01'), Synset('cat-o'-nine-tails.n.01'), Synset('caterpillar.n.02'), Synset('big_cat.n.01'), Synset('computerized_tomography.n.01')]]


[Synset('iodine.n.01'), Synset('wish.v.02'), Synset('cat.n.01')]

In [130]:
doc_to_synsets('I like cats')

['I', 'like', 'cats']
[('I', 'PRP'), ('like', 'VBP'), ('cats', 'NNS')]


[Synset('iodine.n.01'), Synset('like.n.01'), Synset('cat.n.01')]

In [108]:
def document_path_similarity(doc1, doc2):
    """Finds the symmetrical similarity between doc1 and doc2"""

    synsets1 = doc_to_synsets(doc1)
    synsets2 = doc_to_synsets(doc2)

    return (similarity_score(synsets1, synsets2) + similarity_score(synsets2, synsets1)) / 2

In [118]:
#doc_to_synsets("i have terrible farts")
#doc_to_synsets("the dogs have awful gas")
#synsets1 = doc_to_synsets("i have terrible farts")
#synsets2 = doc_to_synsets("the dogs have awful gas")
#similarity_score(synset1, synset2)
doc1 = "i have terrible farts"
doc2 = "the dogs have awful gas"
document_path_similarity(doc1, doc2)

ValueError: max() arg is an empty sequence

<br>
___
`paraphrases` is a DataFrame which contains the following columns: `Quality`, `D1`, and `D2`.

`Quality` is an indicator variable which indicates if the two documents `D1` and `D2` are paraphrases of one another (1 for paraphrase, 0 for not paraphrase).

In [105]:
# Use this dataframe for questions most_similar_docs and label_accuracy
paraphrases = pd.read_csv('paraphrases.csv')
paraphrases.head()

Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...


___

### most_similar_docs

Using `document_path_similarity`, find the pair of documents in paraphrases which has the maximum similarity score.

*This function should return a tuple `(D1, D2, similarity_score)`*

In [107]:
def most_similar_docs():
    
    # Your Code Here
    
    return # Your Answer Here