### Document Similarity
Measures how similar two documents are using WordNet synsets and path similarity.

In [0]:
import numpy as np
import nltk
import pandas as pd
# Fetch the NLTK resources used below: the tokenizer models, the POS tagger
# and the WordNet corpus.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from google.colab import drive
from nltk.corpus import wordnet as wn

# List the NLTK data directory to confirm the downloads landed.
!ls -al /root/nltk_data

# Mount Google Drive (Colab only; triggers an interactive authorization prompt).
drive.mount('/content/gdrive')
# Folder that holds the notebook's input files; a later cell reads paraphrases.csv from it.
# NOTE(review): this name shadows the builtin `dir` for the rest of the session.
dir="/content/gdrive/My Drive/Colab Notebooks/NLP/"

!ls


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
total 24
drwxr-xr-x 5 root root 4096 Dec  3 03:45 .
drwx------ 1 root root 4096 Dec  3 03:45 ..
drwxr-xr-x 3 root root 4096 Dec  3 03:45 corpora
drwxr-xr-x 3 root root 4096 Dec  3 03:45 taggers
drwxr-xr-x 3 root root 4096 Dec  3 03:45 tokenizers
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%

In [0]:
def convert_tag(tag):
    """Convert a tag given by nltk.pos_tag to the tag used by wordnet.synsets.

    Only the first character of the tag is inspected:
    'N' -> 'n', 'J' -> 'a', 'R' -> 'r', 'V' -> 'v'.

    Args:
        tag: POS tag string, e.g. 'NN', 'VBD', 'JJ'.

    Returns:
        The matching WordNet POS letter, or None when the tag has no mapping
        or is empty/None.
    """
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}

    # Guard first: indexing an empty tag would raise IndexError, which the
    # original KeyError handler did not cover.
    if not tag:
        return None
    return tag_dict.get(tag[0])

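#### `doc_to_synsets`: returns the list of synsets for a document/sentence.

Each token is POS-tagged and mapped to its first matching WordNet synset; tokens with no match (such as the nonsense word `nvqjp` below) are skipped.

    Example:
        doc_to_synsets('Fish are nvqjp friends.')
        Out: [Synset('fish.n.01'), Synset('be.v.01'), Synset('friend.n.01')]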

In [0]:
def doc_to_synsets(doc):
    """Return one synset per recognisable token of ``doc``.

    The text is tokenized and POS-tagged with nltk; each tag is translated
    with ``convert_tag`` and used to look the token up in WordNet.

    Args:
        doc: the document/sentence as a string.

    Returns:
        A list holding the first synset found for each token, in token
        order. Tokens for which WordNet returns nothing are left out.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(doc))

    synsets = []
    for word, pos in tagged_tokens:
        candidates = wn.synsets(word, convert_tag(pos))
        if candidates:
            # Keep only the first synset returned for the token.
            synsets.append(candidates[0])
    return synsets

#doc_to_synsets('Fish are nvqjp friends.')

#### `similarity_score`: returns the normalized similarity score of s1 onto s2
*Args: `s1`, `s2` - lists of synsets returned by `doc_to_synsets`.*



In [0]:
"""For each synset in s1, finds the synset in s2 with the largest similarity value.
    Sum of all of the largest similarity values and normalize this value by dividing it by the
    number of largest similarity values found.""" 

def similarity_score(s1, s2):
    """Return the normalized similarity score of s1 onto s2.

    For each synset in s1, find the largest ``path_similarity`` value against
    any synset in s2. The score is the mean of those per-synset maxima.

    Args:
        s1: list of synsets (e.g. from doc_to_synsets).
        s2: list of synsets to compare against.

    Returns:
        The mean of the best similarities as a float, or 0.0 when no synset
        in s1 has a comparable synset in s2 (including when either list is
        empty), instead of raising ZeroDivisionError.
    """
    best_scores = []
    for syn1 in s1:
        # path_similarity yields None for pairs that cannot be compared
        # (e.g. similarity(I, like)); those pairs are discarded.
        scores = [
            sim
            for sim in (syn1.path_similarity(syn2) for syn2 in s2)
            if sim is not None
        ]
        if scores:
            best_scores.append(max(scores))

    # Nothing comparable: report zero similarity rather than dividing by zero.
    if not best_scores:
        return 0.0
    return sum(best_scores) / len(best_scores)

In [0]:
def document_path_similarity(doc1, doc2):
    """Return the symmetric similarity between doc1 and doc2.

    Both documents are turned into synset lists, scored in each direction
    with ``similarity_score``, and the two directional scores are averaged.
    """
    first = doc_to_synsets(doc1)
    second = doc_to_synsets(doc2)

    forward = similarity_score(first, second)
    backward = similarity_score(second, first)
    return (forward + backward) / 2

In [0]:
# synsets1 = doc_to_synsets('I like cats')
# synsets2 = doc_to_synsets('I like dogs')
# similarity_score(synsets1, synsets2)

# Quick check of the symmetric score on two short sentence pairs.
print(document_path_similarity('I like cats', 'I like dogs'))
print(document_path_similarity('I like cats', 'I do not like dogs'))

0.7333333333333334
0.43333333333333335


#### Find the pair of documents in the paraphrases file that has the maximum similarity score.


In [0]:
# `Quality` indicates if the two documents `D1` and `D2` are paraphrases of one another (1 for paraphrase, 0 for not).
# Read the labelled pairs from the folder stored in `dir` (set in the setup cell).
paraphrases = pd.read_csv(dir + 'paraphrases.csv')
print(paraphrases.shape)
# Last expression of the cell: shows the first rows of the frame.
paraphrases.head()

(20, 3)


Unnamed: 0,Quality,D1,D2
0,1,"Ms Stewart, the chief executive, was not expec...","Ms Stewart, 61, its chief executive officer an..."
1,1,After more than two years' detention under the...,After more than two years in detention by the ...
2,1,"""It still remains to be seen whether the reven...","""It remains to be seen whether the revenue rec..."
3,0,"And it's going to be a wild ride,"" said Allan ...","Now the rest is just mechanical,"" said Allan H..."
4,1,The cards are issued by Mexico's consulates to...,The card is issued by Mexico's consulates to i...


In [0]:
def most_similar_columns():
    """Score every (D1, D2) pair and return the most similar one.

    Side effect: writes the per-row similarity into a ``score`` column of
    the module-level ``paraphrases`` frame.

    Returns:
        A tuple ``(D1, D2, score)`` taken from the row with the highest score.
    """
    paraphrases["score"] = [
        document_path_similarity(first, second)
        for first, second in zip(paraphrases["D1"], paraphrases["D2"])
    ]

    # Look the winning row up once and read all three fields from it.
    best_row = paraphrases.loc[paraphrases["score"].idxmax()]
    return (best_row.D1, best_row.D2, best_row.score)

# Run the search; this also adds the `score` column to `paraphrases`.
most_similar_columns()

('"Indeed, Iran should be put on notice that efforts to try to remake Iraq in their image will be aggressively put down," he said.',
 '"Iran should be on notice that attempts to remake Iraq in Iran\'s image will be aggressively put down," he said.\n',
 0.9753086419753086)

#### Get label_accuracy

Compute a predicted label for each of the twenty document pairs from its similarity score: if the score is greater than 0.75 the predicted label is 1 (paraphrase), otherwise 0. Report the accuracy of these predictions against the `Quality` column using scikit-learn's `accuracy_score`.

In [0]:
from sklearn.metrics import accuracy_score

def label_accuracy(threshold=0.75):
    """Label each pair from its similarity score and return the accuracy.

    A pair is labelled 1 when its ``score`` is strictly greater than
    ``threshold`` and 0 otherwise. The labels are written to a ``label``
    column of the module-level ``paraphrases`` frame and compared with the
    ``Quality`` column.

    Args:
        threshold: score cut-off above which a pair is labelled as a
            paraphrase. Defaults to 0.75.

    Returns:
        The value returned by ``accuracy_score(Quality, label)``.

    Raises:
        KeyError: if the ``score`` column has not been computed yet.
    """
    # The scores are produced by most_similar_columns(); fail with a clear
    # message if this cell is run before that one.
    if "score" not in paraphrases.columns:
        raise KeyError(
            "score column is missing; run most_similar_columns() first"
        )

    paraphrases["label"] = np.where(paraphrases["score"] > threshold, 1, 0)
    return accuracy_score(paraphrases["Quality"], paraphrases["label"])

# Display the accuracy of the score-based labels against `Quality`.
label_accuracy()

0.8