In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np
from fuzzywuzzy import fuzz 
import ipywidgets as widgets
import pprint
from ipywidgets import interact, interact_manual
import re
__PATH__ = "./data.csv"



In [2]:
df = pd.read_csv(__PATH__,sep=";",header=0)

In [3]:
titles = list(df['title'].apply(
    lambda t : 
        tuple(
            filter(lambda e:not e in stopwords.words('english'),
                map(lambda e:e.lower(),
                       re.findall('([A-Z]{1}[a-z]+)',t.replace('.pdf','')))
                )
            )
        )
    )

In [4]:
res = {}
for title in titles[:20]:
    synsets = {}
    for word in title:
        synsets[word]=[synset for synset in wn.synsets(word)]
    res[title] = synsets

In [5]:
def get_2parent_classes(synset):
    hyps = dict()
    level_count = 1
    while level_count < 3:
        try:
            synset = synset.hypernyms()[0]
            hyps[level_count] = synset 
            level_count += 1
        except IndexError:
            break
    return hyps

In [6]:
def extract_hypornyms_en(a):
    hypornyms_en_first = set()
    hypornyms_en_second = set()
    for word in a:
        # find the hypernyms of the word
        # word = wn.synset(word)
        try:
            word_synset0 = wn.synsets(word)[0]
            hyps_buff = get_2parent_classes(word_synset0)
            #print(hyps_buff)
            hypornyms_en_first.add(hyps_buff[1].name().split('.')[0])
            hypornyms_en_second.add(hyps_buff[2].name().split('.')[0])
        except IndexError:
            continue
        except KeyError:
            continue
    return hypornyms_en_first, hypornyms_en_second

In [7]:
def f1measure(a, b):
    a = set(a)
    b = set(b)
    # missed part
    overlap = set()
    overlap_hyp_cnt = 0
    overlap = a.intersection(b)
    overlap_hyp_cnt = len(overlap) 
    
    recl = overlap_hyp_cnt/len(a)
    prec = overlap_hyp_cnt/len(b)
    
    if len(overlap) == 0:
        return 0, overlap
    else:
        return 2*recl*prec/(recl+prec), overlap

In [8]:
lang = 'eng'

    
#def distance(a,b):
    ### Put your code here
    ### В переменной synsets помимо нормализованного заголовка хранятся синсеты для каждого токена из заголовка.
    ### у синсетов есть гиперонимы про то как из брать здесь
    ### Если у токенов заголовков есть общие гиперонимы тогда заголовки связаны несмотря на то что слова разные
    
    #return float((100-fuzz.ratio(a,b))/100)

    
def distance(a,b):
    ### Put your code here
    ### В переменной synsets помимо нормализованного заголовка хранятся синсеты для каждого токена из заголовка.
    ### у синсетов есть гиперонимы про то как из брать здесь
    ### Если у токенов заголовков есть общие гиперонимы тогда заголовки связаны несмотря на то что слова разные
    
    a = set(a) 
    b = set(b) 
    f1score, overlap = f1measure(a, b)
    a = a - overlap
    b = b - overlap
    
    if len(a) == 0 or len(b) == 0:
        return 1.0 - f1score
    
    buff_a = {}
    buff_b = {}
    
    if lang == 'eng':
        buff_a_first, buff_a_second = extract_hypornyms_en(a)
        buff_b_first, buff_b_second = extract_hypornyms_en(b)
        
    overlap_hyp_cnt = 0
    #print('buff_a_first',buff_a_first)
    #print('buff_b_first',buff_b_first)
    #print('buff_a_second',buff_a_second)
    #print('buff_b_second',buff_b_second)
    #for word in a:
    #    for wordb in b:
    #        if len(buff_a[word].intersection(buff_b[wordb])) > 0:
    #            overlap_hyp_cnt += 1
    
    if len(buff_a_first.intersection(buff_b_first)) > 0:
        overlap_hyp_cnt += 1
    
    if len(buff_a_second.intersection(buff_b_second)) > 0:
        overlap_hyp_cnt += 0.5
    
    
    
    recl_hyp = overlap_hyp_cnt/len(a)
    #print('recl_hyp',recl_hyp)
    prec_hyp = overlap_hyp_cnt/len(b)
    #print('prec_hyp',prec_hyp)
    f1score_hyp = 2*recl_hyp/(recl_hyp + prec_hyp) if overlap_hyp_cnt > 0 else 0
    # print('f1score_hyp',f1score_hyp)
    f1res = (2*f1score+f1score_hyp)/3
    #print('f1res',f1res,'\n')
    return (1.0 - f1res)

buff = list(res.items())
dist = np.zeros((len(buff),len(buff)))
for lli,ll in enumerate(buff):
    for rri,rr in enumerate(buff):
        dist[lli,rri] = distance(ll[0], rr[0])
        

In [9]:
dist

array([[0.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 0.51851852, 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 0.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.80952381, 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 1.        , 0.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 0.62962963,
        1.        , 1.        , 1.        , 1.        , 1.        ],
       [1.        , 1.        , 1.        , 0.        , 0.75757576,
        1.        , 1.        , 0.85964912, 1.        , 1.        ,
        1.        , 1.        , 1.        , 1

In [10]:
@interact(ind=(0,len(buff)-1,1))
def h(ind=0):
    pp = pprint.PrettyPrinter(indent=4)
    print(' '.join(buff[ind][0]))
    pp.pprint([buff[i][0] for i in dist[ind][:].argsort()[1:11]])


interactive(children=(IntSlider(value=0, description='ind', max=19), Output()), _dom_classes=('widget-interact…

In [11]:
@interact(ind=(0,len(buff)-1,1))
def hypernyms(ind=0):
    pp = pprint.PrettyPrinter(indent=4)
    print(' '.join(buff[ind][0]))
    pp.pprint(buff[ind][1])

interactive(children=(IntSlider(value=0, description='ind', max=19), Output()), _dom_classes=('widget-interact…