# This is how the features defined in the knowledge graph module are calculated

In [3]:
import nltk
import torch
import json
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pickle
import Levenshtein

In [10]:
# Load the tokenizer and encoder from the same pretrained checkpoint.
SIMCSE_CHECKPOINT = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(SIMCSE_CHECKPOINT)
model = AutoModel.from_pretrained(SIMCSE_CHECKPOINT)

In [8]:
# Freebase types: read the type-name -> value mapping from disk.
with open('./mapping_dict.json', 'r', encoding='utf-8') as mapping_file:
    map_dict = json.load(mapping_file)

# Order the keys by their mapped value, then phrase each "domain.type_name"
# key as "type name of domain".
fb_sorted_list = [
    (parts[1] + ' of ' + parts[0]).replace('_', ' ')
    for parts in (
        fb_key.split('.')
        for fb_key, _value in sorted(map_dict.items(), key=lambda item: item[1])
    )
]

## output some example
print(len(fb_sorted_list), fb_sorted_list[:5])

255 ['football coach of american football', 'football conference of american football', 'football player of american football', 'football team of american football', 'park of amusement parks']


In [11]:
# Tokenize all Freebase type phrases as one padded batch; the encoding
# carries input_ids, token_type_ids and attention_mask.
tokenized_fb_types = tokenizer(
    fb_sorted_list,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(tokenized_fb_types['input_ids'][:3])

# Get the embeddings (gradient tracking is switched off for this forward pass).
with torch.no_grad():
    fb_embeddings = model(
        **tokenized_fb_types,
        output_hidden_states=True,
        return_dict=True,
    ).pooler_output
print(fb_embeddings.shape)

tensor([[ 101, 2374, 2873, 1997, 2137, 2374,  102,    0,    0,    0],
        [ 101, 2374, 3034, 1997, 2137, 2374,  102,    0,    0,    0],
        [ 101, 2374, 2447, 1997, 2137, 2374,  102,    0,    0,    0]])
torch.Size([255, 768])


In [12]:
# Have a glance at the data I created.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('../data/ssy_test.coltype.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)

print(df.columns, '\n', df.iloc[:2])

Index(['table_id', 'labels', 'data', 'label_ids', 'header', 'type'], dtype='object') 
     table_id                         labels  \
0  6675886-1  [sports.sports_league_season]   
1  6675886-1                [people.person]   

                                                data  \
0  1998 1999 2000 2001 2002 2003 2004 2005 2006 2...   
1  Jackie Stewart Adrian Fernández Adrian Fernánd...   

                                           label_ids         header  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  season season   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  driver driver   

                                                type  
0  {'Champ_Car_season': 5, 'IndyCar_Series_season...  
1  {'http://dbpedia.org/ontology/Person': 10, 'ht...  


In [13]:
# Toy example of a per-column type dict such as
# {"http://www.dbpedia.org/Entity": 13, ...}; it is handled like the Freebase
# mapping earlier in the notebook: sort first, then derive readable names.
temp_dict = {"Entity": 13, "http://dbpedia.org/ontology/Example": 10}

# Largest value first.
complex_list = sorted(temp_dict.items(), key=lambda pair: pair[1], reverse=True)
print(complex_list[:5])

# DBpedia ontology URIs keep only their last path segment; every other name
# just has its underscores replaced by spaces.
target_list = [
    name.split('/')[-1] if 'http://dbpedia.org/ontology/' in name else name.replace('_', ' ')
    for name, _count in complex_list
]
print(target_list[:5])

count_list = [count for _name, count in complex_list]
print(count_list[:5])

[('Entity', 13), ('http://dbpedia.org/ontology/Example', 10)]
['Entity', 'Example']
[13, 10]


In [14]:
def get_dist_cosine(emb1, emb2):
    """Pairwise cosine similarity between the rows of two 2-D embedding sets.

    Args:
        emb1: array-like with one embedding per row.
        emb2: array-like with one embedding per row (same width as `emb1`).

    Returns:
        A NumPy array with one row per `emb1` row and one column per `emb2`
        row. Entries that evaluate to NaN are replaced by 0.
    """
    rows_a = np.array(emb1)
    rows_b = np.array(emb2)
    # Column of row norms times row of row norms gives the full grid of
    # norm products used as the denominator.
    norms_a = np.linalg.norm(rows_a, axis=1).reshape(rows_a.shape[0], 1)
    norms_b = np.linalg.norm(rows_b, axis=1)
    similarity = rows_a.dot(rows_b.T) / (norms_a * norms_b)
    return np.where(np.isnan(similarity), 0, similarity)

In [15]:
# Deep-learning similarity.

## calculate embedding similarity score between CaLiGraph/DBpedia type names
## and Freebase types
def es_cal(model, tar_list, comp_embeddings):
    """Embed `tar_list` and compare the embeddings with `comp_embeddings`.

    Note: the notebook-level `tokenizer` is read as a global and must already
    be defined.

    Args:
        model: callable invoked with the tokenizer output; its result must
            expose `pooler_output`.
        tar_list: list of strings to embed.
        comp_embeddings: embeddings to compare against (one per row).

    Returns:
        The result of `get_dist_cosine(type_embeddings, comp_embeddings)`:
        one row per entry of `tar_list`, one column per comparison embedding.
    """
    token_id = tokenizer(tar_list, padding=True, truncation=True, return_tensors='pt')

    ## between type_embeddings and fb_embeddings
    with torch.no_grad():
        type_embeddings = model(**token_id, output_hidden_states=True, return_dict=True).pooler_output
    es = get_dist_cosine(type_embeddings, comp_embeddings)

    return es


# Embedding similarity of each example type name against each Freebase type.
es = es_cal(model, target_list, fb_embeddings)
print(es.shape, '\n')

(2, 255) 



In [16]:
def _leven_stop_words():
    """Return the set of tokens ignored by get_dist_leven, built once and cached."""
    cache = getattr(_leven_stop_words, '_cache', None)
    if cache is None:
        cache = set(nltk.corpus.stopwords.words('english'))
        cache.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', None])
        _leven_stop_words._cache = cache
    return cache


def get_dist_leven(main_sent, sub_sent):
    """Score how well the content words of `sub_sent` are matched in `main_sent`.

    Both sentences are lower-cased, tokenized with NLTK and filtered against
    English stop words plus punctuation. Every remaining `sub_sent` token gets
    the best `Levenshtein.ratio` against any remaining `main_sent` token
    (0 when `main_sent` has none). One lowest score is discarded per five
    `sub_sent` tokens, and the mean of the remaining scores is returned.

    Args:
        main_sent: sentence whose tokens are searched.
        sub_sent: sentence whose tokens are scored.

    Returns:
        The mean score, or 0.0 when `sub_sent` has no token left after
        filtering (the mean of an empty list would otherwise be NaN, which
        poisons any later argmax).
    """
    stop_words = _leven_stop_words()
    mains = [i for i in nltk.word_tokenize(main_sent.lower()) if i not in stop_words]
    subs = [i for i in nltk.word_tokenize(sub_sent.lower()) if i not in stop_words]

    if not subs:
        return 0.0

    # Best match per sub token; default=0 covers an empty `mains`.
    res_list = [
        max((Levenshtein.ratio(token, text) for text in mains), default=0)
        for token in subs
    ]

    # Tolerate one unmatched token per five: drop the lowest scores.
    for _ in range(len(subs) // 5):
        res_list.pop(int(np.argmin(res_list)))

    return np.mean(res_list)


# Quick check on two short sentences.
x, y = 'life is but a dream', 'life is an apple'
print(get_dist_leven(x, y))

0.7222222222222222


In [17]:
'''
text sim
'''

## calculate levenshtein similarity score btw caligraph & dbpedia and freebase
def ls_cal(tar_list, comp_list):
    """Build the matrix of `get_dist_leven` scores for every (target, comp) pair.

    Args:
        tar_list: strings passed as the first argument of `get_dist_leven`.
        comp_list: strings passed as the second argument of `get_dist_leven`.

    Returns:
        A NumPy array with one row per entry of `tar_list` and one column per
        entry of `comp_list`.
    """
    score_rows = [
        [get_dist_leven(target, comp) for comp in comp_list]
        for target in tar_list
    ]
    return np.array(score_rows)


# Levenshtein similarity of each example type name against each Freebase type.
ls = ls_cal(target_list, fb_sorted_list)
print(ls.shape, '\n')

(2, 255) 



# Now let's predict only with KG

In [4]:
import nltk
import torch
import json
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pickle
import Levenshtein

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
## Data file 
with open('../data/ssy_test.coltype.pkl', 'rb') as f:
    df = pickle.load(f)
with open('./mapping_dict.json', 'r', encoding='utf-8') as f:
    map_dict = json.load(f)
print(df.loc[0]['type'])

{'Champ_Car_season': 5, 'IndyCar_Series_season': 9}


In [6]:
# Load the tokenizer and encoder from the same pretrained checkpoint.
SIMCSE_CHECKPOINT = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(SIMCSE_CHECKPOINT)
model = AutoModel.from_pretrained(SIMCSE_CHECKPOINT)

In [7]:
# Freebase (key, value) pairs ordered by mapped value, and their readable
# "type name of domain" phrases in the same order.
fb = sorted(map_dict.items(), key=lambda item: item[1])
fb_sorted_list = [
    (parts[1] + ' of ' + parts[0]).replace('_', ' ')
    for parts in (fb_key.split('.') for fb_key, _value in fb)
]

# Embed every phrase once; the cells below reuse these embeddings.
tokenized_fb_types = tokenizer(
    fb_sorted_list,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    fb_embeddings = model(
        **tokenized_fb_types,
        output_hidden_states=True,
        return_dict=True,
    ).pooler_output
print(fb[0])

In [8]:
## ablation exp for es

def get_dist_cosine(emb1, emb2):
    """Cosine similarity of every row of `emb1` with every row of `emb2`.

    Args:
        emb1: array-like with one embedding per row.
        emb2: array-like with one embedding per row (same width as `emb1`).

    Returns:
        A NumPy array with one row per `emb1` row and one column per `emb2`
        row; NaN entries are replaced by 0.
    """
    left = np.array(emb1)
    right = np.array(emb2)
    dots = left.dot(right.T)
    # (n, 1) column of left norms broadcast against the flat right norms.
    left_norms = np.reshape(np.linalg.norm(left, axis=1), (left.shape[0], 1))
    norm_grid = left_norms * np.linalg.norm(right, axis=1)
    cos = dots / norm_grid
    return np.where(np.isnan(cos), 0, cos)

def es_cal(model, tar_list, comp_embeddings):
    """Embed `tar_list` and compare the embeddings with `comp_embeddings`.

    Note: the notebook-level `tokenizer` is read as a global.

    Args:
        model: callable invoked with the tokenizer output; its result must
            expose `pooler_output`.
        tar_list: list of strings to embed.
        comp_embeddings: embeddings to compare against (one per row).

    Returns:
        Whatever `get_dist_cosine` returns for the two embedding sets.
    """
    encoded = tokenizer(
        tar_list,
        padding=True,
        truncation=True,
        return_tensors='pt',
    )

    with torch.no_grad():
        type_embeddings = model(
            **encoded,
            output_hidden_states=True,
            return_dict=True,
        ).pooler_output

    return get_dist_cosine(type_embeddings, comp_embeddings)

# Ablation example: predict from embedding similarity only.
for num, row in df.iterrows():
    # Select the KG-type dict by column label. `row[-1]` depended on the
    # positional fallback of Series.__getitem__, which pandas has deprecated.
    complex_list = sorted(row['type'].items(), key = lambda x:x[1], reverse=True)
    print(complex_list)
    target_list = [i[0].replace('_', ' ') if 'http://dbpedia.org/ontology/' not in i[0] else i[0].split('/')[-1] for i in complex_list]
    count_list = [i[1] for i in complex_list]

    # Weight each KG type's similarity row by its count and sum over types.
    es = np.array(count_list).dot(es_cal(model, target_list, fb_embeddings))
    idx = np.argmax(es)
    print(idx, fb_sorted_list[idx],'### ->', row['labels'])
    break  # only the first row is shown

[('IndyCar_Series_season', 9), ('Champ_Car_season', 5)]
16 company of automotive ### -> ['sports.sports_league_season']


In [9]:
## ablation exp for ls

def _leven_stop_words():
    """Return the set of tokens ignored by get_dist_leven, built once and cached."""
    cache = getattr(_leven_stop_words, '_cache', None)
    if cache is None:
        cache = set(nltk.corpus.stopwords.words('english'))
        cache.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', None])
        _leven_stop_words._cache = cache
    return cache


def get_dist_leven(main_sent, sub_sent):
    """Score how well the content words of `sub_sent` are matched in `main_sent`.

    Both sentences are lower-cased, tokenized with NLTK and filtered against
    English stop words plus punctuation. Every remaining `sub_sent` token gets
    the best `Levenshtein.ratio` against any remaining `main_sent` token
    (0 when `main_sent` has none). One lowest score is discarded per five
    `sub_sent` tokens, and the mean of the remaining scores is returned.

    Args:
        main_sent: sentence whose tokens are searched.
        sub_sent: sentence whose tokens are scored.

    Returns:
        The mean score, or 0.0 when `sub_sent` has no token left after
        filtering (the mean of an empty list would otherwise be NaN, which
        poisons any later argmax).
    """
    stop_words = _leven_stop_words()
    mains = [i for i in nltk.word_tokenize(main_sent.lower()) if i not in stop_words]
    subs = [i for i in nltk.word_tokenize(sub_sent.lower()) if i not in stop_words]

    if not subs:
        return 0.0

    # Best match per sub token; default=0 covers an empty `mains`.
    res_list = [
        max((Levenshtein.ratio(token, text) for text in mains), default=0)
        for token in subs
    ]

    # Tolerate one unmatched token per five: drop the lowest scores.
    for _ in range(len(subs) // 5):
        res_list.pop(int(np.argmin(res_list)))

    return np.mean(res_list)

def ls_cal(tar_list, comp_list):
    """Score every target against every comparison string with `get_dist_leven`.

    Args:
        tar_list: strings passed as the first argument of `get_dist_leven`.
        comp_list: strings passed as the second argument of `get_dist_leven`.

    Returns:
        A NumPy array with one row per entry of `tar_list` and one column per
        entry of `comp_list`.
    """
    return np.array([
        [get_dist_leven(target, comp) for comp in comp_list]
        for target in tar_list
    ])

# Ablation example: predict from Levenshtein similarity only.
for num, row in df.iterrows():
    # Select the KG-type dict by column label. `row[-1]` depended on the
    # positional fallback of Series.__getitem__, which pandas has deprecated.
    complex_list = sorted(row['type'].items(), key = lambda x:x[1], reverse=True)
    target_list = [i[0].replace('_', ' ') if 'http://dbpedia.org/ontology/' not in i[0] else i[0].split('/')[-1] for i in complex_list]
    count_list = [i[1] for i in complex_list]

    # Weight each KG type's similarity row by its count and sum over types.
    ls = np.array(count_list).dot(ls_cal(target_list, fb_sorted_list))
    idx = np.argmax(ls)
    print(idx, fb_sorted_list[idx],'### ->', row['labels'])
    break  # only the first row is shown


229 sports league season of sports ### -> ['sports.sports_league_season']


In [42]:
## here is an example
## refer to next block

# Example: combine the Levenshtein and embedding scores for the first row.
for num, row in df.iterrows():
    # Select the KG-type dict by column label. `row[-1]` depended on the
    # positional fallback of Series.__getitem__, which pandas has deprecated.
    complex_list = sorted(row['type'].items(), key = lambda x:x[1], reverse=True)
    target_list = [i[0].replace('_', ' ') if 'http://dbpedia.org/ontology/' not in i[0] else i[0].split('/')[-1] for i in complex_list]
    count_list = [i[1] for i in complex_list]

    # Count-weighted sums of the per-type similarity rows.
    ls = np.array(count_list).dot(ls_cal(target_list, fb_sorted_list))
    es = np.array(count_list).dot(es_cal(model, target_list, fb_embeddings))
    idx = np.argmax(ls+es)

    print(idx, [fb[idx][0]],'### ->', row['labels'])
    break  # only the first row is shown

229 ['sports.sports_league_season'] ### -> ['sports.sports_league_season']


In [11]:
def Z_Score_Normalization(x):
    """Standardize `x` to zero mean and unit standard deviation over all elements.

    Args:
        x: array-like of numbers; mean and standard deviation are taken over
            the whole array, not per axis.

    Returns:
        `(x - mean) / std`, or an all-zero array of the same shape when the
        standard deviation is 0 (all elements equal), which would otherwise
        produce NaN from a division by zero.
    """
    values = np.asarray(x)
    centered = values - np.mean(values)
    spread = np.std(values)
    if spread == 0:
        return np.zeros_like(centered)
    return centered / spread

def softmax(f):
    """Numerically stable softmax over all elements of `f`.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow. The subtraction is done on a copy: the caller's array is left
    unmodified.

    Args:
        f: array-like of scores.

    Returns:
        An array of the same shape whose elements sum to 1.
    """
    shifted = np.asarray(f) - np.max(f)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

In [16]:
ts = 0             # correct predictions so far
count = 0          # rows processed so far
alpha = 0.3        # weight of the Levenshtein (text) score
beta = 1-alpha     # weight of the embedding score
gamma = 0.1        # weight of the header-based score when KG types are present

print("table_idx".ljust(10),"dict_len".ljust(10), 
      "pred_idx".ljust(10), "curr_acc".ljust(10), "result")

for num, row in df.iterrows():

    count+=1

    # Select the KG-type dict by column label. `row[-1]` depended on the
    # positional fallback of Series.__getitem__, which pandas has deprecated.
    complex_list = sorted(row['type'].items(), key = lambda x:x[1], reverse=True)
    target_list = [i[0].replace('_', ' ') if 'http://dbpedia.org/ontology/' not in i[0] else i[0].split('/')[-1] for i in complex_list]
    count_list = [i[1] for i in complex_list]

    # Header-based score: used alone when the row has no KG types, otherwise
    # mixed into the KG-based score with weight gamma.
    hd_ls = ls_cal([row['header']], fb_sorted_list)
    hd_es = es_cal(model, [row['header']], fb_embeddings)
    header_score = softmax(hd_ls)*alpha + softmax(hd_es)*beta

    if target_list:
        # Count-weighted sums of the z-scored per-type similarity rows.
        ls = np.array(count_list).dot(Z_Score_Normalization(ls_cal(target_list, fb_sorted_list)))
        es = np.array(count_list).dot(Z_Score_Normalization(es_cal(model, target_list, fb_embeddings)))
        idx = np.argmax(softmax(ls)*alpha + softmax(es)*beta + header_score*gamma)
    else:
        idx = np.argmax(header_score)

    # A prediction counts as correct when it is one of the row's labels.
    if fb[idx][0] in row['labels']:
        ts+=1

    print(("%d" % num).ljust(10), 
          ("%d" % len(count_list)).ljust(10), 
          ("%d" % idx).ljust(10),          
          ("%.3f" % (ts/count)).ljust(10),
          [fb[idx][0]],' ->', row['labels'])

print(ts/count)

table_idx  dict_len   pred_idx   curr_acc   result
0          2          16         0.000      ['automotive.company']  -> ['sports.sports_league_season']
1          196        218        0.000      ['sports.pro_athlete']  -> ['people.person']
2          18         231        0.000      ['sports.sports_team']  -> ['organization.organization']
3          27         16         0.000      ['automotive.company']  -> ['business.business_operation']
4          78         16         0.200      ['automotive.company']  -> ['automotive.company']
5          79         208        0.333      ['soccer.football_player']  -> ['soccer.football_player']
6          51         231        0.286      ['sports.sports_team']  -> ['soccer.football_team']
7          23         227        0.375      ['sports.sports_league']  -> ['sports.sports_league']
8          3          143        0.333      ['location.in_state']  -> ['government.legislative_session']
9          31         143        0.300      ['location.in_

: 

: 