In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [17]:
# read in data files
# item_df = pd.read_csv("kensho-derived-wikimedia-data/item.csv")
# item_aliases_df = pd.read_csv("kensho-derived-wikimedia-data/item_aliases.csv")
item_df = pd.read_csv("kensho-derived-wikimedia-data/items_filtered.csv")
item_aliases_df = pd.read_csv("kensho-derived-wikimedia-data/item_aliases_filtered.csv")
page_df = pd.read_csv("kensho-derived-wikimedia-data/page.csv")

In [18]:
# sampling data from item_df and item_aliases_df
item_df = item_df.dropna()
item_aliases_df = item_aliases_df.dropna()

In [6]:
sampled_items = item_df.sample(n = 10000, random_state=1)
sampled_item_aliases = item_aliases_df.sample(n = 10000, random_state=1)

In [7]:
sampled_items.head()

Unnamed: 0,item_id,en_label,en_description
41077482,60541383,list of shipwrecks in August 1821,Wikimedia list article
17530739,26321418,Church Hill House,"Harbledown and Rough Common, Canterbury, Kent,..."
48529186,72564519,The disposition of paracetamol and its conjuga...,scientific article published on 01 January 1993
27487712,40668532,Pharmacokinetics of anidulafungin during album...,scientific article published on 31 March 2014
37899990,56125260,hypothetical protein MXF1_RS0110340,microbial gene found in Myxococcus xanthus DZF1


In [8]:
sampled_item_aliases.head()

Unnamed: 0,item_id,en_alias
5872190,56141708,MXF1_RS0131720
6591471,63016624,hypothetical protein
2703149,22299679,upregulation of phospholipid metabolism
5544924,47008716,Børge Gustav Bachmann
2898427,23093805,transposase/IS protein


In [9]:
page_df.head()

Unnamed: 0,page_id,item_id,title,views
0,12,6199,Anarchism,31335
1,25,38404,Autism,49693
2,39,101038,Albedo,14573
3,290,9659,A,25859
4,303,173,Alabama,52765


In [None]:
# going through all the items and item_aliases

def get_item_target(name, df):
    dataframe = df[df['en_label']==name]
        
    final_view_df = pd.DataFrame(columns = df.columns)
    item_ids = list(dataframe['item_id'])
    views = []
    for item_id in item_ids:
        view_df = page_df[page_df['item_id'] == item_id]
        final_view_df = pd.concat([final_view_df, view_df])
    views = list(final_view_df['views'])
    if len(views) != 0:
        max_view = max(views)
        target = list(final_view_df[final_view_df['views'] == max_view]['page_id'])[0]
        return target

def get_alias_target(name, df):
    dataframe = df[df['en_alias']==name]
        
    final_view_df = pd.DataFrame(columns = df.columns)
    item_ids = list(dataframe['item_id'])
    views = []
    for item_id in item_ids:
        view_df = page_df[page_df['item_id'] == item_id]
        final_view_df = pd.concat([final_view_df, view_df])
    views = list(final_view_df['views'])
    if len(views) != 0:
        max_view = max(views)
        target = list(final_view_df[final_view_df['views'] == max_view]['page_id'])[0]
        return target

predicted_dict = {}

item_names = set(list(sampled_items['en_label'])+list(sampled_item_aliases['en_alias']))
for name in item_names:
    if name in set(list(sampled_items['en_label'])):
        target = get_item_target(name, sampled_items)
        predicted_dict.update({name: target})
        
    elif name in set(list(sampled_item_aliases['en_alias'])):
        target = get_alias_target(name, sampled_item_aliases)
        predicted_dict.update({name: target})

In [None]:
predicted_df = pd.DataFrame(predicted_dict.items(), columns=['entity name', 'page id'])
predicted_df

In [71]:
# examples

# sentence1 = "Michael Jordan (born 1957) is an American scientist, professor, and leading researcher in machine learning and artificial intelligence."
sentence = "Christmas Songs is the eighth full-length studio album and first Christmas album from Jars of Clay, that was released on October 16, 2007 through Gray Matters/Nettwerk."

candidates = ["Christmas Songs", "Christmas album", "Jars of Clay"]
pool = list(predicted_dict.keys())
for candidate in candidates:
    if candidate in set(pool):
        print(str(candidate)+":")
        page_id = predicted_dict.get(candidate)
        cand_df = page_df[page_df['page_id'] == page_id]
        display(cand_df)
        item_id = list(cand_df['item_id'])[0]
        cand_item_df = item_df[item_df['item_id'] == item_id]
        display(cand_item_df)


Christmas Songs:


Unnamed: 0,page_id,item_id,title,views
1513814,11909792,5111427,Christmas Songs (Jars of Clay album),167


Unnamed: 0,item_id,en_label,en_description
4075762,5111427,Christmas Songs,2007 studio album by Jars of Clay


### Test for Baseline Model

In [4]:
# readin test data
combined_entity_df = pd.read_csv("test_data/combined_entity.csv")

In [5]:
combined_entity_df.head()

Unnamed: 0,entity,page_id,text_id
0,anti-authoritarian,867979,0
1,political,23040,0
2,social philosophy,586276,0
3,hierarchies,13998,0
4,self-managed,40949353,0


In [6]:
combined_text_df = pd.read_csv("test_data/combined_text.csv")

In [7]:
combined_text_df.head()

Unnamed: 0,text_id,text
0,0,Anarchism is an anti-authoritarian political a...
1,1,Autism is a developmental disorder characteriz...
2,2,"Albedo () (, meaning 'whiteness') is the measu..."
3,3,A or a is the first letter and the first vowel...
4,4,Alabama () is a state in the southeastern regi...


In [8]:
single_entity_df = pd.read_csv("test_data/single_entity.csv")

In [9]:
single_entity_df.head()

Unnamed: 0,entity,page_id,text_id
0,anti-authoritarian,867979,0
1,political,23040,0
2,hierarchies,13998,0
3,self-managed,40949353,0
4,self-governed,191161,0


In [29]:
# random sampling from test data

sampled_entities = combined_entity_df.sample(n = 20000, random_state=1)
sampled_entities.head()

Unnamed: 0,entity,page_id,text_id
7370448,1978 European Athletics Championships,1817534,816614
20507120,cricket,25675557,2731028
31577455,Butler County,94685,4632894
15312615,Poland,22936,1921098
25669126,National University of Sciences and Technology...,989013,3585620


In [21]:
data_array = item_df.to_numpy()
data_alias_array = item_aliases_df.to_numpy()
page_array = page_df.to_numpy()

In [23]:
# test data
# change some dataframe into numpy arrays

def get_item_target(name):
    data_array_indices = np.where(data_array[:,1]==name)[0]
    item_ids = data_array[:,0][list(data_array_indices)]
    views = []
    for item_id in item_ids:
        page_array_indices = np.where(page_array[:,1]==item_id)[0]
        view_array = page_array[list(page_array_indices)]
        views.append(view_array)
    views = np.array(views)[0]
    views = np.asarray(views)
    num_views = list(views[:,3])
    if len(num_views) != 0:
        max_view = max(num_views)
        max_view_idx = num_views.index(max_view)
        target = views[max_view_idx][0]
        return target


def get_alias_target(name):
    data_array_indices = np.where(data_alias_array[:,1]==name)[0]

    item_ids = data_alias_array[:,0][list(data_array_indices)]
    views = []
    for item_id in item_ids:
        page_array_indices = np.where(page_array[:,1]==item_id)[0]
        view_array = page_array[list(page_array_indices)]
        views.append(view_array)

    views = np.array(views)[0]
    views = np.asarray(views)
    num_views = list(views[:,3])
    if len(num_views) != 0:
        max_view = max(num_views)
        max_view_idx = num_views.index(max_view)
        target = views[max_view_idx][0]
        return target


In [30]:
total = 0
correct = 0

test_array = sampled_entities[['entity','page_id']].to_numpy()

item_names = list(item_df['en_label'])
alias_names = list(item_aliases_df['en_alias'])


In [31]:

for i in range(len(test_array)):
    if i%1000 == 0:
        print(i)
    name = test_array[i,0]
    if name in item_names:
        target = get_item_target(name)
        if target == test_array[i,1]:
            correct += 1
        
    elif name in alias_names:
        target = get_alias_target(name)
        if target == test_array[i,1]:
            correct += 1
    total += 1

accuracy = correct/total
print("The accuracy rate for the baseline model is", accuracy)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
The accuracy rate for the baseline model is 0.6183


In [25]:
# example sentence
text_id = 3
sentence = list(combined_text_df[combined_text_df['text_id'] == text_id]['text'])[0]
candidate_entities = list(combined_entity_df[combined_entity_df['text_id'] == text_id]['entity'])
print(sentence)

A or a is the first letter and the first vowel letter of the modern English alphabet and the ISO basic Latin alphabet. Its name in English is (pronounced ), plural . It is similar in shape to the alpha, from which it derives. The uppercase version consists of the two slanting sides of a triangle, crossed in the middle by a horizontal bar. The lowercase version can be written in two forms: the double-storey a and single-storey ɑ. The latter is commonly used in handwriting and fonts based on it, especially fonts intended to be read by children, and is also found in italic type. In the English grammar, "a", and its variant "an", is an .


In [26]:
print(candidate_entities)

['letter', 'vowel letter', 'modern English alphabet', 'ISO basic Latin alphabet', 'alpha', 'italic type']


In [43]:

for candidate in candidate_entities:
    if candidate in item_names:
        target = get_item_target(candidate)
        if target is not None:
            print(str(candidate))
            target_item = list(page_df[page_df['page_id'] == target]['item_id'])[0]
            print("Predicted entity:")
            display(item_df[item_df['item_id'] == target_item])
            print("Actual entity:")
            actual_pre = combined_entity_df[combined_entity_df['text_id'] == text_id]
            actual = list(actual_pre[actual_pre['entity'] == candidate]['page_id'])[0]
            actual_item = list(page_df[page_df['page_id'] == actual]['item_id'])[0]
            display(item_df[item_df['item_id'] == actual_item])
    elif candidate in alias_names:
        target = get_alias_target(candidate)
        if target is not None:
            print(str(candidate))
            target_item = list(page_df[page_df['page_id'] == target]['item_id'])[0]
            print("Predicted entity:")
            display(item_aliases_df[item_aliases_df['item_id'] == target_item])
            print("Actual entity:")
            actual_pre = combined_entity_df[combined_entity_df['text_id'] == text_id]
            actual = list(actual_pre[actual_pre['entity'] == candidate]['page_id'])[0]
            actual_item = list(page_df[page_df['page_id'] == actual]['item_id'])[0]
            display(item_aliases_df[item_aliases_df['item_id'] == actual_item])
        

letter
Predicted entity:


Unnamed: 0,item_id,en_label,en_description
9330,9788,letter,grapheme in an alphabetic system of writing


Actual entity:


Unnamed: 0,item_id,en_label,en_description
9330,9788,letter,grapheme in an alphabetic system of writing


ISO basic Latin alphabet
Predicted entity:


Unnamed: 0,item_id,en_label,en_description
4765992,5974462,ISO basic Latin alphabet,alphabet consisting of 26 letters; identical t...


Actual entity:


Unnamed: 0,item_id,en_label,en_description
4765992,5974462,ISO basic Latin alphabet,alphabet consisting of 26 letters; identical t...


alpha
Predicted entity:


Unnamed: 0,item_id,en_alias
367959,652798,alpha
367960,652798,script A
367961,652798,Latin script A
367962,652798,Latin alpha


Actual entity:


Unnamed: 0,item_id,en_alias
19671,9887,α
19672,9887,Alpha
19673,9887,A


In [45]:
# example sentence

text_id = 9
sentence = list(combined_text_df[combined_text_df['text_id'] == text_id]['text'])[0]
candidate_entities = list(combined_entity_df[combined_entity_df['text_id'] == text_id]['entity'])


print(sentence)

for candidate in candidate_entities:
    if candidate in item_names:
        target = get_item_target(candidate)
        if target is not None:
            print(str(candidate))
            target_item = list(page_df[page_df['page_id'] == target]['item_id'])[0]
            print("Predicted entity:")
            display(item_df[item_df['item_id'] == target_item])
            print("Actual entity:")
            actual_pre = combined_entity_df[combined_entity_df['text_id'] == text_id]
            actual = list(actual_pre[actual_pre['entity'] == candidate]['page_id'])[0]
            actual_item = list(page_df[page_df['page_id'] == actual]['item_id'])[0]
            display(item_df[item_df['item_id'] == actual_item])
    elif candidate in alias_names:
        target = get_alias_target(candidate)
        if target is not None:
            print(str(candidate))
            target_item = list(page_df[page_df['page_id'] == target]['item_id'])[0]
            print("Predicted entity:")
            display(item_aliases_df[item_aliases_df['item_id'] == target_item])
            print("Actual entity:")
            actual_pre = combined_entity_df[combined_entity_df['text_id'] == text_id]
            actual = list(actual_pre[actual_pre['entity'] == candidate]['page_id'])[0]
            actual_item = list(page_df[page_df['page_id'] == actual]['item_id'])[0]
            display(item_aliases_df[item_aliases_df['item_id'] == actual_item])
        

The Academy Award for Best Production Design recognizes achievement for art direction in . The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy of Motion Picture Arts and Sciences (AMPAS) being renamed the Designer's branch. Since 1947, the award is shared with the set decorator(s). It is awarded to the best interior design in a film. The films below are listed with their production year (for example, the 2000 Academy Award for Best Art Direction is given to a film from 1999). In the lists below, the winner of the award for each year is shown first, followed by the other nominees in alphabetical order.
Academy Award
Predicted entity:


Unnamed: 0,item_id,en_label,en_description
3714731,4671334,Academy Award,CBS radio anthology series


Actual entity:


Unnamed: 0,item_id,en_label,en_description
18148,19020,Academy Awards,awards given annually for excellence of cinema...


85th Academy Awards
Predicted entity:


Unnamed: 0,item_id,en_label,en_description
220711,248688,85th Academy Awards,Award ceremony presented by the Academy of Mot...


Actual entity:


Unnamed: 0,item_id,en_label,en_description
220711,248688,85th Academy Awards,Award ceremony presented by the Academy of Mot...


Academy of Motion Picture Arts and Sciences
Predicted entity:


Unnamed: 0,item_id,en_label,en_description
194000,212329,Academy of Motion Picture Arts and Sciences,professional honorary organization


Actual entity:


Unnamed: 0,item_id,en_label,en_description
194000,212329,Academy of Motion Picture Arts and Sciences,professional honorary organization


1947
Predicted entity:


Unnamed: 0,item_id,en_label,en_description
4876,5263,1947,year


Actual entity:


Unnamed: 0,item_id,en_label,en_description
766359,917174,19th Academy Awards,Award ceremony presented by the Academy of Mot...
