In [1]:
import pandas as pd
import numpy as np
from IPython.display import display

In [2]:
item_df = pd.read_csv("kensho-derived-wikimedia-data/items_filtered.csv")
item_aliases_df = pd.read_csv("kensho-derived-wikimedia-data/item_aliases_filtered.csv")
page_df = pd.read_csv("kensho-derived-wikimedia-data/page.csv")

In [3]:
item_df = item_df.dropna()
item_aliases_df = item_aliases_df.dropna()

In [4]:
page_df = page_df.dropna()
page_df

Unnamed: 0,page_id,item_id,title,views
0,12,6199,Anarchism,31335
1,25,38404,Autism,49693
2,39,101038,Albedo,14573
3,290,9659,A,25859
4,303,173,Alabama,52765
...,...,...,...,...
5362169,62470350,76894635,Daming Zhu,16
5362170,62470423,76894633,Tony Dews,7
5362171,62470432,76896959,Samsung PL20,9
5362172,62470465,6034153,Nils-Fredrik Palmstierna,8


In [5]:
from collections import Counter
import json
import os
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

  import pandas.util.testing as tm


In [6]:
MIN_VIEWS = 5
MIN_ANCHOR_TARGET_COUNT = 2
NUM_KLAT_LINES = 5_343_564
NUM_PAGE_LINES = 5_362_174
kdwd_path = os.path.join("/kaggle/input", "kensho-derived-wikimedia-data")

def text_normalizer(text):                              
    """Return text after stripping external whitespace and lower casing."""   
    return text.strip().lower()

In [7]:
class KdwdLinkAnnotatedText:
    def __init__(self, file_path):
        self.file_path = file_path
    def __iter__(self):
        with open(self.file_path) as fp:
            for line in fp:
                yield json.loads(line)

In [8]:
file_path = os.path.join("kensho-derived-wikimedia-data", "link_annotated_text.jsonl")
klat = KdwdLinkAnnotatedText(file_path)

In [9]:
anchor_target_counts = Counter()
for page in tqdm(
    klat, 
    total=NUM_KLAT_LINES, 
    desc='calculating anchor-target counts'
):
    for section in page['sections']:
        spans = [
            (offset, offset + length) for offset, length in 
            zip(section['link_offsets'], section['link_lengths'])]
        anchor_texts = [section['text'][ii:ff] for ii,ff in spans]
        keys = [
            (anchor_text, target_page_id) for anchor_text, target_page_id in 
            zip(anchor_texts, section['target_page_ids'])]
        anchor_target_counts.update(keys)

calculating anchor-target counts: 5343565it [08:39, 10279.72it/s]                             


In [10]:
at_count_df = pd.DataFrame([
    (row[0][0], row[0][1], row[1]) for row in anchor_target_counts.most_common()],
    columns=['anchor_text', 'target_page_id', 'anchor_target_count'])

In [11]:
at_count_df

Unnamed: 0,anchor_text,target_page_id,anchor_target_count
0,United States,3434750,144375
1,World War II,32927,125229
2,France,5843419,106629
3,India,14533,105206
4,footballer,10568,91351
...,...,...,...
12128168,visits women monthly,88003,1
12128169,the peggies,54586221,1
12128170,Dani Gómez,61551460,1
12128171,Svea Air Corps,14549063,1


In [12]:
at_count_df["normalized_anchor_text"] = at_count_df["anchor_text"].apply(text_normalizer)
at_count_df = at_count_df.loc[at_count_df['normalized_anchor_text'].str.len() > 0, :]

In [13]:
at_count_df = (                                               
    at_count_df.                                              
    groupby(["normalized_anchor_text", "target_page_id"])["anchor_target_count"].   
    sum().                                                               
    to_frame("anchor_target_count").
    sort_values('anchor_target_count', ascending=False).
    reset_index()                                                        
)

In [14]:
at_count_df

Unnamed: 0,normalized_anchor_text,target_page_id,anchor_target_count
0,united states,3434750,144382
1,world war ii,32927,125271
2,france,5843419,106632
3,india,14533,105260
4,footballer,10568,91421
...,...,...,...
11405835,"brightmoor, detroit",2994892,1
11405836,markus curry,12978440,1
11405837,markus croonen,31352358,1
11405838,markus covert,36492945,1


In [15]:
at_count_df = at_count_df.rename(columns={"target_page_id": "page_id"})
at_count_df = at_count_df.drop(columns = ['normalized_anchor_text'])

In [16]:
merged_page_df = pd.merge(left=page_df, right=at_count_df, left_on='page_id', right_on='page_id')
merged_page_df = merged_page_df.dropna()
merged_page_df

Unnamed: 0,page_id,item_id,title,views,anchor_target_count
0,12,6199,Anarchism,31335,2096
1,12,6199,Anarchism,31335,880
2,12,6199,Anarchism,31335,422
3,12,6199,Anarchism,31335,34
4,12,6199,Anarchism,31335,25
...,...,...,...,...,...
11405829,62470164,76891691,Little Miss P,31,1
11405830,62470255,76894639,Alfredo Gatica,10,1
11405831,62470423,76894633,Tony Dews,7,2
11405832,62470465,6034153,Nils-Fredrik Palmstierna,8,3


In [17]:
link_counts = list(merged_page_df.groupby(['page_id']).sum()['anchor_target_count'])

In [18]:
item_id = list(merged_page_df.groupby(['page_id']).max()['item_id'])

In [19]:
selected_page_df = pd.DataFrame({'item_id': item_id, 'anchor_target_count': link_counts})

In [20]:
selected_page_df = pd.merge(left=page_df, right=selected_page_df, left_on='item_id', right_on='item_id')
selected_page_df

Unnamed: 0,page_id,item_id,title,views,anchor_target_count
0,12,6199,Anarchism,31335,3540
1,25,38404,Autism,49693,2114
2,39,101038,Albedo,14573,2825
3,290,9659,A,25859,175
4,303,173,Alabama,52765,11125
...,...,...,...,...,...
4517096,62470164,76891691,Little Miss P,31,4
4517097,62470255,76894639,Alfredo Gatica,10,1
4517098,62470423,76894633,Tony Dews,7,2
4517099,62470465,6034153,Nils-Fredrik Palmstierna,8,3


### Test for Baseline Model

In [21]:
combined_entity_df = pd.read_csv("test_data/combined_entity.csv")

In [22]:
combined_entity_df.head()

Unnamed: 0,entity,page_id,text_id
0,anti-authoritarian,867979,0
1,political,23040,0
2,social philosophy,586276,0
3,hierarchies,13998,0
4,self-managed,40949353,0


In [28]:
# random sampling from test data

sampled_entities = combined_entity_df.sample(n = 20000, random_state=1)
sampled_entities.head()

Unnamed: 0,entity,page_id,text_id
7370448,1978 European Athletics Championships,1817534,816614
20507120,cricket,25675557,2731028
31577455,Butler County,94685,4632894
15312615,Poland,22936,1921098
25669126,National University of Sciences and Technology...,989013,3585620


In [29]:
data_array = item_df.to_numpy()
data_alias_array = item_aliases_df.to_numpy()
page_array = selected_page_df.to_numpy()

In [30]:
def get_item_target(name):
    data_array_indices = np.where(data_array[:,1]==name)[0]
    item_ids = data_array[:,0][list(data_array_indices)]
    link_counts = []
    for item_id in item_ids:
        page_array_indices = np.where(page_array[:,1]==item_id)[0]
        link_array = page_array[list(page_array_indices)]
        if len(link_array) == 0:
            return None
        link_counts.append(link_array[0])
    link_counts = np.array(link_counts)
    link_counts = np.asarray(link_counts)
    num_link_counts = list(link_counts[:,4])
    
    if len(num_link_counts) != 0:
        max_link_count = max(num_link_counts)
        max_count_idx = num_link_counts.index(max_link_count)
        target = link_counts[max_count_idx][0]
        return target


def get_alias_target(name):
    data_array_indices = np.where(data_alias_array[:,1]==name)[0]
    item_ids = data_alias_array[:,0][list(data_array_indices)]
  
    link_counts = []
    for item_id in item_ids:
        page_array_indices = np.where(page_array[:,1]==item_id)[0]
        link_array = page_array[list(page_array_indices)]
        if len(link_array) == 0:
            return None
        link_counts.append(link_array[0])
   
    link_counts = np.array(link_counts)
    link_counts = np.asarray(link_counts)
    num_link_counts = list(link_counts[:,4])

    if len(num_link_counts) != 0:
        max_link_count = max(num_link_counts)
        max_count_idx = num_link_counts.index(max_link_count)
        target = link_counts[max_count_idx][0]
        return target

In [31]:
total = 0
correct = 0

test_array = sampled_entities[['entity','page_id']].to_numpy()

item_names = list(item_df['en_label'])
alias_names = list(item_aliases_df['en_alias'])

In [32]:
for i in range(len(test_array)):
    if i%1000 == 0:
        print(i)
    name = test_array[i,0]
    if name in item_names:
        target = get_item_target(name)
        if target == test_array[i,1]:
            correct += 1
        
    elif name in alias_names:
        target = get_alias_target(name)
        if target == test_array[i,1]:
            correct += 1
    total += 1

accuracy = correct/total
print("The accuracy rate for the baseline model is", accuracy)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
The accuracy rate for the baseline model is 0.59135
