In [1]:
import pandas as pd
import numpy as np
import json

Download link_annotated_text.jsonl  
https://www.kaggle.com/kenshoresearch/kensho-derived-wikimedia-data  

Run the following unless you have tons of memory:  
$ split -l 1000000 link_annotated_text.jsonl

Rename the files to  
0, 1, 2, 3,...

In [2]:
file_number = 5 #repeat this notebook for files [0, 1, 2, 3, 4, 5]

with open(f'{file_number}') as f:
    df = pd.DataFrame(json.loads(line) for line in f)
df

Unnamed: 0,page_id,sections
0,56933780,"[{'name': 'Introduction', 'text': 'Sophia Rose..."
1,56933781,"[{'name': 'Introduction', 'text': 'Emyr Evans ..."
2,56933797,"[{'name': 'Background', 'text': 'Joseph Valéry..."
3,56933814,"[{'name': 'Introduction', 'text': 'East Buffal..."
4,56933816,"[{'name': 'Introduction', 'text': 'Milorad Mij..."
...,...,...
343560,62470350,"[{'name': 'Introduction', 'text': 'Daming Zhu ..."
343561,62470423,"[{'name': 'Introduction', 'text': 'Tony Oshey ..."
343562,62470432,"[{'name': 'Introduction', 'text': '(EC-PL20ZZB..."
343563,62470465,"[{'name': 'Introduction', 'text': 'Major Gener..."


In [3]:
#exports the text column with the assigned text_id

text = [col[0]['text'] for col in df['sections']]
index = df.index + file_number*1000000
df_text = pd.DataFrame(zip(index, text), columns = ['text_id', 'text']) 
df_text.to_csv(f"{file_number}_text.csv", index=False)

In [4]:
#generates the dictionary of entities for each text entry

def get_entities(loc, length, target, text):
    entities = {}
    for i, j, k in zip(loc, length, target):
        entities[text[i:i+j]] = k
    return entities

df['entities'] = [get_entities(col[0]['link_offsets'], col[0]['link_lengths'], col[0]['target_page_ids'], col[0]['text']) for col in df['sections']]
df['text_id'] = df_text['text_id']
df

Unnamed: 0,page_id,sections,entities,text_id
0,56933780,"[{'name': 'Introduction', 'text': 'Sophia Rose...","{'historian': 13575, 'the Enlightenment': 3075...",5000000
1,56933781,"[{'name': 'Introduction', 'text': 'Emyr Evans ...",{'2018 PSA World Tour': 56144539},5000001
2,56933797,"[{'name': 'Background', 'text': 'Joseph Valéry...","{'Bastia': 45433, 'Cap Corse': 7803274, 'Livor...",5000002
3,56933814,"[{'name': 'Introduction', 'text': 'East Buffal...","{'Mosquito Range': 1787725, 'Colorado': 5399, ...",5000003
4,56933816,"[{'name': 'Introduction', 'text': 'Milorad Mij...","{'National Assembly of Serbia': 619753, 'Socia...",5000004
...,...,...,...,...
343560,62470350,"[{'name': 'Introduction', 'text': 'Daming Zhu ...",{'Peking University School of Transnational La...,5343560
343561,62470423,"[{'name': 'Introduction', 'text': 'Tony Oshey ...","{'American': 3434750, 'football': 18951490, 'T...",5343561
343562,62470432,"[{'name': 'Introduction', 'text': '(EC-PL20ZZB...","{'digital': 52797, 'compact camera': 1140781, ...",5343562
343563,62470465,"[{'name': 'Introduction', 'text': 'Major Gener...","{'Major General': 1185281, 'Swedish Air Force'...",5343563


In [5]:
#exports each entity pairing along with the text_id

iterations = 10
rows = len(df)//10


for i in np.arange(10):
    start = i*rows
    end = (i+1)*rows
    if end > len(df):
        end = len(df)
    
    print(f'processing indexes {start}:{end}')
    df_entities = [np.array([list(col[0].keys()), list(col[0].values()), np.repeat(col[1], len(col[0]))]) for col in zip(df.iloc[start:end]['entities'], df.iloc[start:end]['text_id'])]
    df_entities = np.concatenate(df_entities, axis=1)
    df_entities = pd.DataFrame(zip(df_entities[0], df_entities[1], df_entities[2]), columns = ['entity', 'page_id', 'text_id'])
    df_entities
    if i == 0:
        df_entities.to_csv(f"{file_number}_entities.csv", index=False)
    else:
        df_entities.to_csv(f'{file_number}_entities.csv', mode='a', header=False, index=False)

print(f'export complete')

processing indexes 0:34356
processing indexes 34356:68712
processing indexes 68712:103068
processing indexes 103068:137424
processing indexes 137424:171780
processing indexes 171780:206136
processing indexes 206136:240492
processing indexes 240492:274848
processing indexes 274848:309204
processing indexes 309204:343560
export complete


In [7]:
#run this after you've generated all the preceeding files

text_filenames = [f'{i}_text.csv' for i in np.arange(6)]
entities_filenames = [f'{i}_entities.csv' for i in np.arange(6)]

combined_csv = pd.concat([pd.read_csv(f) for f in text_filenames])
combined_csv.to_csv( "combined_text.csv", index=False)
display(combined_csv)

combined_csv = pd.concat([pd.read_csv(f) for f in entities_filenames])
combined_csv = combined_csv.dropna()
combined_csv.to_csv( "combined_entities.csv", index=False)
display(combined_csv)

Unnamed: 0,text_id,text
0,0,Anarchism is an anti-authoritarian political a...
1,1,Autism is a developmental disorder characteriz...
2,2,"Albedo () (, meaning 'whiteness') is the measu..."
3,3,A or a is the first letter and the first vowel...
4,4,Alabama () is a state in the southeastern regi...
...,...,...
343560,5343560,Daming Zhu is an Assistant Dean for Continuing...
343561,5343561,"Tony Oshey Dews (born June 6, 1973) is an Amer..."
343562,5343562,(EC-PL20ZZBPRUS) is an sleek design digital co...
343563,5343563,Major General Nils-Fredrik Palmstierna (8 Marc...


Unnamed: 0,entity,page_id,text_id
0,anti-authoritarian,867979,0
1,political,23040,0
2,social philosophy,586276,0
3,hierarchies,13998,0
4,self-managed,40949353,0
...,...,...,...
1668440,Tomasa Tequiero,39519608,5343559
1668441,Sos mi hombre,39950100,5343559
1668442,Luis Gatica,2099374,5343559
1668443,Lucho Gatica,2112544,5343559
