In [1]:
import pandas as pd
import numpy as np
import pickle
import time

## Aggregate item labels from the entity text, item.csv, and item_aliases.csv

In [2]:
%%time
item_df = pd.read_csv('KDWD/item.csv', usecols=['item_id', 'en_label'])
item_df

CPU times: user 46 s, sys: 2.84 s, total: 48.9 s
Wall time: 48.9 s


Unnamed: 0,item_id,en_label
0,1,Universe
1,2,Earth
2,3,life
3,4,death
4,5,human
...,...,...
51450311,77257472,2dFGRS TGN256Z026
51450312,77257483,2dFGRS TGS171Z171
51450313,77257484,2dFGRS TGS373Z078
51450314,77257491,2dFGRS TGS374Z114


In [3]:
# Load the alias table and rename its alias column to `en_label` so it can be
# stacked with the label table.
# Only an empty alias counts as missing: pandas' default NA markers ("NA",
# "null", "None", "nan", ...) are ordinary strings that can be real aliases.
alias_df = pd.read_csv(
    'KDWD/item_aliases.csv',
    keep_default_na=False,
    na_values={'en_alias': ['']},
).rename(columns={'en_alias': 'en_label'})
alias_df

Unnamed: 0,item_id,en_label
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet
...,...,...
6823019,77256516,"Wood Adams Building, Dunedin"
6823020,77256876,Flame-bearers of Welsh history being the outli...
6823021,77256970,Flame-bearers of Welsh history being the outli...
6823022,77257218,"6 Thorn Street, Caversham"


In [4]:
%%time
entity_text_df = pd.read_feather('data/entity_df.ftr')[['item_id', 'entity']].rename(columns={'entity': 'en_label'})
entity_text_df

CPU times: user 43.6 s, sys: 3.54 s, total: 47.2 s
Wall time: 45.8 s


Unnamed: 0,item_id,en_label
0,1030234,anti-authoritarian
1,179805,political
2,180592,social philosophy
3,188619,hierarchies
4,15981562,self-managed
...,...,...
121835448,16215506,Carl Randall
121835449,1416303,The World Ends With You
121835450,18741083,2016 Summer Olympics closing ceremony
121835451,181278,2020 Summer Olympics


In [5]:
%%time
item_df = pd.concat([item_df, alias_df, entity_text_df], ignore_index=True).astype({'item_id':'int32', 'en_label':str})
item_df

CPU times: user 33.8 s, sys: 1.71 s, total: 35.5 s
Wall time: 35.4 s


Unnamed: 0,item_id,en_label
0,1,Universe
1,2,Earth
2,3,life
3,4,death
4,5,human
...,...,...
180108788,16215506,Carl Randall
180108789,1416303,The World Ends With You
180108790,18741083,2016 Summer Olympics closing ceremony
180108791,181278,2020 Summer Olympics


In [6]:
# Discard, in place, every row that has a missing value in any column.
item_df.dropna(axis='index', how='any', inplace=True)
item_df

Unnamed: 0,item_id,en_label
0,1,Universe
1,2,Earth
2,3,life
3,4,death
4,5,human
...,...,...
180108788,16215506,Carl Randall
180108789,1416303,The World Ends With You
180108790,18741083,2016 Summer Olympics closing ceremony
180108791,181278,2020 Summer Olympics


## Convert item list to a dictionary

In [7]:
%%time
item_df.en_label = item_df.en_label.apply(lambda x: x.strip().casefold())
item_df

CPU times: user 1min 16s, sys: 5.11 s, total: 1min 21s
Wall time: 1min 21s


Unnamed: 0,item_id,en_label
0,1,universe
1,2,earth
2,3,life
3,4,death
4,5,human
...,...,...
180108788,16215506,carl randall
180108789,1416303,the world ends with you
180108790,18741083,2016 summer olympics closing ceremony
180108791,181278,2020 summer olympics


In [8]:
%%time
item_df.sort_values(by=['en_label'], inplace=True)
item_df

CPU times: user 13min 14s, sys: 5.75 s, total: 13min 20s
Wall time: 13min 17s


Unnamed: 0,item_id,en_label
136332295,4409988,
173078121,21032551,
102749144,599404,
94709327,7452285,
171449384,357121,
...,...,...
37753682,55900012,𠃍
37743337,55885207,𥫗
83105793,837751,𨳊
7547702,11273367,𪜈


In [9]:
%%time
# get unique entities and index of those entities
item_np = item_df[['en_label', 'item_id']].to_numpy()
display(item_np)

unique, indices = np.unique(item_np[:,0], return_index=True)
display(len(unique))
display(unique)

indices = np.append(indices, len(item_np))
display(len(indices))
indices[-10:]

array([['', 4409988],
       ['', 21032551],
       ['', 599404],
       ...,
       ['𨳊', 837751],
       ['𪜈', 11273367],
       ['\U0010fc01\U0010fc01', 283399]], dtype=object)

48191955

array(['', '!', '! -attention-', ..., '𨳊', '𪜈', '\U0010fc01\U0010fc01'],
      dtype=object)

48191956

CPU times: user 2min 16s, sys: 6.67 s, total: 2min 23s
Wall time: 2min 22s


array([180108784, 180108785, 180108786, 180108787, 180108788, 180108789,
       180108790, 180108791, 180108792, 180108793])

In [10]:
%%time
item_dict={}
st = time.time()

for i, j in enumerate(unique):
    if i%1000000==0:
        print(f'{i}/{len(unique)}\t{time.time()-st}')
    item_dict[j] = np.unique(item_np[indices[i]:indices[i+1], 1])
    
item_dict['tesla']

0/48191955	6.67572021484375e-06
1000000/48191955	13.975235939025879
2000000/48191955	27.407203197479248
3000000/48191955	42.07367753982544
4000000/48191955	56.234493255615234
5000000/48191955	70.69492506980896
6000000/48191955	85.83571600914001
7000000/48191955	99.85097789764404
8000000/48191955	114.11239194869995
9000000/48191955	128.68406128883362
10000000/48191955	142.67254209518433
11000000/48191955	156.11527514457703
12000000/48191955	170.7303283214569
13000000/48191955	185.2220802307129
14000000/48191955	199.05200219154358
15000000/48191955	213.41579723358154
16000000/48191955	227.67160987854004
17000000/48191955	241.48469400405884
18000000/48191955	255.08781003952026
19000000/48191955	269.4535939693451
20000000/48191955	283.71573424339294
21000000/48191955	297.6540937423706
22000000/48191955	312.3798940181732
23000000/48191955	330.97881269454956
24000000/48191955	346.0731248855591
25000000/48191955	360.53767824172974
26000000/48191955	374.76695561408997
27000000/48191955	388.886

array([9036, 163343, 210893, 478214, 622424, 765530, 780348, 1050485,
       1428953, 1463050, 1548225, 1634161, 2384079, 2406220, 3982823,
       5172712, 7035686, 7705502, 7705506, 7705515, 16258100, 19565583,
       19845823, 23663332, 27701406, 31803712, 37251206, 56084926],
      dtype=object)

In [11]:
# Remove the empty-string key, if there is one. Supplying a default keeps the
# cell safe to re-run and safe on data without empty labels, where a bare
# pop('') would raise KeyError.
item_dict.pop('', None)
display(len(item_dict))

48191954

In [12]:
%%time
item_dict_df = pd.DataFrame(item_dict.items(), columns=['en_label', 'item_ids'])
display(item_dict_df)
item_dict_df.to_feather('item_dict.ftr')

Unnamed: 0,en_label,item_ids
0,!,"[120976, 166764, 1315167, 3714010, 4540205, 66..."
1,! -attention-,[8290256]
2,! that bastard is trying to steal our gold !,[60669584]
3,! that dick trying to steal our gold !,[60669584]
4,!!,"[12366011, 13520655]"
...,...,...
48191949,𠃍,[55900012]
48191950,𥫗,[55885207]
48191951,𨳊,[837751]
48191952,𪜈,[11273367]


CPU times: user 56.1 s, sys: 5.27 s, total: 1min 1s
Wall time: 1min


In [13]:
%%time
pickle.dump(item_dict, open('item_dict.p', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

CPU times: user 5min 44s, sys: 16.6 s, total: 6min
Wall time: 5min 58s
