In [108]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz#egg=en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 13.9MB/s eta 0:00:01   |█                               | 3.2MB 1.8MB/s eta 0:00:51     |███                             | 9.2MB 8.4MB/s eta 0:00:11     |██████████▎                     | 31.0MB 10.0MB/s eta 0:00:07     |████████████▏                   | 36.6MB 5.3MB/s eta 0:00:12     |████████████▉                   | 38.7MB 5.3MB/s eta 0:00:11     |███████████████▎                | 46.0MB 599kB/s eta 0:01:25     |█████████████████▌              | 52.6MB 2.5MB/s eta 0:00:18     |███████████████████             | 57.4MB 3.4MB/s eta 0:00:12     |████████████████████████        | 72.2MB 4.3MB/s eta 0:00:06     |██████████████████████

In [2]:
import logging
from collections import Counter, defaultdict
import wikipediaapi
from tqdm.notebook import tqdm
import spacy
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# nlp = spacy.load("en_core_web_sm")


In [3]:
nlp = spacy.load("en_core_web_md")

In [4]:
enwiki = wikipediaapi.Wikipedia('en')

In [5]:
def get_verbs(section):
    c = Counter()
    for line in section.splitlines():
        doc = nlp(line)
        c += Counter([t for t in doc if t.pos_=='VERB'])
    return c

In [6]:
def get_entities(section):
    c = Counter()
    for line in section.splitlines():
        doc = nlp(line)
        c += Counter([(t.label_, t.text) for t in doc.ents])
    return c    

In [7]:
def get_from_timeline(timeline, f):
    verbs = Counter()
    for section in timeline.section_by_title('Events').sections:
        verbs += f(section.text)
    return verbs

In [8]:
ordinal = lambda n: "%d%s"%(n,{1:"st",2:"nd",3:"rd"}.get(n if n<20 else n%10,"th"))

In [9]:
def get_from_pages(titles):
    verbs = Counter()
    entities = Counter()
    for title in tqdm(titles):
        logging.info(title)
        timeline = enwiki.page(title)
        try:
            verbs += get_from_timeline(timeline, get_verbs)
            entities += get_from_timeline(timeline, get_entities)
        except AttributeError as e:
            logging.warning(e)
    return verbs, entities

In [10]:
titles = [f"{ordinal(i)}_century" for i in range(20)]

In [12]:
verbs, entities = get_from_pages([str(i) for i in tqdm(range(1000))])

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

  # This is added back by InteractiveShellApp.init_path()







In [23]:
len(entities)

198

In [14]:
df = pd.DataFrame([v.vector for v in verbs], index=[v.text for v in verbs])

In [17]:
df.shape

(126, 300)

# Cluster

In [37]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(df.values)

In [47]:
clusters = [(l, v.lemma_) ]

In [53]:
d = defaultdict(dict)
for l,v,c in zip(kmeans.labels_, verbs.keys(), verbs.values()):
    d[l][v] = c

In [55]:
for k,v in d.items():
    d[k] = Counter(v)

In [57]:
for k in d:
    print(k, d[k].most_common(1))

6 [(arises, 1)]
2 [(starts, 1)]
4 [(established, 1)]
8 [(unified, 1)]
7 [(attacked, 1)]
1 [(defeated, 1)]
5 [(led, 1)]
0 [(writes, 1)]
9 [(invades, 1)]
3 [(dies, 1)]


# Tensorboard

In [28]:
df.to_csv('data/verbs.vecs', index=None, header=None, sep='\t')

In [33]:
with open('data/verbs.names', 'w') as f:
    f.write('\n'.join([v.lemma_ for v in verbs]))

In [95]:
# 

In [97]:
d = defaultdict(dict)
for (label, text), count in c.items():
    d[label][text] = count

In [99]:
for k,v  in d.items():
    d[k] = Counter(v)

In [101]:
for k,v in d.items():
    print(k)
    print(v.most_common(10))

CARDINAL
[('1500', 7), ('1521', 6), ('1600', 6), ('1565', 5), ('three', 4), ('two', 4), ('1093', 4), ('1099', 4), ('1536', 4), ('1541', 4)]
GPE
[('England', 34), ('France', 30), ('China', 29), ('Spain', 28), ('Portugal', 21), ('Russia', 16), ('Hungary', 15), ('Indonesia', 14), ('India', 14), ('the Ottoman Empire', 13)]
LOC
[('North America', 10), ('Europe', 10), ('the Holy Roman Empire', 4), ('Americas', 4), ('Asia', 4), ('the East Indies', 4), ('Africa', 3), ('Earth', 3), ('Mexica', 2), ('Western Europe', 2)]
PERSON
[('Malacca', 7), ('Mongols', 6), ('Java', 6), ('Majapahit', 5), ('Joan', 5), ('Ming Dynasty', 5), ('Ambon', 5), ('Famine', 5), ('Kievan Rus', 4), ('Edward', 4)]
ORG
[('VOC', 6), ('Mataram', 5), ('Java', 4), ('Islam', 4), ('the Roman Catholic Church', 3), ('Church', 3), ('Singhasari', 3), ('Sumatra', 3), ('King of England', 3), ('Russo-Turkish War', 3)]
DATE
[('present-day', 16), ('1556', 8), ('1598', 7), ('1513', 6), ('1547', 6), ('1572', 6), ('1506', 5), ('1560', 5), ('15

In [88]:
d1 = {'c': 2, "d": 3}
d2 = {'c': 4, "d": 35}

TypeError: unsupported operand type(s) for |=: 'dict' and 'dict'