# Data

In [1]:
import zipfile
import json
import pandas as pd
import gensim
import tqdm

In [2]:
def _read_zip(zipped_wiki):
    with zipfile.ZipFile(zipped_wiki) as zf:
        for i, fname in enumerate(tqdm.tqdm(zf.namelist())):
            with zf.open(fname) as f:
                yield json.loads(f.read())
                if i>1000: break
                
#next(_read_zip("../data/interim/wik_pages.zip"))

In [3]:
def read_zip(zipped_wiki):
    """Load the records of a zipped wiki dump into a DataFrame.

    Every record produced by ``_read_zip`` becomes one row. Rows whose
    ``content`` has zero length are dropped; a missing (NaN) ``content``
    compares as False and is dropped as well. Surviving rows keep their
    original index labels.

    Parameters
    ----------
    zipped_wiki : str, path-like or file object
        Archive to read; forwarded unchanged to ``_read_zip``.

    Returns
    -------
    pandas.DataFrame
        The rows with non-empty ``content``, or an empty frame when the
        archive produced no records at all.
    """
    pages = pd.DataFrame(_read_zip(zipped_wiki))
    if pages.empty:
        # No records means there is no ``content`` column to filter on;
        # return the empty frame instead of failing with AttributeError.
        return pages
    has_content = pages.content.str.len() > 0
    return pages[has_content]

# Work on the first 100 non-empty pages only. The explicit .copy() makes `df`
# an independent frame, so adding columns to it later is not treated by pandas
# as a write to a slice of the filtered frame (SettingWithCopyWarning).
df = read_zip("../data/interim/wik_pages.zip")[:100].copy()

  0%|          | 858/587377 [00:00<03:23, 2877.50it/s]


In [4]:
# Preview the first rows of the loaded sample (columns: content, title).
df.head()

Unnamed: 0,content,title
1,200px\nA GNU Free Documentation License ( GNU ...,GNU Free Documentation License
2,Figyelem !\nEz a szöveg az eredeti angol válto...,GFDL v1.1
3,"A Debian , illetve a Debian GNU/Linux rendszer...",Debian társadalmi szerződés
4,": A szabad szoftver olyan szoftver , amelyet a...",Debian szabad szoftver irányelvek
5,A Debian elnevezés egy gyűjtőfogalom .\nA több...,Debian


# Methods

In [7]:
import gensim
import spacy
from collections import Counter
# Hungarian language pipeline; `spacy_vectorize` below calls it as `nlp(text)`.
# NOTE(review): `spacy.hu` looks like the legacy spaCy 1.x module layout
# (newer releases expose languages under `spacy.lang`) — confirm the pinned
# spaCy version before upgrading.
nlp = spacy.hu.Hungarian()

In [8]:
def spacy_vectorize(text):
    """Convert raw text into a bag-of-words list of ``(token_key, count)`` pairs.

    The text is run through the module-level ``nlp`` pipeline. Tokens that
    are stop words or not purely alphabetic are skipped; the rest are
    counted by their ``lower`` attribute (integer ids in the recorded
    output). Pairs appear in order of first occurrence.
    """
    counts = Counter()
    for token in nlp(text):
        # Guard clause: drop stop words and anything non-alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        counts[token.lower] += 1
    return list(counts.items())

# Smoke test: per the recorded output, only one of these tokens survives the
# stop-word / alphabetic filter.
spacy_vectorize("egy megy 2 nem de igen")

[(1648, 1)]

In [9]:
from collections import Counter

# Bag-of-words representation of every page, stored next to its text.
df["repr"] = list(map(spacy_vectorize, df.content.values))

In [10]:
# Check that the new `repr` column holds lists of (id, count) pairs.
df.head()

Unnamed: 0,content,title,repr
1,200px\nA GNU Free Documentation License ( GNU ...,GNU Free Documentation License,"[(1661, 5), (1663, 2), (1667, 1), (1671, 1), (..."
2,Figyelem !\nEz a szöveg az eredeti angol válto...,GFDL v1.1,"[(2264, 1), (1972, 3), (1788, 11), (2269, 4), ..."
3,"A Debian , illetve a Debian GNU/Linux rendszer...",Debian társadalmi szerződés,"[(2089, 15), (3642, 3), (3644, 1), (3646, 1), ..."
4,": A szabad szoftver olyan szoftver , amelyet a...",Debian szabad szoftver irányelvek,"[(1683, 5), (2112, 4), (3849, 1), (2467, 2), (..."
5,A Debian elnevezés egy gyűjtőfogalom .\nA több...,Debian,"[(2089, 39), (3987, 3), (3988, 1), (3990, 1), ..."


In [11]:
# Build the TF-IDF model from the bag-of-words vectors of all pages.
model = gensim.models.TfidfModel(corpus=df["repr"].values)

In [12]:
# Re-weight the same bag-of-words vectors with the model built above.
tfidf_corpus = model[df["repr"].values]

In [14]:
# Inspect the first document's weights; the output is (id, weight) pairs.
tfidf_corpus[0]

[(1661, 0.15027756783142354),
 (1663, 0.05068014401948027),
 (1667, 0.0462026450834377),
 (1671, 0.035180513086649184),
 (1674, 0.03924845405887184),
 (1678, 0.5980687224730362),
 (1681, 0.013908382049131713),
 (1683, 0.10811911507222444),
 (1685, 0.06458852606861197),
 (1687, 0.032294263034305985),
 (1689, 0.0462026450834377),
 (1691, 0.019033381569496185),
 (1693, 0.1520404320584408),
 (1698, 0.02667976268169393),
 (1701, 0.025340072009740134),
 (1702, 0.04429019354202175),
 (1704, 0.0462026450834377),
 (1706, 0.025542761265124437),
 (1709, 0.12917705213722394),
 (1710, 0.03924845405887184),
 (1712, 0.02046908040859391),
 (1713, 0.0462026450834377),
 (1715, 0.0462026450834377),
 (1717, 0.0462026450834377),
 (1719, 0.0462026450834377),
 (1724, 0.035180513086649184),
 (1726, 0.02310132254171885),
 (1730, 0.015190905746445018),
 (1732, 0.02415838108986066),
 (1734, 0.0462026450834377),
 (1736, 0.0462026450834377),
 (1738, 0.030055513566284706),
 (1740, 0.02415838108986066),
 (1741, 0.02