In [32]:
import pandas as pd
import numpy as np
import os
import string
from nltk.corpus import stopwords
import nltk
import huspacy
import re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import gensim
import statsmodels.api as sm

## 3.1. Data Preprocessing

Reading in the files as list of strings with each element of the list being a document

In [2]:
# old documents (before the 16th century)
old_files = [f for f in os.listdir('../data/old') if f.endswith('.txt')]
old_documents = []
for filename in old_files:
    with open(os.path.join('../data/old',filename),'r',encoding='utf-8') as f:
        old_documents.append(f.read())

# new documents (from 20th century)
new_files = [f for f in os.listdir('../data/new') if f.endswith('.txt')]
new_documents = []
for filename in new_files:
    with open(os.path.join('../data/new',filename),'r',encoding='utf-8') as f:
        new_documents.append(f.read())

In [3]:
# inspecting the results of reading in the files
print("An example document in old Hungarian:")
print(old_documents[8])
print("------------------------------------------")
print("An example document from the 20th century:")
print(new_documents[10])

An example document in old Hungarian:
Krisztus feltámada , mint ön nagy kínjából asszony , mi is örülünk .
Krisztus legyen reményünk .
Kyrie eleison 
------------------------------------------
An example document from the 20th century:
Nincsen apám, se anyám,
se istenem, se hazám,
se bölcsőm, se szemfedőm,
se csókom, se szeretőm.

Harmadnapja nem eszek,
se sokat, se keveset.
Húsz esztendőm hatalom,
húsz esztendőm eladom.

Hogyha nem kell senkinek,
hát az ördög veszi meg.
Tiszta szívvel betörök,
ha kell, embert is ölök.

Elfognak és felkötnek,
áldott földdel elfödnek
s halált hozó fű terem
gyönyörűszép szívemen.


In [None]:
# only needs to be run once, download time: 2m 48.4s
nltk.download("stopwords")
huspacy.download()

[nltk_data] Downloading package stopwords to C:\Users\Kim
[nltk_data]     Levente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['c:\\ProgramData\\anaconda3\\python.exe', '-m', 'pip', 'install', 'hu_core_news_lg @ https://huggingface.co/huspacy/hu_core_news_lg/resolve/v3.8.0/hu_core_news_lg-any-py3-none-any.whl']


In [5]:
def TextPreprocessor(text, nlp):
    """ Preprocesses documents

    Parameters
    ----------
    text string, document to be cleaned
    nlp Spacy Language, pipeline for tokenization
    
    Returns
    -------
    text_clean string, cleaned document
    """
    # make text lowercase, replace linebreak with space
    text_low = text.lower()
    text_low = text_low.replace("\n"," ")

    # some texts contain indicators of which line the text is at
    # in curly brackets
    text_low = re.sub(r'\{[^}]*\}', '', text_low)

    # tokenize the document
    tokens = [token.text for token in nlp(text_low)]
    
    # exclude stopwords and punctuations
    stop_words = set(stopwords.words("Hungarian"))
    tokens_clean = [word for word in tokens if word not in stop_words and word not in string.punctuation]

    # cleaned tokens are joined by space
    text_clean = " ".join(tokens_clean)

    return text_clean

In [None]:
# clean texts, runtime: < 20s
nlp_hun = huspacy.load()
old_docs_clean = [TextPreprocessor(doc,nlp_hun) for doc in old_documents]
new_docs_clean = [TextPreprocessor(doc,nlp_hun) for doc in new_documents]

Inspecting the result of preprocessing an old text and a new one

In [7]:
print(old_documents[4])
print("--------------")
print(old_docs_clean[4])

ez vég pusztaságról megemlékeznétek , el ne vesznétek , az régi jó nevet megelevenítenétek , kereszténységnek jó vértei  lennétek .
szegény Mátyás király vala békességben , mert országa vala egyességben , vitézek valának nála tisztességben , az urak valának nagy egyenességben .
ti Úristen ellen ne háborganátok , régi dekrétumot csak megtartanátok , az dézsmát igazán kiszolgáltatnátok , koronként Istennek vele áldoznátok .
Isten , Szűz Mária háborútól védjen , Hatvanban gyűlétek , hogy jó végre legyen , jó Lajos királyunk diadalmat vegyen , minden tanácsotokban ő jó véget tegyen .
Pesti Beke Ferenc szíve kétségében , ki az vendég népnek bízik erejében , török császárt töri hízelkedésében , minden ennek ő elvetett beszédében .
Geszti László diák szerzé ez éneket , Magyarország vala nagy fő szükségében , az végek valának mind elveszendőben , ezerötszázhuszonöt esztendőben .
--------------
vég pusztaságról megemlékeznétek vesznétek régi nevet megelevenítenétek kereszténységnek vértei   len

In [8]:
print(new_documents[4])
print("--------------")
print(new_docs_clean[4])

Haragszom én arra szóra,
Ki a papot úgy megszólja,
Mert a papnak nincs bundája,
Hideg a reverendája.
Heje-huja, hopp!
Haragszom én arra szóra,
Ki a mestert úgy megszólja;
A mesternek nincs kalapja,
Sapkában jár az utcára.
Heje-huja, hopp!
Heje-huja, szűröm ujja!
A cigány a nótám fújja.
Gyere, rózsám, táncoljunk hát,
Járjuk el a magyar nótát!
Heje-huja, hopp!
--------------
haragszom szóra papot megszólja papnak bundája hideg reverendája heje-huja hopp haragszom szóra mestert megszólja mesternek kalapja sapkában jár utcára heje-huja hopp heje-huja szűröm ujja cigány nótám fújja gyere rózsám táncoljunk hát járjuk magyar nótát heje-huja hopp


In [9]:
# get example stop words in Hungarian
stopwords.words("Hungarian")[0:20]

['a',
 'ahogy',
 'ahol',
 'aki',
 'akik',
 'akkor',
 'alatt',
 'által',
 'általában',
 'amely',
 'amelyek',
 'amelyekben',
 'amelyeket',
 'amelyet',
 'amelynek',
 'ami',
 'amit',
 'amolyan',
 'amíg',
 'amikor']

In [10]:
# join the lists of cleaned docs for further analysis
docs_clean = old_docs_clean + new_docs_clean

## 3.2. Embedding Models
### 1. Count Vectorization

In [None]:
vectorizer1 = CountVectorizer(min_df=3)
X1_sparse = vectorizer1.fit_transform(docs_clean)

# for further analysis, the result is converted to np array
X1 = X1_sparse.toarray()

In [None]:
# check results of count vectorization
res1 = pd.DataFrame(X1,
             columns=vectorizer1.get_feature_names_out())
res1.head()

Unnamed: 0,adatott,anya,asszony,avagy,bús,császár,egyet,egyszer,előtt,ember,...,énnekem,értem,ím,óriás,ön,ördög,úr,út,ők,őt
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,6,2
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2.TF-IDF Vectorization

In [29]:
vectorizer2 = TfidfVectorizer(min_df=3)
X2_sparse = vectorizer2.fit_transform(docs_clean)

# for further analysis, the result is converted to np array
X2 = X2_sparse.toarray()

In [30]:
# check results
res2 = pd.DataFrame(X2,
             columns=vectorizer2.get_feature_names_out())
res2.head()

Unnamed: 0,adatott,anya,asszony,avagy,bús,császár,egyet,egyszer,előtt,ember,...,énnekem,értem,ím,óriás,ön,ördög,úr,út,ők,őt
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098056,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.251926,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065377,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.39226,0.116993
3,0.0,0.0,0.0,0.0,0.0,0.147517,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3.Word2Vec

Tokenize documents

In [34]:
tokens = []
for doc in docs_clean:
    tokens_doc = [token.text for token in nlp_hun(doc)]
    tokens.append(tokens_doc)

In [36]:
model3 = gensim.models.Word2Vec(tokens, min_count=3,
                                vector_size=100, window=5)

To obtain a vector for each document, the average vector is taken for each document

In [None]:
def doc_to_vec(doc, model, nlp):
    """ Create vectorization of document

    Parameters
    ----------
    doc string, document
    model gensim model, trained Gensim embedding model
    nlp Spacy Language, pipeline for tokenization
    """
    doc_tokenized = [token.text for token in nlp(doc)]
    vectors_doc = [model.wv[word] for word in doc_tokenized if word in model.wv]
    if vectors_doc:
        return np.mean(vectors_doc,axis=0)
    else:
        return np.zeros(model.vector_size)

In [40]:
X3 = np.array([doc_to_vec(doc, model3, nlp_hun) for doc in docs_clean])

In [49]:
# check results
res3 = pd.DataFrame(X3)
res3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.002244,0.001623,-0.000537,0.001316,0.003629,-0.005622,0.003781,0.00798,-0.006457,-0.002318,...,0.003732,0.002547,0.00039,0.001084,0.006025,0.004052,0.00603,-0.007116,0.000481,-0.002375
1,-0.003693,-0.000373,0.002138,0.004965,0.002197,-0.003049,0.001486,0.007668,-0.001398,-0.003745,...,0.00397,0.003533,-0.000111,0.003653,0.002671,0.006953,0.000796,-0.003545,-0.001128,-0.001213
2,-0.000736,0.002211,0.00062,-0.001203,0.000775,-0.004858,0.003722,0.006906,-0.003368,-0.002369,...,0.005016,0.00182,-0.001016,-0.000841,0.004749,0.003226,0.005285,-0.002883,0.00084,-0.00253
3,-0.003605,0.001556,0.000164,-0.00011,0.002121,-0.007405,0.001435,0.008053,-0.001886,-0.004451,...,0.00459,0.002568,0.000543,-0.001041,0.004756,0.005816,0.004995,-0.003899,-0.00273,-0.000988
4,-0.001873,0.005404,0.001668,0.000284,0.002191,-0.00644,0.004461,0.009583,-0.004795,-0.002811,...,0.004487,0.003881,-0.000142,-0.001175,0.006528,0.005194,0.004814,-0.003809,0.002161,-0.000324


## 3.3. Description of Classification Models and Results
### Logistic Regression

In [None]:
# target is defined as 1 if old and 0 otherwise

In [None]:
for X in [X1,X2,X3]:
    

### KNN (K-nearest neighbours algorithm)