# Evaluation of the Frequency against a Reference Corpus (Whole Corpus)

To evaluate the frequency counts from the previous step, a reference corpus was constructed to compare the values. The reference corpus contains lemmatized and tokenized Latin classical texts from the LASLA corpus (LINK).

For instance, the relative frequency of 'annos' in the reference corpus is 0.02%. By contrast, its relative frequency in the inscriptions is about 2.5% (50,095 of 2,007,668 tokens), roughly a hundred times higher, which indicates how characteristic the word is of funerary inscriptions. The relative frequencies of the function words 'et', 'in', and 'est' in the inscriptions are similar to those in the reference corpus.

Finally, the z-score (two-proportion z-test) and the chi-square test were applied to quantify the difference between the observed and the expected frequencies and to determine whether these differences are statistically significant.

In [1]:
import pandas as pd 
import os 
import re 
from io import open
from conllu import parse
from nltk.tokenize import sent_tokenize, word_tokenize
import pickle
import matplotlib.pyplot as plt
import collections
from collections import Counter
from nltk.util import ngrams
from scipy.stats import chi2_contingency
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.proportion import proportions_chisquare
import numpy as np

In [2]:
##load the dataset of funerary inscriptions (172,958 rows) into a DataFrame
inscriptions_csv = "/Users/u0154817/OneDrive - KU Leuven/Documents/ICLL Prague June 2023/Output/Tituli_Sepulcrales_new.csv"
Inscriptions = pd.read_csv(inscriptions_csv)

In [3]:
##number of rows (inscriptions) loaded from the CSV
len(Inscriptions)

172958

In [4]:
##build a flat list with every token of the interpretive inscription texts
inscriptions_words = []

for text in Inscriptions['inscription_interpretive_cleaning']:
    ##cast to str so that non-string cells can be tokenized too, then tokenize with NLTK
    ##NOTE(review): str() turns a missing value (NaN) into the token 'nan' -- confirm the column has no empty cells
    tokens = word_tokenize(str(text))
    ##lowercase each token, then write 'v' as 'u' because LASLA spells 'vixit' as 'uixit'
    inscriptions_words.extend(re.sub(r'v', 'u', token.lower()) for token in tokens)

In [5]:
##total number of tokens in the inscriptions (2,007,668)
len(inscriptions_words)

2007668

In [6]:
##raw frequency of every token in the inscriptions
counter_inscriptions = collections.Counter(inscriptions_words)
##the ten highest-frequency tokens as (token, count) pairs
ten_most_common = counter_inscriptions.most_common(n=10)

In [7]:
##display the ten most frequent inscription tokens with their raw counts
ten_most_common

[('uixit', 67834),
 ('manibus', 65975),
 ('dis', 65783),
 ('et', 55056),
 ('annos', 50095),
 ('in', 31591),
 ('hic', 27439),
 ('fecit', 26399),
 ('bene', 22934),
 ('est', 22740)]

In [8]:
##open the file containing the LASLA tokenized sentences
##NOTE: unpickling can execute arbitrary code, so only load a pickle file from a trusted source
with open(r'\Users\u0154817\OneDrive - KU Leuven\Documents\ICLL Prague June 2023\Sources\lasla_tokenized_text', "rb") as file:
    ##the context manager closes the file even if unpickling raises
    tok_sent = pickle.load(file)

In [9]:
##number of tokenized sentences loaded from the LASLA pickle (95,747)
len(tok_sent)

95747

In [10]:
##flatten the tokenized LASLA sentences into a single bag of words
LASLA_words = [token for sentence in tok_sent for token in sentence]

In [11]:
##total number of tokens in the LASLA corpus (1,809,855)
len(LASLA_words)

1809855

In [12]:
##raw frequency of every token in the LASLA reference corpus
counter_LASLA = collections.Counter(LASLA_words)
##show the ten highest-frequency LASLA tokens
counter_LASLA.most_common(n=10)

[('et', 45884),
 ('in', 31742),
 ('que', 30197),
 ('non', 22530),
 ('est', 21309),
 ('cum', 15633),
 ('ut', 15288),
 ('ad', 12485),
 ('quod', 11904),
 ('qui', 11648)]

In [13]:
##number of distinct tokens (keys of the Counter) in the LASLA texts (123,461)
len(counter_LASLA)

123461

In [14]:
##relative frequency (in percent) in the LASLA corpus of the 10 most frequent inscription words
lasla_token_total = len(LASLA_words)
for word, inscription_count in ten_most_common:
    ##a Counter yields 0 for a word that does not occur in LASLA
    lasla_raw_frequency = counter_LASLA[word]
    print(word, lasla_raw_frequency / lasla_token_total * 100)

uixit 0.0032046766177400952
manibus 0.023537797226849663
dis 0.016962684855969124
et 2.535230722903216
annos 0.022874760685248265
in 1.7538421586259672
hic 0.13945868591682758
fecit 0.03282030880926925
bene 0.05591608167505132
est 1.1773871387486843


In [15]:
def calculate_z_score(item, corpus_counter, reference_counter, corpus_size=2007668, reference_size=1809855):
    """Run a two-proportion z-test on the frequency of ``item`` in two corpora.

    Parameters
    ----------
    item : hashable
        Token whose frequency is compared.
    corpus_counter, reference_counter : mapping
        Raw token counts of the studied corpus and of the reference corpus,
        indexed with ``counter[item]``. A ``collections.Counter`` yields 0
        for a token it has never seen.
    corpus_size, reference_size : int, optional
        Total number of tokens in each corpus. The defaults are the sizes
        measured in this notebook: 2,007,668 tokens for the inscriptions
        and 1,809,855 tokens for LASLA. Pass explicit sizes when the
        counters come from other corpora.

    Returns
    -------
    The result of ``proportions_ztest(count, nobs)``; the caller in this
    notebook unpacks it as ``(z_statistic, p_value)``.
    """
    ##frequency of the word in the corpus and in the reference corpus
    count = np.array([corpus_counter[item], reference_counter[item]])
    ##size of the two corpora in tokens
    nobs = np.array([corpus_size, reference_size])
    return proportions_ztest(count, nobs)

In [16]:
def calculate_chi_square(item, corpus_counter, reference_counter, corpus_size=2007668, reference_size=1809855):
    """Run a chi-square test of proportions on the frequency of ``item`` in two corpora.

    Prints the pair of raw counts before running the test.

    Parameters
    ----------
    item : hashable
        Token whose frequency is compared.
    corpus_counter, reference_counter : mapping
        Raw token counts of the studied corpus and of the reference corpus,
        indexed with ``counter[item]``. A ``collections.Counter`` yields 0
        for a token it has never seen.
    corpus_size, reference_size : int, optional
        Total number of tokens in each corpus. The defaults are the sizes
        measured in this notebook: 2,007,668 tokens for the inscriptions
        and 1,809,855 tokens for LASLA. Pass explicit sizes when the
        counters come from other corpora.

    Returns
    -------
    The result of ``proportions_chisquare(count, nobs)``. In the recorded
    output this is ``(chi2_statistic, p_value, (table, expected))``; the
    caller reads the p-value at index 1.
    """
    ##frequency of the word in the corpus and in the reference corpus
    count = np.array([corpus_counter[item], reference_counter[item]])
    print(count)
    ##size of the two corpora in tokens
    nobs = np.array([corpus_size, reference_size])
    return proportions_chisquare(count, nobs)

In [17]:
##z-test and chi-square test for the ten most frequent inscription words
for entry in counter_inscriptions.most_common(10):
    print(entry)
    token = entry[0]
    z_stat, p_value = calculate_z_score(token, counter_inscriptions, counter_LASLA)
    print(z_stat, f'{p_value:.3g}') ##z statistic, p-value
    ##calculate_chi_square prints the raw counts on every call, so they show up twice per word
    print(calculate_chi_square(token, counter_inscriptions, counter_LASLA))
    chi_square_p = calculate_chi_square(token, counter_inscriptions, counter_LASLA)[1]
    if chi_square_p < 0.05:
        print('significant')

('uixit', 67834)
249.1715870529962 0
[67834    58]
(62086.479794508814, 0.0, (array([[6.783400e+04, 1.939834e+06],
       [5.800000e+01, 1.809797e+06]]), array([[  35704.98353409, 1971963.01646591],
       [  32187.01646591, 1777667.98353409]])))
[67834    58]
significant
('manibus', 65975)
243.47610283102364 0
[65975   426]
(59280.612649783194, 0.0, (array([[6.597500e+04, 1.941693e+06],
       [4.260000e+02, 1.809429e+06]]), array([[  34920.85388038, 1972747.14611962],
       [  31480.14611962, 1778374.85388038]])))
[65975   426]
significant
('dis', 65783)
243.81466339169936 0
[65783   307]
(59445.59008480767, 0.0, (array([[6.578300e+04, 1.941885e+06],
       [3.070000e+02, 1.809548e+06]]), array([[  34757.29632015, 1972910.70367985],
       [  31332.70367985, 1778522.29632015]])))
[65783   307]
significant
('et', 55056)
12.590459014535217 2.38e-36
[55056 45884]
(158.51965819669073, 2.3828368925497687e-36, (array([[  55056., 1952612.],
       [  45884., 1763971.]]), array([[  53085.20