In [1]:
import os
import re
import medlatin
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
def sparsity(vector):
    """
    Calculates the sparsity of a 1-D vector of elements >= 0.

    The value is ``(sqrt(k) - L1 / L2) / (sqrt(k) - 1)``, where ``k`` is the
    number of elements, ``L1`` the sum of the elements and ``L2`` the
    Euclidean norm. 0 indicates a dense vector (all elements equal),
    1 indicates a sparse vector (a single non-zero element).
    See https://math.stackexchange.com/questions/117860/how-to-define-sparseness-of-a-vector.

    Parameters
    ----------
    vector : array-like of shape (k,)
        The values to measure; expected to be non-negative.

    Returns
    -------
    float
        The sparsity, or NaN when the measure is undefined: fewer than two
        elements (the denominator ``sqrt(k) - 1`` is zero or negative) or an
        all-zero vector (``L1 / L2`` is 0/0).
    """
    values = np.asarray(vector, dtype=float)
    k = len(values)
    l1_norm = values.sum()
    l2_norm = np.linalg.norm(values)

    # Return NaN explicitly instead of letting 0/0 produce it together with a
    # RuntimeWarning; the numeric result for these inputs is unchanged.
    if k < 2 or l2_norm == 0:
        return np.nan

    sqrt_k = np.sqrt(k)
    return (sqrt_k - l1_norm / l2_norm) / (sqrt_k - 1)

In [3]:
# Load the style vectors from csv files into tfidf_dict[ngram][rank].
# Every file whose name starts with 'tfidf' must contain exactly two integers
# in its name: the n-gram size first, the rank second (any other count makes
# the unpacking below raise ValueError).
path = 'csvfiles'
tfidf_dict = {}
# os.listdir returns entries in arbitrary order; sorting keeps the dict order
# (and every loop and printout that follows it) reproducible between runs.
for filename in sorted(os.listdir(path)):
    if filename.startswith('tfidf'):
        # Raw string: '\d' inside a normal string literal is an invalid escape
        # sequence that recent Python versions warn about.
        ngram, rank = map(int, re.findall(r'\d+', filename))
        tfidf_dict.setdefault(ngram, {})[rank] = np.genfromtxt(
            os.path.join(path, filename), delimiter=','
        )

In [4]:
# Sparsity of every style vector, keyed first by n and then by r, with the
# mean per (n, r) setting printed as it is computed.
sparsity_dict = {}

for n, vectors_by_rank in tfidf_dict.items():
    sparsity_dict[n] = {}
    for r, vectors in vectors_by_rank.items():
        sparsity_dict[n][r] = list(map(sparsity, vectors))
        print(f'Mean sparsity at n={n}, r={r}: {np.mean(sparsity_dict[n][r])}')

Mean sparsity at n=2, r=100: 0.33025763968037447
Mean sparsity at n=2, r=200: 0.4345269271831532
Mean sparsity at n=2, r=300: 0.5124080061467745
Mean sparsity at n=3, r=100: 0.3478194310893455
Mean sparsity at n=3, r=200: 0.4084470498825543
Mean sparsity at n=3, r=300: 0.44526194146965226
Mean sparsity at n=4, r=100: 0.41014357142764624
Mean sparsity at n=4, r=200: 0.4589500197955713
Mean sparsity at n=4, r=300: 0.48779551836953855


In [5]:
# Fraction of entries that are exactly 0 in every style vector, keyed first by
# n and then by r, with the mean per (n, r) setting printed as it is computed.
zeroes_dict = {}

for n, vectors_by_rank in tfidf_dict.items():
    zeroes_dict[n] = {}
    for r, vectors in vectors_by_rank.items():
        zeroes_dict[n][r] = [np.count_nonzero(vector == 0) / len(vector) for vector in vectors]
        print(f'Mean fraction of 0\'s at n={n}, r={r}: {np.mean(zeroes_dict[n][r])}')

Mean fraction of 0's at n=2, r=100: 0.18180769230769228
Mean fraction of 0's at n=2, r=200: 0.3096923076923077
Mean fraction of 0's at n=2, r=300: 0.45386538461538467
Mean fraction of 0's at n=3, r=100: 0.25782692307692306
Mean fraction of 0's at n=3, r=200: 0.35004807692307693
Mean fraction of 0's at n=3, r=300: 0.4115384615384615
Mean fraction of 0's at n=4, r=100: 0.41009615384615383
Mean fraction of 0's at n=4, r=200: 0.486375
Mean fraction of 0's at n=4, r=300: 0.5328910256410256


In [6]:
print('The shortest text is no. 73 and the longest is no. 236.')
# Row positions of the shortest and the longest text inside every style-vector
# matrix; they match the argmin / argmax of the character counts shown at the
# end of the notebook.
# NOTE(review): the messages quote text numbers 73 / 236 while rows 69 / 202
# are read -- presumably the numbers refer to the unfiltered corpus; confirm.
shortest_row = 69
longest_row = 202

# Values per (n, r) setting for the two texts. They are kept under their own
# names so that no variable switches from list to scalar within the cell.
sparsity_values_text73 = []
zeroes_values_text73 = []
sparsity_values_text236 = []
zeroes_values_text236 = []
for n in tfidf_dict:
    for r in tfidf_dict[n]:
        sparsity_values_text73.append(sparsity_dict[n][r][shortest_row])
        zeroes_values_text73.append(zeroes_dict[n][r][shortest_row])
        sparsity_values_text236.append(sparsity_dict[n][r][longest_row])
        zeroes_values_text236.append(zeroes_dict[n][r][longest_row])

# Average each measure over all (n, r) settings.
mean_sparsity_text73 = sum(sparsity_values_text73) / len(sparsity_values_text73)
mean_zeroes_text73 = sum(zeroes_values_text73) / len(zeroes_values_text73)
mean_sparsity_text236 = sum(sparsity_values_text236) / len(sparsity_values_text236)
mean_zeroes_text236 = sum(zeroes_values_text236) / len(zeroes_values_text236)
print(f'The shortest text has a sparsity of {mean_sparsity_text73} and zero fraction of {mean_zeroes_text73}')
print(f'The longest text has a sparsity of {mean_sparsity_text236} and zero fraction of {mean_zeroes_text236}')

The shortest text is no. 73 and the longest is no. 236.
The shortest text has a sparsity of 0.6062758807396295 and zero fraction of 0.7138888888888889
The longest text has a sparsity of 0.29063042565638314 and zero fraction of 0.03944444444444444


In [7]:
medlatin1_texts, medlatin1_authors, medlatin_titles = medlatin.load_medlatin('../MedLatin/Corpora/MedLatinEpi')

# Positions at which a new run of consecutive texts by the same author starts.
# Position 0 always starts a run; it is handled explicitly because comparing
# it with index -1 would wrap around to the last text and miss the first
# author whenever the first and the last text share an author.
author_changes_idx = [
    i for i, author in enumerate(medlatin1_authors)
    if i == 0 or medlatin1_authors[i - 1] != author
]
author_changes_names = [medlatin1_authors[i] for i in author_changes_idx]

# Clara and Misc have too few texts and they distort the clustering;
# run the experiments without their texts to see the results.
# The slices below keep everything from Dante's first text up to (not
# including) the first Misc text, plus everything from PierDellaVigna's
# first text onwards.
from_dante_idx = author_changes_idx[author_changes_names.index('Dante')]
to_misc_idx = author_changes_idx[author_changes_names.index('Misc')]
from_vigna_idx = author_changes_idx[author_changes_names.index('PierDellaVigna')]

# this results in 260 texts
# NOTE(review): only the texts are filtered here; medlatin1_authors and
# medlatin_titles still describe the full corpus.
medlatin1_texts = medlatin1_texts[from_dante_idx:to_misc_idx] + medlatin1_texts[from_vigna_idx:]

In [8]:
# len() of every remaining text (its character count, assuming the texts are strings).
chars = list(map(len, medlatin1_texts))

In [9]:
# For every (n, r) setting, tabulate the text lengths next to the two density
# measures and keep the table ('df') together with its correlation matrix
# ('corrs'). n, r and each correlation matrix are printed along the way.
density_df_dict = {}
for n, vectors_by_rank in tfidf_dict.items():
    density_df_dict[n] = {}
    print(n)
    for r, vectors in vectors_by_rank.items():
        print(r)
        density_df = pd.DataFrame({
            'chars': chars,
            'sparsity': [sparsity(vec) for vec in vectors],
            'zero_frac': [len(vec[vec == 0]) / len(vec) for vec in vectors],
        })
        density_df_dict[n][r] = {'df': density_df, 'corrs': density_df.corr()}
        print(density_df_dict[n][r]['corrs'])

2
100
              chars  sparsity  zero_frac
chars      1.000000 -0.559812  -0.714809
sparsity  -0.559812  1.000000   0.874772
zero_frac -0.714809  0.874772   1.000000
200
              chars  sparsity  zero_frac
chars      1.000000 -0.645313  -0.781772
sparsity  -0.645313  1.000000   0.912125
zero_frac -0.781772  0.912125   1.000000
300
              chars  sparsity  zero_frac
chars      1.000000 -0.676991  -0.847319
sparsity  -0.676991  1.000000   0.913318
zero_frac -0.847319  0.913318   1.000000
3
100
              chars  sparsity  zero_frac
chars      1.000000 -0.652849  -0.728533
sparsity  -0.652849  1.000000   0.958308
zero_frac -0.728533  0.958308   1.000000
200
              chars  sparsity  zero_frac
chars      1.000000 -0.707441  -0.772657
sparsity  -0.707441  1.000000   0.972300
zero_frac -0.772657  0.972300   1.000000
300
              chars  sparsity  zero_frac
chars      1.000000 -0.733455  -0.798782
sparsity  -0.733455  1.000000   0.976896
zero_frac -0.798782  0.976896   1.000000

In [10]:
# Setting inspected here and in the cells below: n = 3, r = 300.
n, r = 3, 300
density_df_dict[n][r]['corrs']

Unnamed: 0,chars,sparsity,zero_frac
chars,1.0,-0.733455,-0.798782
sparsity,-0.733455,1.0,0.976896
zero_frac,-0.798782,0.976896,1.0


In [11]:
# Row of the text with the smallest character count at the selected n and r.
density_df_dict[n][r]['df'].pipe(lambda df: df.iloc[df['chars'].argmin()])

chars        334.000000
sparsity       0.628509
zero_frac      0.771667
Name: 69, dtype: float64

In [12]:
# Row of the text with the largest character count at the selected n and r.
density_df_dict[n][r]['df'].pipe(lambda df: df.iloc[df['chars'].argmax()])

chars        24160.000000
sparsity         0.309818
zero_frac        0.041667
Name: 202, dtype: float64