In [13]:
import medlatin
import bcubed
import re
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import calinski_harabasz_score, rand_score
from sklearn.dummy import DummyClassifier
from tqdm.notebook import tqdm

In [2]:
# Load the pre-computed style vectors and the author metadata from `path`.
path = 'csvfiles'

# tfidf_dict[ngram][rank] -> array read from the matching "tfidf*" CSV file.
tfidf_dict = {}
# os.listdir() returns entries in arbitrary order; sorting keeps the dict
# insertion order (and therefore later iteration order) stable between runs.
for filename in sorted(os.listdir(path)):
    if not filename.startswith('tfidf'):
        continue
    # The file name must contain exactly two integers: n-gram size, then rank.
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    ngram, rank = map(int, re.findall(r'\d+', filename))
    tfidf_dict.setdefault(ngram, {})[rank] = np.genfromtxt(
        os.path.join(path, filename), delimiter=','
    )

medlatin_emb = np.genfromtxt(os.path.join(path, 'embeddings.csv'), delimiter=',')

# One author name per line; blank lines (e.g. a trailing newline at the end of
# the file) are skipped so they do not count as an additional author.
with open(os.path.join(path, 'author_changes_names.txt'), 'r', encoding='utf-8') as f:
    author_names = [line.strip() for line in f if line.strip()]

# One integer per line; used downstream as the start index of each author's texts.
with open(os.path.join(path, 'author_changes_idx.txt'), 'r', encoding='utf-8') as f:
    author_idx = [int(line) for line in f if line.strip()]

In [3]:
# Cluster every TF-IDF matrix into as many clusters as there are author names
# (four different authors in the reduced set of texts).
n_authors = len(author_names)

# kmean_dict[ngram][rank] -> result of medlatin.repeat_kmeans for that matrix
# (iterated downstream as a sequence of clusterings).
kmean_dict = {
    ngram: {
        rank: medlatin.repeat_kmeans(features, clusters=n_authors)
        for rank, features in by_rank.items()
    }
    for ngram, by_rank in tfidf_dict.items()
}

In [4]:
# n-gram sizes and ranks reported in the result tables
ngrams_list = [2, 3, 4]
ranks_list = [100, 200, 300]

# Gold author label per text. The number of texts is taken from a feature
# matrix instead of being hard-coded; all matrices are scored against the same
# `y` below, so they must share this row count.
n_texts = tfidf_dict[ngrams_list[0]][ranks_list[0]].shape[0]
y = np.zeros(n_texts, dtype=int)

# author_idx[k] is the index of the first text of author k (ascending order
# assumed). Each later start index overwrites the tail, so every text ends up
# with the label of the last author whose block starts at or before it.
# Capping at len(author_names) keeps every label a valid author_names index.
for label, start in enumerate(author_idx[:len(author_names)]):
    y[start:] = label

In [5]:
# Gold assignment in dict-of-sets form: text index -> {author name}
ldict = dict(enumerate({author_names[label]} for label in y))

In [6]:
# Empty result tables: rows are n-gram sizes (B-cubed additionally split by
# measure), columns are ranks. They are filled cell by cell further down.
bcubed_measures = ['precision', 'recall', 'fscore']
multi_index = pd.MultiIndex.from_product(
    [bcubed_measures, ngrams_list],
    names=["bcubed_measure", "ngram"],
)
bcubed_df = pd.DataFrame(index=multi_index, columns=ranks_list)
rand_df = pd.DataFrame(index=ngrams_list, columns=ranks_list)
chi_df = pd.DataFrame(index=ngrams_list, columns=ranks_list)

In [7]:
# Score every clustering stored for an (ngram, rank) pair against the gold
# labels and write the mean of each measure into the result tables.
for ngram in tqdm(kmean_dict):
    for rank in tqdm(kmean_dict[ngram]):
        features = tfidf_dict[ngram][rank]
        # one list of per-clustering scores for each measure
        scores = {'precision': [], 'recall': [], 'fscore': [], 'rand': [], 'chi': []}

        for clustering in kmean_dict[ngram][rank]:
            # same dict-of-sets layout as ldict: text index -> {cluster id}
            cdict = dict(enumerate({cluster_id} for cluster_id in clustering))

            precision = bcubed.precision(cdict, ldict)
            recall = bcubed.recall(cdict, ldict)

            scores['precision'].append(precision)
            scores['recall'].append(recall)
            scores['fscore'].append(bcubed.fscore(precision, recall))
            scores['rand'].append(rand_score(y, clustering))
            scores['chi'].append(calinski_harabasz_score(features, clustering))

        for measure in ('precision', 'recall', 'fscore'):
            bcubed_df.loc[(measure, ngram), rank] = np.mean(scores[measure])
        rand_df.loc[ngram, rank] = np.mean(scores['rand'])
        chi_df.loc[ngram, rank] = np.mean(scores['chi'])

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# Show floats with three decimals in every table displayed from here on.
pd.set_option('display.float_format', '{:.3f}'.format)
# Mean B-cubed precision / recall / f-score per n-gram size (rows) and rank (columns)
bcubed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,100,200,300
bcubed_measure,ngram,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
precision,2,0.726,0.749,0.732
precision,3,0.74,0.762,0.789
precision,4,0.754,0.763,0.785
recall,2,0.595,0.659,0.689
recall,3,0.632,0.709,0.716
recall,4,0.621,0.659,0.692
fscore,2,0.65,0.697,0.704
fscore,3,0.678,0.729,0.748
fscore,4,0.68,0.705,0.733


In [11]:
# Mean Rand score per n-gram size (rows) and rank (columns), averaged over the stored k-means clusterings
rand_df

Unnamed: 0,100,200,300
2,0.732,0.758,0.76
3,0.74,0.778,0.799
4,0.755,0.774,0.794


In [12]:
# Mean Calinski-Harabasz score per n-gram size (rows) and rank (columns), averaged over the stored k-means clusterings
chi_df

Unnamed: 0,100,200,300
2,13.211,10.79,9.453
3,10.549,8.981,8.851
4,10.072,8.812,8.019


In [14]:
# Chance-level baselines to put the k-means scores into perspective.
# 'prior' always predicts the most frequent class seen during fit.
dummy_clf_prior = DummyClassifier(strategy='prior')
# 'uniform' draws labels at random; a fixed seed makes the reported baseline
# numbers reproducible when the notebook is re-run.
dummy_clf_uniform = DummyClassifier(strategy='uniform', random_state=0)

In [22]:
# Majority-class baseline, scored with the same measures as the k-means runs.
baseline_features = tfidf_dict[2][200]
dummy_clf_prior.fit(baseline_features, y)
# predict() expects the feature matrix, not the labels. The dummy classifier
# only uses it to determine how many predictions to produce.
prior_clustering = dummy_clf_prior.predict(baseline_features)

# text index -> {predicted label}, the dict-of-sets layout used for ldict
cdict = {idx: {val} for idx, val in enumerate(prior_clustering)}
precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
fscore = bcubed.fscore(precision, recall)
rand = rand_score(y, prior_clustering)
# The Calinski-Harabasz score cannot be computed here: it requires more than
# one distinct predicted label, and this baseline predicts a single class.

In [23]:
# Report the majority-class baseline with three decimals per measure.
print('Dummy classifier with majority class')
for metric_name, metric_value in (
    ('Bcubed precision', precision),
    ('Bcubed recall', recall),
    ('Bcubed f-score', fscore),
    ('Rand score', rand),
):
    print(f'{metric_name}: {metric_value:.3f}')
# CHI was not computed for this baseline, so it is reported as nan.
print('CHI score: nan')

Dummy classifier with majority class
Bcubed precision: 0.416
Bcubed recall: 1.000
Bcubed f-score: 0.588
Rand score: 0.414
CHI score: nan


In [24]:
# Uniform-random baseline, scored with the same measures as the k-means runs.
baseline_features = tfidf_dict[2][200]
dummy_clf_uniform.fit(baseline_features, y)
# predict() expects the feature matrix, not the labels. The dummy classifier
# only uses it to determine how many predictions to produce.
uniform_clustering = dummy_clf_uniform.predict(baseline_features)

# text index -> {predicted label}, the dict-of-sets layout used for ldict
cdict = {idx: {val} for idx, val in enumerate(uniform_clustering)}
precision = bcubed.precision(cdict, ldict)
recall = bcubed.recall(cdict, ldict)
fscore = bcubed.fscore(precision, recall)
rand = rand_score(y, uniform_clustering)
chi = calinski_harabasz_score(baseline_features, uniform_clustering)

In [25]:
# Report the uniform-random baseline with three decimals per measure.
print('Dummy classifier with uniform class sampling')
for metric_name, metric_value in (
    ('Bcubed precision', precision),
    ('Bcubed recall', recall),
    ('Bcubed f-score', fscore),
    ('Rand score', rand),
    ('CHI score', chi),
):
    print(f'{metric_name}: {metric_value:.3f}')

Dummy classifier with uniform class sampling
Bcubed precision: 0.421
Bcubed recall: 0.262
Bcubed f-score: 0.323
Rand score: 0.541
CHI score: 1.033
