In [1]:
import os
import logging

from typing import List, Dict, Optional
from collections import namedtuple, Counter, defaultdict

import lxml.etree as ET
import numpy as np
import requests
import requests_cache

from pandas import DataFrame
from sklearn.decomposition import PCA, FactorAnalysis as FA

import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import HTML, display

plt.style.use('seaborn')
requests_cache.install_cache('importance-segmentation.cache')

## Setup the lemmas that are interesting to search for

In [2]:
import glob
lemmas = []
for text in glob.glob("/home/thibault/dev/corpus-builder/output/*.xml"):
    xml = ET.parse(text)
    lemmas.extend([
        y[:y.find("(")]
        for x in xml.xpath("//w[@ana]/@lemma") 
        for y in x.split("|")
        if y
    ])
lemmas = sorted(list(set(lemmas)))


SERVER = "http://localhost:8888/blacklab-server/latin-corpus-index/hits"
GLOBAL_WINDOW = 10
MIN_FREQ = 10
COOC_MIN_FREQ = 10
NORMALIZATION: str = "binarize"
IGNORE_WORDS = ("sum1", "que", "habeo", "dico2", "facio", "possum1", "do", "uideo", "uolo3", "iam", "sic",
               "tamen", "tam", "puto", "res", "suus")
TEXTS = ["urn:cts:latinLit:phi1294.phi002.perseus-lat2", "urn:cts:latinLit:phi1103.phi001.lascivaroma-lat1"]
TEXT_SPLITS = {
    "urn:cts:latinLit:phi1294.phi002.perseus-lat2": 2,  # Stop at the second element of citation
    "urn:cts:latinLit:phi1103.phi001.lascivaroma-lat1": 1
}
BAD_CACHE = defaultdict(Counter) # {textlist: {word: count}}
POS = [
    "ADJadv.mul", "ADJadv.ord", "ADJcar", "ADJdis", "ADJmul", "ADJord", 
    #"ADJqua", 
    "ADV", "ADVint", "ADVint", "ADVneg", "ADVrel", 
    "CONcoo", "CONsub", "INJ",
    "NOMcom", "NOMpro", 
    "OUT", "PRE", "PROdem", 
    "PROind", "PROint", "PROper", "PROpos", "PROpos", "PROref", "PROrel", "PUNC", 
    "VER", "VERaux", "FOR"
]
POS = [
    "ADJadv.mul", "ADJadv.ord", "ADJcar", "ADJdis", "ADJmul", "ADJord", 
    #"ADJqua", 
    #"ADV", 
    "ADVint", "ADVint", "ADVneg", "ADVrel", 
    "CON", "CONcoo", "CONsub", "INJ", 
    #"NOMcom", "NOMpro",
    "OUT", "PRE", "PROdem", 
    "PROind", "PROint", "PROper", "PROpos", "PROpos", "PROref", "PROrel", "PUNC", 
    #"VER", 
    "VERaux", "FOR"
]
PLOT = False

## Set up the functions

### Object types

In [3]:
Token = namedtuple("Token", ["text", "lemma", "pos", "ref"])
Hit = namedtuple("Hit", ["left", "right", "text_id", "ref", "lemma"])
Analysis = namedtuple("Analysis", ["df", "freq", "pivots", "seconds", "decomp", "transformed"])

### Window / processing of raw results

In [4]:
def filter_ref(words: List[Token], ref: str, text: str = None):
    main = ref.split(".")[:TEXT_SPLITS.get(text)]
    return [
        word
        for word in words
        if word.ref.split(".")[:TEXT_SPLITS.get(text)] == main
    ]


def get_window(hit, window, serious_window=False, filter_pos=None):
    out = hit.left[-window:] + hit.right[:min(len(hit.right), window)]
    if filter_pos:
        out = [tok for tok in out if tok.pos not in filter_pos]
    if serious_window is False:
        return out
    return filter_ref(out, ref=hit.ref, text=hit.text_id)


def generate_token_list(segment: Dict[str, List[str]], ignore_pos: List[str]):
    return [
        Token(*tok)
        for tok in zip(segment["word"], segment["lemma"], segment["pos"], segment["ref"])
        if tok[2] not in ignore_pos
    ]

### General Frequency building function

In [5]:
def get_general_frequencies(
    texts: List[str],
    words: List[str],
    max_query: int = 200,
    # Blacklab specific system
    server: str = "http://localhost:8888/blacklab-server/latin-corpus-index/hits",
):
    filters = " OR ".join(["docId:"+txt.replace(":", "\\:") for txt in texts])
    cache_id = tuple(sorted(texts))
    toks = Counter()
    
    from_cache = []
    for word in set(words):
        if cache_id in BAD_CACHE and word in BAD_CACHE[cache_id]:
            toks[word] = BAD_CACHE[cache_id][word]
            words.remove(word)
            from_cache.append(word)
    logging.info("[GlobalFrequency] Hitting cache for " + ", ".join(from_cache))
            
    if words:
        words = sorted(list(set(words)))
        for n in range(0, len(words), max_query):
            word_subset = words[n:n+max_query]
            logging.info("query", word_subset)
            req = requests.post(server, data={
            #    "patt": f"[]",
                "patt": "["+" | ".join([f"lemma=\"{word}\"" for word in set(word_subset)]) + "]",
                "filter": filters,
                "outputformat": "json",
                "wordsaroundhit": 0,
                "group": "hit:lemma:s",
                "first":0,
                "maxretrieve": -1,
                "waitfortotal": "yes",
                "maxcount": -1
            })
            json = req.json()
            toks.update({
                tok["identityDisplay"]: tok["size"]
                for tok in json["hitGroups"]
            })
    
    for word in toks:
        BAD_CACHE[cache_id][word] = toks[word]
    return toks  

### Blacklab query function

In [6]:
def query(
    texts: List[str],
    words: List[str],
    window: int = GLOBAL_WINDOW,
    ignore_pos: Optional[List[str]] = None,
    
    # Blacklab specific system
    server: str = "http://localhost:8888/blacklab-server/latin-corpus-index/hits",
    start: int = 0,
    hits_per_page: int = 200,
    debug: bool = False,
    sub: bool = True # Request automatically next pages
):
    """ Retrieve words and cooccurrents
    
    :param texts: Lists of text identifiers that are looked into (URNs / docId)
    :param ignore_pos: POS to ignore in counting cooccurences (like punct)
    :param filter_pos: POS to count in the window but not as a feature for the cooccurences
    :param lemma: Lemma which are searched
    :param window: Number of words on the left and on the right to keep
    :param serious_window: Keep only cooccurrences in the same document segment.
    
    """
    ignore_pos = ignore_pos or []
    
    req = requests.post(server, data={
        "patt": "["+" | ".join([f"lemma=\"{word}\"" for word in list(set(words))]) + "]",
        "filter": " OR ".join(["docId:"+txt.replace(":", "\\:") for txt in texts]), 
        "usecontent": "fi",  # Slower but better for my purposes
        "outputformat": "json",
        "number": hits_per_page,
        "includetokencount": "yes",
        "wordsaroundhit": window*2,
        "first":start
    })
    json = req.json()

    tokens_per_doc: Dict[str, int] = {
        val["docId"][0]: val["lengthInTokens"]
        for val in json["docInfos"].values()
        if isinstance(val, dict)
    }
    # Treating hits
    hits = []
    for hit in json["hits"]:
        hits.append(Hit(
            left=generate_token_list(hit["left"], ignore_pos=ignore_pos),
            right=generate_token_list(hit["right"], ignore_pos=ignore_pos),
            text_id=json["docInfos"][hit["docPid"]]["docId"][0],
            ref=hit["match"]["ref"][0],
            lemma=hit["match"]["lemma"][0]
        ))
    
    # Treating next page
    if json["summary"]["windowHasNext"] and sub:
        next_start = json["summary"]["windowFirstResult"] + json["summary"]["requestedWindowSize"]
        if debug:
            print("Hitting {} for {}".format(next_start, "|".join(words)))
        subhits, subdocs = query(
            words=words,
            server=server,
            window=window,
            texts=texts,
            hits_per_page=hits_per_page,
            start=next_start,
            debug=debug
        )
        tokens_per_doc.update(subdocs)
        hits.extend(subhits)

    return hits, tokens_per_doc

### Word searching

In [7]:
def search_words(
    texts: List[str],
    lemma: List[str],
    window: int,
    ignore_pos: Optional[List[str]] = None,
    filter_pos: Optional[List[str]] = None,
    serious_window: bool = False,
    
    # Blacklab specific system
    server: str = "http://localhost:8888/blacklab-server/latin-corpus-index/hits",
    start: int = 0,
    hits_per_page: int = 50,
    debug: bool = False,
    sub: bool = True # Request automatically next pages
):
    """ Retrieve words and cooccurrents
    
    :param texts: Lists of text identifiers that are looked into
    :param ignore_pos: POS to ignore in counting cooccurences (like punct)
    :param filter_pos: POS to count in the window but not as a feature for the cooccurences
    :param lemma: Lemma which are searched
    :param window: Number of words on the left and on the right to keep
    :param serious_window: Keep only cooccurrences in the same document segment.
    
    """
    ignore_pos = ignore_pos or []
    filter_pos = filter_pos or []
    
    hits, freqs = query(
        texts=texts, words=lemma, window=window, ignore_pos=ignore_pos,
        # Textlab specific
        server=server, hits_per_page=hits_per_page, sub=sub, debug=debug
    )
    
    out = defaultdict(Counter) # Cooccurences counter {Lemma: {Cooccurrent: Counter}}
    occs = Counter()  # Number of match / Lemma
    GenFreq = Counter()  # General frequency of all met token
    
    for hit in hits:
        out[hit.lemma].update(
            Counter([
                tok.lemma
                for tok in get_window(hit=hit, window=window, serious_window=serious_window, filter_pos=filter_pos)
            ])
        )
        occs[hit.lemma] += 1 
        
        
    return out, occs

### Dataframe and integrated results

In [16]:
def generate_dataframe(
    lemma_connections: Dict[str, Dict[str, int]],
    lemma_count: Dict[str, int],
    floor: int,
    cooc_floor: int,
    ignore_words: Optional[List[str]] = None
):
    ignore_words = ignore_words or []
    df = DataFrame({
        k:v
        for k, v in lemma_connections.items()
        if len(v) and lemma_count[k] >= floor
    }).transpose()
    
    # Drop items where the number of cooccurrences is too low
    df.drop([
        col
        for col, val in df.sum().iteritems()
        if val < cooc_floor or col in ignore_words# or col in pivots  # Remove pivots as feature ?
    ], axis=1, inplace=True)
    return df

def parse_and_window(
    texts: List[str],
    lemma: List[str],
    window: int,
    serious_window=False,
    floor: int = 5,
    ignore_pos=["PUNC"],
    filter_pos=None,  # Filter OUT
    cooc_floor: int = 3,
    ignore_words=IGNORE_WORDS,
    normalization: Optional[str] = None # binarize, None, ratio, log
) -> (Dict[str, Dict[str, int]], Dict[str, int], Dict[str, int]):
    
    
    # 1. Retrieve and treat first order words ("pivots")
    out, occs = search_words(
        lemma=lemma,
        texts=texts,
        serious_window=serious_window,
        window=window,
        ignore_pos=ignore_pos,
        filter_pos=filter_pos
    )
    
    # Warns about data that does not meet a minimal threshold
    for k, v in out.items():
        if len(v) and occs[k] < floor and k in lemma:
            logging.warning(f"{k} [pivot] because for "
                            f" insuff. freq ({occs[k]} < {floor} limit)")
    
    FirstDF = generate_dataframe(
        lemma_connections=out,
        lemma_count=occs,
        floor=floor,
        cooc_floor=cooc_floor,
        ignore_words=ignore_words
    )
    
    pivots = FirstDF.index.tolist()
    second_zone = FirstDF.columns.tolist()
    print("Second zone", second_zone)

    # 2. Retrieve and treat second order words ("seconds")
    sec_out, sec_occs = search_words(
        lemma=second_zone,
        texts=texts,
        serious_window=serious_window,
        window=window,
        ignore_pos=ignore_pos,
        filter_pos=filter_pos
    )
    out.update(sec_out)
    occs.update(sec_occs)
    
    
    DF_Pivot = generate_dataframe(
        lemma_connections=out,
        lemma_count=occs,
        floor=floor,
        cooc_floor=cooc_floor,
        ignore_words=ignore_words
    )
    
    general_frequency: bool = normalization not in {"binarize", None}
    GenFreq = Counter()
    if general_frequency:
        GenFreq.update(get_general_frequencies(words=DF_Pivot.columns.to_list(), texts=texts))
    
    if normalization:
        if normalization.startswith("ratio"):
            # Normalisation: quel part des occurences globales des coocurrents représente chacune des occurences rencontrées 
            DF_Pivot = DF_Pivot.divide([GenFreq[col] for col in DF_Pivot.columns])
        if "-log" in normalization:
            DF_Pivot = DF_Pivot.apply(np.log10)
        if normalization == "binarize":
            DF_Pivot = DF_Pivot.fillna(0).apply(lambda x: x != 0)
    
    return DF_Pivot, occs, pivots, second_zone, GenFreq

### Generate adversarial PCA / FA

In [21]:
def generate_adversarial(
    texts,
    lemma,
    window,
    normalization,
    floor,
    cooc_floor,
    ignore_pos,
    filter_pos
):
    clean_out, clean_occs, clean_pivots, clean_second_zone, GenFreq = parse_and_window(
        texts,
        lemma=lemma,
        window=window,
        serious_window=True,
        normalization=normalization,
        floor=floor,
        cooc_floor=cooc_floor,
        ignore_pos=ignore_pos,
        filter_pos=filter_pos
    )
    
    display(HTML(clean_out.to_html()))
    display(HTML(clean_out.fillna(0).to_html()))
    
    print(f"{clean_out.shape} Shape")
    fa_clean = PCA(n_components=clean_out.shape[0])
    fa_clean_transformed = fa_clean.fit_transform(clean_out.fillna(0))
    
    dirty_out, dirty_occs, dirty_pivots, dirty_second_zone, _ = parse_and_window(
        texts,
        lemma=lemma,
        window=window,
        serious_window=False,
        normalization=normalization,
        floor=floor,
        cooc_floor=cooc_floor,
        ignore_pos=ignore_pos,
        filter_pos=filter_pos
    )
    print(f"{dirty_out.shape} Shape")
    fa_dirty = PCA(n_components=dirty_out.shape[0])
    fa_dirty_transformed = fa_dirty.fit_transform(dirty_out.fillna(0))
    
    return (
        Analysis(clean_out, clean_occs, clean_pivots, clean_second_zone, fa_clean, fa_clean_transformed),
        Analysis(dirty_out, dirty_occs, dirty_pivots, dirty_second_zone, fa_dirty, fa_dirty_transformed),
        GenFreq
    )

### Get research lemma

In [10]:
EXCLUDED = "ad2 ipse munus uolo3 facio rumpo uxor ille nos meus uir res porto janua Uenus".split() + list(IGNORE_WORDS)
WORDS = sorted([
    "mentula", "cunnus", "lasciuus", "paedico2", 
    "futuo", "culus", "irrumo", "fello", "fellator",
    "castus", "improbus", "probus",
    "effeminatus"
    #"caco" #, "medium", 
])

EXTEND = False
if EXTEND:
    WORDS.extend(sorted(list(set([
        l 
        for l in lemmas
        if l not in EXCLUDED
    ]))))

### Plot automatique

In [11]:
def plot(clean, dirty, gen_freq, plotly=True, pyplot=True):
    
    vocab = clean.df.index.tolist() + dirty.df.index.tolist()
    milestone = clean.df.shape[0]
    XY = np.concatenate([
        clean.transformed[:,(0,1)],
        dirty.transformed[:,(0,1)]
    ], axis=0)
    
    pyplot_fig = None
    plotly_fig = None
    
    if pyplot:
        plt.figure(figsize=(20,20), dpi=300)
        plt.scatter(
            XY[:,0],
            XY[:,1],
            marker="x",
            color="black",
            s=30,
            linewidths=1,
            label="Crosses"
        )
        plt.xlabel("PC1",size=30)
        plt.ylabel("PC2",size=30)
        plt.title("Espace sémantique (Rouge: Segmenté, Bleu: Continu)",size=20)
        for i, word in enumerate(vocab):
            plt.annotate(
                f"{word} {clean.freq[word] if i < milestone else dirty.freq[word]}",
                xy=(XY[i,0],XY[i,1]),
                color="r" if i < milestone else "b"
            )
        pyplot_fig = plt.gcf()
        
    if plotly:
        merged_df = DataFrame(XY, columns=["pc1", "pc2"], index=vocab)
        merged_df["categorie"] = [
            "Segmenté / Pivot" if word in clean.pivots else "Segmenté / Seconds"
            for word in clean.df.index.tolist()
        ] + [
            "Non-Segmenté / Pivot" if word in dirty.pivots else "Non-Segmenté / Pivots"
            for word in dirty.df.index.tolist()
        ] 

        merged_df["frequency"] = [
            GenFreq[word]
            for word in clean.df.index.tolist()
        ] + [
            GenFreq[word]
            for word in dirty.df.index.tolist()
        ] 

        plotly_fig = px.scatter(
            merged_df,
            x="pc1", y="pc2", 
            color="categorie",
            size="frequency",
            text=merged_df.index,
            color_discrete_sequence=["darkblue", "lightblue", "darkgreen", "lightgreen"],
            labels={
                "pc1": f"PC1 (Segmnenté: {clean.decomp.explained_variance_ratio_[0]*100:.2f}%, " \
                            + f"Non-Segmenté: {dirty.decomp.explained_variance_ratio_[0]*100:.2f}%)",
                "pc2": f"PC2 (Segmnenté: {clean.decomp.explained_variance_ratio_[1]*100:.2f}%, " \
                            + f"Non-Segmenté: {dirty.decomp.explained_variance_ratio_[1]*100:.2f}%)",
                "categorie": "Catégorie",
                "frequency": "Fréquence globale",
            }
        )

        plotly_fig.update_traces(textposition='top center')

        plotly_fig.update_layout(
            height=800,
            title_text='Analyse factorielle des lemmes de la sexualité sur un corpus poétique'
        )

    return plotly_fig, pyplot_fig

def generate_compilation(Clean, Dirty, title):
    return {
        "mots": title,
        "clean-axe1": Clean.decomp.explained_variance_ratio_[0],
        "clean-axe2": Clean.decomp.explained_variance_ratio_[1],
        "dirty-axe1": Dirty.decomp.explained_variance_ratio_[0],
        "dirty-axe2": Dirty.decomp.explained_variance_ratio_[1],
        "clean-features": Clean.df.shape[1],
        "dirty-features": Dirty.df.shape[1]
    }

## Analysis

In [12]:
AllAnalysis = []

### Sexuality

In [22]:
Clean, Dirty, GenFreq = generate_adversarial(
    texts=TEXTS,
    lemma=WORDS,
    window=GLOBAL_WINDOW,
    normalization=NORMALIZATION,
    floor=MIN_FREQ,
    cooc_floor=COOC_MIN_FREQ,
    ignore_pos=["PUNC", "OUT"],
    filter_pos=POS
)

if PLOT:
    plotly_fig, pyplot_fig = plot(Clean, Dirty, GenFreq, plotly=True, pyplot=True)
    pyplot_fig.show()
    plotly_fig.show()
    #fig.write_html("importance-segmentation.html")
    #print(f"file://{os.getcwd()}/importance-segmentation.html")
    
AllAnalysis.append(generate_compilation(Clean, Dirty, "Mentula et al."))




Second zone ['castus2', 'rogo', 'deus', 'cunnus', 'uoco', 'puella', 'paedico2', 'mentula', 'puer', 'peto', 'caput', 'culus', 'effeminatus']


Unnamed: 0,desum1,castus2,soror,duco,templum,magnus,quoque,amor,semper,uenio,manus1,rogo,nox,precor,usque,pono,deus,pudor,cunnus,uoco,phoebus,fero,pars,pater,promitto,moueo,malo,puella,capillus,uersus1,scio,paedico2,placeo,cano,opus1,mentula,bene,hic2,lego2,grandis,fur,specto,sto,inguen,nunc,medius,eo1,puer,peto,longus,credo,libellus,maritus1,iubeo,uerbum,dono,multus,quaero,praesto1,licet1,caput,carmen1,sanctus,culus,lingua,uir,uxor,fama,uirgo,blandus,dignus,moneo,annus,mens,modo1,lasciuus,pomum,custos,amica,uiuo,posco,numus,flacco,cena,nascor,locus,uix,saepe,leuis1,nimium2,miser,superus,uendo,poeta,uox,mollis,timeo,grauis,poena,furtum,patior,natus1,futuo,philaenus,soleo,teneo,nouus,dies,nego,causa,os1,procul,aqua,cupio,nolo,domus,turba,urbs,nudus,lex,tristis,reddo,lingo,Uenus,fio,ludo,loquor,rideo,nomen,mater,trado,bonus,sacer,cinna,debeo,uotum,frons1,tener,dominus,forte,munus,diues,mitto,senex1,amicus1,improbus,amo,nosco,audio,caesar,tono,iaceo,paruus2,emo,Iuppiter,roma,felix
mentula,0.146341,0.064516,0.029412,0.0625,0.037037,0.023952,0.031746,0.029412,0.011494,0.020833,0.054545,0.022727,0.036364,0.064516,0.02439,0.015152,0.037383,0.060606,0.258065,0.047059,0.136364,0.008696,0.026316,0.013699,0.133333,0.096774,0.022222,0.058333,0.028571,0.05,0.039474,0.04,0.058824,0.333333,0.068182,0.027397,0.04,0.018868,0.010526,0.044444,0.033333,0.046512,0.111111,0.086957,0.014815,0.081633,0.015152,0.031447,0.019802,0.013158,0.016393,0.008403,0.023256,0.014493,0.085714,0.037037,0.016393,0.014085,0.020833,0.014493,0.088889,0.011111,0.09375,0.136364,0.055556,0.021277,0.015873,0.021739,0.083333,0.064516,0.057143,0.047619,0.016667,0.05,0.012195,0.027027,0.043478,0.0625,0.071429,0.025,0.030303,0.028571,0.090909,0.017857,0.023256,0.02381,0.027778,0.01087,0.018868,0.022222,0.020408,0.02381,0.02439,0.018868,0.025641,0.02439,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
paedico2,,,,,,,,,0.022989,,0.009091,0.007576,,,,,,0.030303,,0.011765,,,,,,,,,,0.025,0.013158,,0.014706,,,0.013699,0.013333,0.018868,,,0.033333,,,,,0.020408,,0.025157,0.009901,,,,0.069767,,,,0.02459,0.028169,,,,,,0.090909,,,,,0.041667,,,,,,0.012195,0.027027,0.043478,,,,,,,,0.023256,0.02381,,0.021739,,,0.020408,,,0.018868,,,0.018182,0.020408,0.105263,0.142857,0.04,0.04,0.0625,0.071429,0.024691,0.018868,0.023256,0.011765,0.027778,0.022727,0.011111,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
culus,,,,,,,0.015873,,0.011494,,,,,,0.02439,,,,0.032258,0.011765,,,,,,,,,,,0.013158,0.08,,,,0.041096,,0.018868,,,0.033333,0.023256,,,,,,0.012579,0.009901,,,,,,,,,,0.020833,,0.044444,,,0.363636,,,0.015873,,,,,,,,,0.027027,,,0.071429,0.025,,,,,,,,0.01087,,,0.061224,0.02381,,,0.025641,,,,,,,0.04,,,,,,,0.009259,,,0.411765,0.02381,0.015873,0.014286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
futuo,,0.064516,,0.03125,,,,,,0.006944,,0.015152,,,,0.015152,,0.030303,0.064516,,,,,0.027397,0.066667,,0.022222,0.016667,,,0.013158,0.08,0.014706,,,,0.013333,,,,0.033333,,,,0.014815,,0.015152,0.006289,0.009901,,,,0.023256,,0.028571,,,,0.020833,0.007246,,,,,0.027778,0.010638,0.015873,,0.083333,,,0.047619,,,,,,0.0625,,,,,,,,,0.027778,,,,0.020408,,,,,,0.018182,,0.105263,,,,0.0625,,0.012346,0.018868,,,0.018519,0.022727,,,,0.031746,,0.025641,0.016393,0.014706,0.025641,0.02439,0.042553,0.019608,0.214286,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
cunnus,0.02439,0.096774,,,,,,,,,,,,,,0.015152,0.037383,0.060606,,0.011765,,,,,0.066667,0.096774,0.066667,0.025,,,,,0.014706,0.222222,0.045455,0.109589,,,,,,,,0.173913,,,,0.012579,0.009901,,0.008197,,,,,,,,0.020833,0.007246,0.066667,,,0.045455,0.027778,0.021277,0.031746,,,,0.028571,,,0.2,,0.027027,,,,0.0125,,,,,,,,,,,0.040816,0.02381,,,,,,,,0.071429,0.08,,0.0625,0.071429,0.012346,,0.023256,,,,0.011111,,,,,,,,,,,,0.357143,0.108108,0.012048,0.018868,0.017241,0.023256,0.010638,0.022727,0.052632,,,,,,,,,,,,,,,,,,,,,,,,,,
improbus,,0.096774,,,,,,,,0.006944,0.009091,0.007576,,,0.04878,0.015152,0.028037,0.030303,,,,0.017391,,,,,,0.025,,,0.013158,,,,,,,0.018868,0.021053,,0.033333,,,,,0.040816,,,0.049505,0.013158,,0.008403,0.023256,0.028986,,,0.008197,0.028169,0.020833,0.007246,0.022222,,,,0.027778,0.010638,0.031746,0.021739,,,,,0.016667,,,0.054054,,,,,,,0.045455,,,0.02381,0.027778,0.032609,,,0.020408,0.02381,,0.018868,,,0.036364,,0.052632,0.071429,0.04,,,,,0.018868,,,,,0.011111,,,0.015873,,,,,,0.02439,0.021277,0.019608,,,,,,0.023256,0.010638,,,0.012658,0.027778,0.074074,0.031746,0.075,0.026316,0.033333,0.008929,0.05,0.009346,0.022727,0.013333,,,,,,,,,,,,,,
irrumo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.12,,,,,,,,,0.1,,,,,,,,,,,,0.023256,,,,,,,0.007246,,,,,,,,,,,,,,,,,,0.0625,,,,,,,,,,,,,,,,,,,,,0.105263,0.142857,0.08,,0.03125,,,,,,,0.022727,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.008929,,,,,0.020408,,,,,,,,,,,,,
lasciuus,0.02439,0.096774,,,,0.005988,0.015873,0.029412,,0.006944,,0.007576,0.018182,0.032258,,,0.009346,,0.032258,,,,0.026316,0.013699,,,,0.033333,,0.05,0.013158,0.04,,0.111111,0.045455,0.013699,,,0.021053,0.022222,,0.023256,,,0.014815,,,0.006289,,,,0.033613,,0.014493,0.028571,0.018519,0.008197,0.014085,,0.007246,,0.033333,0.03125,0.045455,0.027778,0.010638,,0.021739,,0.032258,,0.047619,,,0.012195,,0.043478,0.0625,0.035714,,,,,0.017857,,0.02381,,0.01087,,0.022222,,,,,0.025641,0.02439,,0.020408,0.052632,0.071429,,,,,,0.037736,,0.011765,,,0.011111,,,0.031746,,0.012821,,0.029412,0.051282,0.073171,0.021277,,,0.054054,,0.132075,,,0.010638,,,,,,,,0.026316,0.033333,0.008929,,,,0.013333,,0.012821,0.057143,0.03,0.036364,0.022727,0.01087,0.033333,0.02439,0.010989,,,,
fello,,0.032258,,,,,,,,,,0.015152,,,0.02439,,,,,,,,,,,,,,,,0.013158,0.04,,,,,,,,,,,,0.043478,,0.020408,,,,,,,,,,,,,,,,,,,0.027778,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.071429,,,,,0.009259,,,,0.011905,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.028571,,,,,,,,,,,
uoco,0.02439,0.032258,0.176471,,,0.005988,,,0.022989,0.020833,,0.022727,0.018182,,,0.030303,0.009346,0.030303,0.032258,0.141176,,0.017391,,0.027397,0.066667,,0.022222,0.008333,0.028571,0.025,0.039474,0.04,0.014706,0.111111,0.022727,0.054795,0.013333,0.037736,0.021053,0.022222,,0.023256,,,0.007407,0.020408,,0.012579,,0.039474,0.016393,0.008403,,0.014493,,0.018519,0.008197,,0.020833,0.007246,,0.011111,0.03125,0.045455,,0.021277,0.047619,0.021739,0.041667,0.032258,0.028571,,,,0.02439,,0.043478,0.0625,0.035714,0.0125,,,0.045455,0.160714,0.023256,0.02381,,0.01087,0.018868,,0.020408,,,0.018868,0.025641,,,,,,,0.12,,0.071429,0.024691,,,0.011765,0.009259,0.022727,,,,,0.014286,0.051282,0.04918,,0.025641,,0.021277,,,,,0.018868,,0.023256,0.074468,0.090909,,0.012658,0.027778,0.111111,0.015873,0.025,,,0.0625,,0.028037,0.022727,0.02,0.020408,0.012821,,0.02,0.072727,0.022727,0.021739,0.033333,,0.010989,0.038462,0.041667,0.014706,0.018868


Unnamed: 0,desum1,castus2,soror,duco,templum,magnus,quoque,amor,semper,uenio,manus1,rogo,nox,precor,usque,pono,deus,pudor,cunnus,uoco,phoebus,fero,pars,pater,promitto,moueo,malo,puella,capillus,uersus1,scio,paedico2,placeo,cano,opus1,mentula,bene,hic2,lego2,grandis,fur,specto,sto,inguen,nunc,medius,eo1,puer,peto,longus,credo,libellus,maritus1,iubeo,uerbum,dono,multus,quaero,praesto1,licet1,caput,carmen1,sanctus,culus,lingua,uir,uxor,fama,uirgo,blandus,dignus,moneo,annus,mens,modo1,lasciuus,pomum,custos,amica,uiuo,posco,numus,flacco,cena,nascor,locus,uix,saepe,leuis1,nimium2,miser,superus,uendo,poeta,uox,mollis,timeo,grauis,poena,furtum,patior,natus1,futuo,philaenus,soleo,teneo,nouus,dies,nego,causa,os1,procul,aqua,cupio,nolo,domus,turba,urbs,nudus,lex,tristis,reddo,lingo,Uenus,fio,ludo,loquor,rideo,nomen,mater,trado,bonus,sacer,cinna,debeo,uotum,frons1,tener,dominus,forte,munus,diues,mitto,senex1,amicus1,improbus,amo,nosco,audio,caesar,tono,iaceo,paruus2,emo,Iuppiter,roma,felix
mentula,0.146341,0.064516,0.029412,0.0625,0.037037,0.023952,0.031746,0.029412,0.011494,0.020833,0.054545,0.022727,0.036364,0.064516,0.02439,0.015152,0.037383,0.060606,0.258065,0.047059,0.136364,0.008696,0.026316,0.013699,0.133333,0.096774,0.022222,0.058333,0.028571,0.05,0.039474,0.04,0.058824,0.333333,0.068182,0.027397,0.04,0.018868,0.010526,0.044444,0.033333,0.046512,0.111111,0.086957,0.014815,0.081633,0.015152,0.031447,0.019802,0.013158,0.016393,0.008403,0.023256,0.014493,0.085714,0.037037,0.016393,0.014085,0.020833,0.014493,0.088889,0.011111,0.09375,0.136364,0.055556,0.021277,0.015873,0.021739,0.083333,0.064516,0.057143,0.047619,0.016667,0.05,0.012195,0.027027,0.043478,0.0625,0.071429,0.025,0.030303,0.028571,0.090909,0.017857,0.023256,0.02381,0.027778,0.01087,0.018868,0.022222,0.020408,0.02381,0.02439,0.018868,0.025641,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
paedico2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022989,0.0,0.009091,0.007576,0.0,0.0,0.0,0.0,0.0,0.030303,0.0,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.013158,0.0,0.014706,0.0,0.0,0.013699,0.013333,0.018868,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.020408,0.0,0.025157,0.009901,0.0,0.0,0.0,0.069767,0.0,0.0,0.0,0.02459,0.028169,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.012195,0.027027,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.02381,0.0,0.021739,0.0,0.0,0.020408,0.0,0.0,0.018868,0.0,0.0,0.018182,0.020408,0.105263,0.142857,0.04,0.04,0.0625,0.071429,0.024691,0.018868,0.023256,0.011765,0.027778,0.022727,0.011111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
culus,0.0,0.0,0.0,0.0,0.0,0.0,0.015873,0.0,0.011494,0.0,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,0.0,0.032258,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.08,0.0,0.0,0.0,0.041096,0.0,0.018868,0.0,0.0,0.033333,0.023256,0.0,0.0,0.0,0.0,0.0,0.012579,0.009901,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.044444,0.0,0.0,0.363636,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.071429,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01087,0.0,0.0,0.061224,0.02381,0.0,0.0,0.025641,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.009259,0.0,0.0,0.411765,0.02381,0.015873,0.014286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
futuo,0.0,0.064516,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.006944,0.0,0.015152,0.0,0.0,0.0,0.015152,0.0,0.030303,0.064516,0.0,0.0,0.0,0.0,0.027397,0.066667,0.0,0.022222,0.016667,0.0,0.0,0.013158,0.08,0.014706,0.0,0.0,0.0,0.013333,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.014815,0.0,0.015152,0.006289,0.009901,0.0,0.0,0.0,0.023256,0.0,0.028571,0.0,0.0,0.0,0.020833,0.007246,0.0,0.0,0.0,0.0,0.027778,0.010638,0.015873,0.0,0.083333,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.105263,0.0,0.0,0.0,0.0625,0.0,0.012346,0.018868,0.0,0.0,0.018519,0.022727,0.0,0.0,0.0,0.031746,0.0,0.025641,0.016393,0.014706,0.025641,0.02439,0.042553,0.019608,0.214286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cunnus,0.02439,0.096774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,0.037383,0.060606,0.0,0.011765,0.0,0.0,0.0,0.0,0.066667,0.096774,0.066667,0.025,0.0,0.0,0.0,0.0,0.014706,0.222222,0.045455,0.109589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173913,0.0,0.0,0.0,0.012579,0.009901,0.0,0.008197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.007246,0.066667,0.0,0.0,0.045455,0.027778,0.021277,0.031746,0.0,0.0,0.0,0.028571,0.0,0.0,0.2,0.0,0.027027,0.0,0.0,0.0,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040816,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.08,0.0,0.0625,0.071429,0.012346,0.0,0.023256,0.0,0.0,0.0,0.011111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.357143,0.108108,0.012048,0.018868,0.017241,0.023256,0.010638,0.022727,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
improbus,0.0,0.096774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006944,0.009091,0.007576,0.0,0.0,0.04878,0.015152,0.028037,0.030303,0.0,0.0,0.0,0.017391,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.013158,0.0,0.0,0.0,0.0,0.0,0.0,0.018868,0.021053,0.0,0.033333,0.0,0.0,0.0,0.0,0.040816,0.0,0.0,0.049505,0.013158,0.0,0.008403,0.023256,0.028986,0.0,0.0,0.008197,0.028169,0.020833,0.007246,0.022222,0.0,0.0,0.0,0.027778,0.010638,0.031746,0.021739,0.0,0.0,0.0,0.0,0.016667,0.0,0.0,0.054054,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.02381,0.027778,0.032609,0.0,0.0,0.020408,0.02381,0.0,0.018868,0.0,0.0,0.036364,0.0,0.052632,0.071429,0.04,0.0,0.0,0.0,0.0,0.018868,0.0,0.0,0.0,0.0,0.011111,0.0,0.0,0.015873,0.0,0.0,0.0,0.0,0.0,0.02439,0.021277,0.019608,0.0,0.0,0.0,0.0,0.0,0.023256,0.010638,0.0,0.0,0.012658,0.027778,0.074074,0.031746,0.075,0.026316,0.033333,0.008929,0.05,0.009346,0.022727,0.013333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
irrumo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.007246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,0.142857,0.08,0.0,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.008929,0.0,0.0,0.0,0.0,0.020408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lasciuus,0.02439,0.096774,0.0,0.0,0.0,0.005988,0.015873,0.029412,0.0,0.006944,0.0,0.007576,0.018182,0.032258,0.0,0.0,0.009346,0.0,0.032258,0.0,0.0,0.0,0.026316,0.013699,0.0,0.0,0.0,0.033333,0.0,0.05,0.013158,0.04,0.0,0.111111,0.045455,0.013699,0.0,0.0,0.021053,0.022222,0.0,0.023256,0.0,0.0,0.014815,0.0,0.0,0.006289,0.0,0.0,0.0,0.033613,0.0,0.014493,0.028571,0.018519,0.008197,0.014085,0.0,0.007246,0.0,0.033333,0.03125,0.045455,0.027778,0.010638,0.0,0.021739,0.0,0.032258,0.0,0.047619,0.0,0.0,0.012195,0.0,0.043478,0.0625,0.035714,0.0,0.0,0.0,0.0,0.017857,0.0,0.02381,0.0,0.01087,0.0,0.022222,0.0,0.0,0.0,0.0,0.025641,0.02439,0.0,0.020408,0.052632,0.071429,0.0,0.0,0.0,0.0,0.0,0.037736,0.0,0.011765,0.0,0.0,0.011111,0.0,0.0,0.031746,0.0,0.012821,0.0,0.029412,0.051282,0.073171,0.021277,0.0,0.0,0.054054,0.0,0.132075,0.0,0.0,0.010638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.033333,0.008929,0.0,0.0,0.0,0.013333,0.0,0.012821,0.057143,0.03,0.036364,0.022727,0.01087,0.033333,0.02439,0.010989,0.0,0.0,0.0,0.0
fello,0.0,0.032258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015152,0.0,0.0,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.020408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.009259,0.0,0.0,0.0,0.011905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
uoco,0.02439,0.032258,0.176471,0.0,0.0,0.005988,0.0,0.0,0.022989,0.020833,0.0,0.022727,0.018182,0.0,0.0,0.030303,0.009346,0.030303,0.032258,0.141176,0.0,0.017391,0.0,0.027397,0.066667,0.0,0.022222,0.008333,0.028571,0.025,0.039474,0.04,0.014706,0.111111,0.022727,0.054795,0.013333,0.037736,0.021053,0.022222,0.0,0.023256,0.0,0.0,0.007407,0.020408,0.0,0.012579,0.0,0.039474,0.016393,0.008403,0.0,0.014493,0.0,0.018519,0.008197,0.0,0.020833,0.007246,0.0,0.011111,0.03125,0.045455,0.0,0.021277,0.047619,0.021739,0.041667,0.032258,0.028571,0.0,0.0,0.0,0.02439,0.0,0.043478,0.0625,0.035714,0.0125,0.0,0.0,0.045455,0.160714,0.023256,0.02381,0.0,0.01087,0.018868,0.0,0.020408,0.0,0.0,0.018868,0.025641,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0,0.071429,0.024691,0.0,0.0,0.011765,0.009259,0.022727,0.0,0.0,0.0,0.0,0.014286,0.051282,0.04918,0.0,0.025641,0.0,0.021277,0.0,0.0,0.0,0.0,0.018868,0.0,0.023256,0.074468,0.090909,0.0,0.012658,0.027778,0.111111,0.015873,0.025,0.0,0.0,0.0625,0.0,0.028037,0.022727,0.02,0.020408,0.012821,0.0,0.02,0.072727,0.022727,0.021739,0.033333,0.0,0.010989,0.038462,0.041667,0.014706,0.018868




(17, 157) Shape
Second zone ['castus2', 'magnus', 'rogo', 'deus', 'cunnus', 'uoco', 'puella', 'placeo', 'paedico2', 'mentula', 'puer', 'peto', 'libellus', 'caput', 'culus', 'saepe', 'effeminatus']
(21, 327) Shape


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

### Books

In [None]:
words = ["carmen1", "lego2", "scribo", "libellus", "poeta", "liber1"]
Clean, Dirty, GenFreq = generate_adversarial(
    texts=TEXTS,
    lemma=words,
    window=GLOBAL_WINDOW,
    normalization=NORMALIZATION,
    floor=MIN_FREQ,
    cooc_floor=COOC_MIN_FREQ,
    ignore_pos=["PUNC", "OUT"],
    filter_pos=POS
)

if PLOT:
    plotly_fig, pyplot_fig = plot(Clean, Dirty, GenFreq, plotly=True, pyplot=True)
    pyplot_fig.show()
    plotly_fig.show()
    #fig.write_html("importance-segmentation.html")
    #print(f"file://{os.getcwd()}/importance-segmentation.html")
    
AllAnalysis.append(generate_compilation(Clean, Dirty, "Carmen et al."))

### Puer et Puella

In [None]:
Clean, Dirty, GenFreq = generate_adversarial(
    texts=TEXTS,
    lemma=["puer", "puella", "uir"],
    window=GLOBAL_WINDOW,
    normalization=NORMALIZATION,
    floor=MIN_FREQ,
    cooc_floor=COOC_MIN_FREQ,
    ignore_pos=["PUNC", "OUT"],
    filter_pos=POS
)

if PLOT or True:
    plotly_fig, pyplot_fig = plot(Clean, Dirty, GenFreq, plotly=True, pyplot=True)
    pyplot_fig.show()
    plotly_fig.show()
    #fig.write_html("importance-segmentation.html")
    #print(f"file://{os.getcwd()}/importance-segmentation.html")
    
AllAnalysis.append(generate_compilation(Clean, Dirty, "Puer et al."))

## Analysis of Analysis

In [None]:
evolution = DataFrame(AllAnalysis)
show = []
for category in [col.split("-")[1] for col in evolution.columns if "-" in col]: 
    if f"ratio-{category}" in evolution.columns:
        continue
    evolution[f"ratio-{category}"] = evolution[f"dirty-{category}"] / evolution[f"clean-{category}"]
    show.append(f"ratio-{category}")
    
SHOW_ALL = True
if SHOW_ALL:
    display(HTML(evolution.to_html()))
else:
    display(HTML(evolution[["mots", *show]].to_html()))
    
print(evolution.to_markdown(floatfmt="0.2f", index=False))

## Playground

In [None]:

#fa_dirty.noise_variance_

In [None]:

                                

    
get_general_frequencies(words=['soror', 'audeo', 'duco', 'priapium', 'sum1'], texts=TEXTS)