In [1]:
# Physical index of the GPU this notebook should use; always pick one with 0% usage.
AVAILABLE_GPU = 1

import os
# Order devices by PCI bus id so that AVAILABLE_GPU refers to a stable physical device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only the selected GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = f"{AVAILABLE_GPU}"
# Once CUDA_VISIBLE_DEVICES exposes a single GPU, the process sees it as device 0
# whatever its physical index is, so the device string must use index 0
# (using AVAILABLE_GPU here would name a device that is not visible).
tf_device = '/gpu:0'

In [2]:
from datasets import load_dataset, Dataset
import pandas as pd
import re
from transformers import AutoModel, AutoTokenizer
import torch
from torch.nn.functional import normalize
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def printred(text, word):
    """Print `text` with every occurrence of `word` highlighted in red.

    Occurrences are matched left to right without overlapping and wrapped in
    the ANSI escape codes ESC[91m ... ESC[0m. Nothing is appended after the
    text (no trailing newline).

    Args:
        text: The string to print.
        word: The substring to highlight. When empty, `text` is printed
            unchanged (an empty needle used to loop forever).
    """
    if not word:
        # An empty needle matches everywhere without consuming any input.
        print(text, end='')
        return
    # str.replace scans left to right over non-overlapping matches, which is
    # exactly what the manual character-by-character scan did.
    print(text.replace(word, '\033[91m' + word + '\033[0m'), end='')

def find_sub_list(sl, l):
    """Locate the first occurrence of `sl` as a contiguous slice of `l`.

    Args:
        sl: The sub-list to look for.
        l: The list to search in.

    Returns:
        A tuple `(start, end)` such that `l[start:end] == sl`, or `None` when
        `sl` does not occur in `l` or `sl` is empty.
    """
    sll = len(sl)
    if sll == 0:
        # Nothing to look for; previously this raised IndexError on sl[0].
        return None
    first = sl[0]
    # Positions past len(l) - sll cannot hold a full-length match.
    for ind in range(len(l) - sll + 1):
        if l[ind] == first and l[ind:ind + sll] == sl:
            return ind, ind + sll
    return None

# Visual smoke test: the substring "person" should be rendered in red.
printred(text="There was a lot of persons.", word="person")

There was a lot of [91mperson[0ms.

In [4]:
# Load the chunked Old Spanish corpus from a TSV file and wrap it in a HF Dataset.
old_dataset = Dataset.from_pandas(
    pd.read_csv(
        "./data/old-spanish-corpus-chunked.tsv",
        sep="\t",
        # Read both identifier columns as strings. The column is called
        # 'source_id_text' (see the printed dataset features); the former key
        # 'source_text_id' matched no column, so it had no effect.
        dtype={'source_id': str, 'source_id_text': str},
    )
)
print("OLD dataset:", old_dataset)

# Keep only the rows whose `source` is the 19th century Latam newspapers collection.
old_dataset_latam = old_dataset.filter(lambda example: example["source"] == "19th century Latam Newspapers")
print("OLD LATAM dataset:", old_dataset_latam)

OLD dataset: Dataset({
    features: ['source', 'source_id', 'source_id_text', 'title', 'date', 'place', 'text'],
    num_rows: 1518717
})


Filter: 100%|██████████████| 1518717/1518717 [00:12<00:00, 121230.57 examples/s]

OLD LATAM dataset: Dataset({
    features: ['source', 'source_id', 'source_id_text', 'title', 'date', 'place', 'text'],
    num_rows: 21310
})





In [5]:
# Modern Spanish reference corpus: the "train" split of the EUBookShop configuration.
# TODO: change the modern Spanish dataset
modern_dataset = load_dataset(
    path="large_spanish_corpus",
    name="EUBookShop",
    trust_remote_code=True,
)["train"]
print("MODERN dataset:", modern_dataset)

MODERN dataset: Dataset({
    features: ['text'],
    num_rows: 8214959
})


In [6]:
# Hugging Face checkpoint shared by the tokenizer and the "new" (off-the-shelf) model.
HF_CHECKPOINT = "dccuchile/bert-base-spanish-wwm-uncased"
tokenizer = AutoTokenizer.from_pretrained(HF_CHECKPOINT)

# Use the GPU when one is visible to this process, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# "Old" model: a whole model object saved with torch.save.
# map_location remaps the stored tensors onto the device chosen above, so the
# load does not depend on the device the model was saved from.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load
# files from a trusted source. Recent PyTorch releases may also require
# weights_only=False to load a full model object; confirm for the installed version.
model_old = torch.load("./output/latam-old-spanish-beto-uncased.pt", map_location=device)
model_new = AutoModel.from_pretrained(HF_CHECKPOINT)

# Both models are only used for inference below. eval() disables dropout, which
# torch.no_grad() alone does not do; a model pickled during training may still
# be in training mode and would then produce non-deterministic embeddings.
model_old = model_old.to(device).eval()
model_new = model_new.to(device).eval()

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Target words. Each entry lists the spellings of one word: the first spelling
# is the canonical one (used as the word label and as the search term in the
# modern corpus); all spellings are searched in the old corpus.
searchs = [
    ["gente", "jente"],
    ["servidores"], ["humanitaria"], ["humor"],
    ["privilegiado", "privilejiado"],
    ["ventajosa"], ["hombre"], ["placer"],
    ["razón", "razon"],
    ["juicio"],
    ["excitación", "exitación", "exitacion"],
    ["urbanidad"], ["academia"], ["miserable"],
    ["nebulosas", "nebulosa"],
    ["privado"], ["diablo"],
    ["luces", "luzes"],
    ["genio", "jenio"],
    ["mujeres"],
]

# Maximum number of old-corpus occurrences collected per word.
MAX_N_OLD = 50

In [8]:
# Column-oriented accumulator: one entry per located word occurrence.
# The cluster columns are placeholders that are filled in by the clustering step.
data = {"word":[], "orth":[], "text":[], "period":[], "cluster_new":[], "cluster_old":[], "embedding_new":[], "embedding_old":[]}


def _embed_occurrence(text, word, word_ids):
    """Embed the first occurrence of `word` in `text` with both models.

    `text` is tokenized (truncated/padded to 512 tokens) and `word_ids` is
    located inside the resulting token ids. The embedding is the hidden state
    at the first token of the matched span only, taken from
    `model_new(...).last_hidden_state` and `model_old(...).hidden_states[-1]`.

    Returns:
        `(embedding_new, embedding_old)`, both L2-normalised CPU tensors, or
        `None` when the token sequence of `word` is not present in the
        tokenized text (e.g. the occurrence lies beyond the truncation limit).
    """
    inp = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    span = find_sub_list(word_ids, inp['input_ids'].tolist()[0])
    if span is None:
        # The regex matched the raw text but the tokens are not in the
        # (truncated) input; unpacking None used to crash here.
        return None
    idx, idxF = span
    decoded = tokenizer.decode(inp['input_ids'][0][idx:idxF])
    assert decoded == word, f"Expected {word} but got {decoded}"
    with torch.no_grad():
        input_ids = inp["input_ids"].to(device)
        attention_mask = inp["attention_mask"].to(device)
        embedding_new = model_new(input_ids, attention_mask=attention_mask, output_hidden_states=True).last_hidden_state[0][idx].cpu()
        embedding_old = model_old(input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1][0][idx].cpu()
    return normalize(embedding_new, p=2, dim=-1), normalize(embedding_old, p=2, dim=-1)


def _record_occurrence(word, orth, text, period, embeddings):
    """Append one occurrence (and its two embeddings) to the `data` accumulator."""
    embedding_new, embedding_old = embeddings
    data['word'].append(word)
    data['orth'].append(orth)
    data['text'].append(text)
    data['period'].append(period)
    data['cluster_new'].append(None)
    data['cluster_old'].append(None)
    data['embedding_new'].append(embedding_new)
    data['embedding_old'].append(embedding_old)


for search in searchs:
    w = search[0]
    # Per-spelling whole-word regex and token ids, computed once per word.
    # [1:-1] drops the first and last ids returned by the tokenizer
    # (presumably the special start/end tokens -- TODO confirm).
    spellings = [
        (s, re.compile(r'\b' + re.escape(s) + r'\b'), tokenizer(s)['input_ids'][1:-1])
        for s in search
    ]

    # Old corpus: at most MAX_N_OLD texts, each counted once under the first
    # spelling that can be located in it.
    i = 0
    for example in old_dataset_latam:
        if i >= MAX_N_OLD:
            break
        text = example["text"]
        for s, pattern, s_ids in spellings:
            if not pattern.search(text):
                continue
            embeddings = _embed_occurrence(text, s, s_ids)
            if embeddings is None:
                continue
            _record_occurrence(w, s, text, 'old', embeddings)
            i += 1
            break
    print(f"{w}: found {i} in OLD ", end="")

    # Modern corpus: collect as many occurrences as were found in the old one.
    # The limit is checked before each example so that a limit of 0 stops
    # immediately instead of scanning the whole corpus.
    j = 0
    MAX_N_NEW = i
    _, w_pattern, w_ids = spellings[0]
    for example in modern_dataset:
        if j >= MAX_N_NEW:
            break
        text = example["text"]
        if not w_pattern.search(text):
            continue
        embeddings = _embed_occurrence(text, w, w_ids)
        if embeddings is None:
            continue
        _record_occurrence(w, w, text, 'new', embeddings)
        j += 1
    print(f"and {j} in NEW")

df = pd.DataFrame(data)
df

gente: found 50 in OLD and 50 in NEW
servidores: found 50 in OLD and 50 in NEW
humanitaria: found 21 in OLD and 21 in NEW
humor: found 50 in OLD and 50 in NEW
privilegiado: found 47 in OLD and 47 in NEW
ventajosa: found 26 in OLD and 26 in NEW
hombre: found 50 in OLD and 50 in NEW
placer: found 50 in OLD and 50 in NEW
razón: found 50 in OLD and 50 in NEW
juicio: found 50 in OLD and 50 in NEW
excitación: found 42 in OLD and 39 in NEW
urbanidad: found 37 in OLD and 10 in NEW
academia: found 15 in OLD and 15 in NEW
miserable: found 50 in OLD and 50 in NEW
nebulosas: found 28 in OLD and 17 in NEW
privado: found 50 in OLD and 50 in NEW
diablo: found 50 in OLD and 50 in NEW
luces: found 50 in OLD and 50 in NEW
genio: found 50 in OLD and 50 in NEW
mujeres: found 50 in OLD and 50 in NEW


Unnamed: 0,word,orth,text,period,cluster_new,cluster_old,embedding_new,embedding_old
0,gente,gente,"Llevóle entonces a la azotea de "" Casa , que s...",old,,,"[tensor(0.0526), tensor(-0.0099), tensor(0.056...","[tensor(0.0583), tensor(-0.0013), tensor(0.032..."
1,gente,jente,Um. 9. EL TEMA. NOVENA SESIÓN. A las 11 y 11 m...,old,,,"[tensor(0.0150), tensor(-0.0125), tensor(0.022...","[tensor(0.0208), tensor(0.0295), tensor(0.0194..."
2,gente,jente,Verdaderamente cívicas y domésticas ; arrase l...,old,,,"[tensor(-0.0097), tensor(-0.0229), tensor(0.00...","[tensor(0.0218), tensor(0.0414), tensor(0.0301..."
3,gente,jente,"Llueven de 27 en 27 sobre mis costillas, como ...",old,,,"[tensor(-0.0174), tensor(-0.0051), tensor(0.01...","[tensor(-0.0280), tensor(0.0320), tensor(0.016..."
4,gente,jente,"do-el Pique-la Hormiga-la Pulga, un Ratón,dos ...",old,,,"[tensor(-8.6513e-05), tensor(0.0079), tensor(0...","[tensor(0.0080), tensor(0.0582), tensor(0.0359..."
...,...,...,...,...,...,...,...,...
1686,mujeres,mujeres,"Y en algunos países, como España, la diferenci...",new,,,"[tensor(0.0446), tensor(0.0081), tensor(0.0304...","[tensor(-0.0083), tensor(0.0119), tensor(0.013..."
1687,mujeres,mujeres,Mientras las propuestas del pacto de confianza...,new,,,"[tensor(-0.0235), tensor(-0.0301), tensor(0.08...","[tensor(-0.0370), tensor(-0.0170), tensor(0.02..."
1688,mujeres,mujeres,Y es evidente que el desempleo afecta mucho má...,new,,,"[tensor(0.0677), tensor(0.0124), tensor(0.0379...","[tensor(0.0153), tensor(-0.0008), tensor(0.036..."
1689,mujeres,mujeres,"Agradecemos al ponente, Sr. Lage, que haya inc...",new,,,"[tensor(0.0355), tensor(-0.0040), tensor(0.042...","[tensor(-0.0151), tensor(-0.0020), tensor(0.04..."


In [9]:
def get_silhouette(tensors, kmeans):
    """Silhouette score (euclidean) of a fitted KMeans clustering.

    Args:
        tensors: Iterable of torch tensors, one per clustered sample; each is
            flattened and converted to a NumPy row.
        kmeans: Fitted estimator exposing `n_clusters` and `labels_`.

    Returns:
        The value of `silhouette_score`, or 0 when the clustering has fewer
        than two distinct labels (the silhouette is undefined in that case).
    """
    labels = kmeans.labels_
    # A single-cluster solution gets the neutral score 0. The same applies when
    # KMeans collapsed everything into one label despite n_clusters > 1, for
    # which silhouette_score would raise.
    if kmeans.n_clusters == 1 or np.unique(labels).size < 2:
        return 0
    X = np.array([tensor.flatten().numpy() for tensor in tensors])
    return silhouette_score(X, labels=labels, metric='euclidean')

def get_delta_score(tensors, kmeans):
    """Placeholder for the delta score of a fitted clustering (not implemented).

    Raises:
        NotImplementedError: always. The stub used to return None silently,
            which only surfaced later as an unrelated error when selecting
            the best number of clusters.
    """
    # TODO: implement delta_score
    raise NotImplementedError("delta score is not implemented yet")

def elbow_method(inertia_values):
    """Pick a cluster count from a list of inertias using the largest drop.

    `inertia_values[k]` is taken to be the inertia for `k + 1` clusters. The
    function finds the largest decrease between two consecutive values (the
    first one on ties) and returns the cluster count reached after that drop.

    Raises:
        ValueError: if fewer than two inertia values are given.
    """
    drops = [before - after for before, after in zip(inertia_values, inertia_values[1:])]
    # max() keeps the first maximal element, matching list.index(max(...)).
    steepest = max(range(len(drops)), key=drops.__getitem__)
    return steepest + 2

def _fit_cluster_range(embeddings, metric):
    """Fit KMeans on `embeddings` for every cluster count from 1 to len - 1.

    Args:
        embeddings: pandas Series of torch tensors (one per sample).
        metric: 'inertia' to collect inertias; otherwise each fit is scored
            with `compute_metric` and, for 'silhouette' / 'compactness', the
            highest-scoring cluster count is tracked.

    Returns:
        `(labels_by_n, best_n, inertias)`: the labels of every fit keyed by
        cluster count, the best cluster count by score (0 if none was
        selected), and the list of inertias (empty unless metric == 'inertia').
    """
    # Loop-invariant: build the sample matrix once instead of once per fit.
    X = [t.numpy() for t in embeddings.tolist()]
    labels_by_n, inertias = {}, []
    best_score, best_n = -1, 0
    for n in range(1, len(X)):
        kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto')
        kmeans.fit(X)
        labels_by_n[n] = kmeans.labels_
        if metric == 'inertia':
            inertias.append(kmeans.inertia_)
            continue
        score = compute_metric(embeddings, kmeans, metric)
        # NOTE(review): only 'silhouette' and 'compactness' are recognised here,
        # while the run cell advertises "delta" -- confirm the intended name.
        if metric in ('silhouette', 'compactness') and score is not None and score > best_score:
            best_score, best_n = score, n
    return labels_by_n, best_n, inertias


def clustering(df, metric):
    """Cluster the occurrences of one word under both embedding spaces.

    KMeans is fitted on `embedding_new` and on `embedding_old` for every
    cluster count from 1 to len(df) - 1; the best count is chosen per space by
    `metric` ('inertia' uses `elbow_method`, otherwise the highest score wins).

    Args:
        df: Frame with `embedding_new`, `embedding_old` and `word` columns.
            It is not modified.
        metric: Name of the selection metric.

    Returns:
        A copy of `df` whose `cluster_new` / `cluster_old` columns hold the
        labels of the selected clusterings.

    Raises:
        ValueError: if no cluster count could be selected (unsupported metric
            or too few rows).
    """
    labels_new, best_n_new, inertias_new = _fit_cluster_range(df['embedding_new'], metric)
    labels_old, best_n_old, inertias_old = _fit_cluster_range(df['embedding_old'], metric)

    if metric == 'inertia':
        best_n_new = elbow_method(inertias_new)
        best_n_old = elbow_method(inertias_old)
        if SHOULD_PRINT:
            plt.figure(figsize=(10, 5))
            plt.subplot(1, 2, 1)
            plt.plot(range(1, len(inertias_new) + 1), inertias_new, marker='o')
            plt.xlabel('Number of clusters')
            plt.ylabel('Inertia')
            plt.title('Inertia (new model)')

            plt.subplot(1, 2, 2)
            plt.plot(range(1, len(inertias_old) + 1), inertias_old, marker='o')
            plt.xlabel('Number of clusters')
            plt.ylabel('Inertia')
            plt.title('Inertia (old model)')

            plt.suptitle(f"Inertia for word '{df.iloc[0]['word']}'")
            plt.show()

    if best_n_new not in labels_new or best_n_old not in labels_old:
        # Previously surfaced as KeyError 'cluster_new_0'.
        raise ValueError(
            f"No cluster count could be selected with metric {metric!r} for {len(df)} rows"
        )

    # Work on a copy so the caller's frame (a groupby slice) is left untouched.
    df = df.copy()
    df['cluster_new'] = labels_new[best_n_new]
    df['cluster_old'] = labels_old[best_n_old]

    if SHOULD_PRINT:
        print("Best number of clusters (new):", best_n_new, f"[max {len(df)}]")
        print("Best number of clusters (old):", best_n_old, f"[max {len(df)}]")
    return df

def compute_metric(tensors, kmeans, method="silhouette"):
    """Score a fitted clustering with the requested method.

    "silhouette" dispatches to `get_silhouette`; any other value dispatches to
    `get_delta_score`.
    """
    if method == "silhouette":
        return get_silhouette(tensors, kmeans)
    return get_delta_score(tensors, kmeans)

def cluster_df(df, metric="silhouette"):
    """Cluster every word of `df` independently and merge the results.

    Rows are grouped by `word`; each group is passed to `clustering`. The
    embedding columns are dropped from the result.

    Args:
        df: Frame with at least `word`, `embedding_new` and `embedding_old`.
        metric: Selection metric forwarded to `clustering`.

    Returns:
        A frame with one row per input row (ordered by group, i.e. by word),
        without the embedding columns, and with an extra `index` column
        produced by `reset_index()`.
    """
    clustered_groups = []
    for word, group in df.groupby('word'):
        if SHOULD_PRINT:
            print(f"{word}")
        group_cl = clustering(group, metric)
        assert len(group) == len(group_cl), f"{len(group)} != {len(group_cl)} for word {word}"
        clustered_groups.append(group_cl)

    if not clustered_groups:
        # Empty input: nothing to cluster, return the (empty) frame in the same shape.
        return df.drop(columns=["embedding_new", "embedding_old"]).reset_index()

    # Concatenate once at the end; growing a frame with concat inside the loop
    # copies all previous rows on every iteration.
    result_df = pd.concat(clustered_groups, ignore_index=True)

    # TODO: restore the original row order once an 'id' column is available:
    #result_df = result_df.set_index('id')
    #result_df = result_df.reindex(df['id'])
    result_df = result_df.drop(columns=["embedding_new", "embedding_old"])
    return result_df.reset_index()

In [14]:
import warnings
# Suppress all warnings from this point on.
warnings.filterwarnings(action='ignore')

# When True, clustering() and cluster_df() print their progress (and plot inertias).
SHOULD_PRINT = True
# Metric used to choose the number of clusters.
CLUSTERING_METRIC = "silhouette" # "silhouette", "inertia" or "delta"
# TODO: Add a new option (after the delta metric) to allow differentiating between period clustering (different clustering for each) ...
# ... And TODO: THEN, compare the centroids of the clusters to determine whether it's the same cluster or not

result_df = cluster_df(df, metric=CLUSTERING_METRIC)
result_df

academia
Best number of clusters (new): 2 [max 30]
Best number of clusters (old): 2 [max 30]
diablo
Best number of clusters (new): 4 [max 100]
Best number of clusters (old): 2 [max 100]
excitación
Best number of clusters (new): 2 [max 81]
Best number of clusters (old): 2 [max 81]
genio
Best number of clusters (new): 3 [max 100]
Best number of clusters (old): 2 [max 100]
gente
Best number of clusters (new): 2 [max 100]
Best number of clusters (old): 2 [max 100]
hombre
Best number of clusters (new): 2 [max 100]
Best number of clusters (old): 2 [max 100]
humanitaria
Best number of clusters (new): 2 [max 42]
Best number of clusters (old): 2 [max 42]
humor
Best number of clusters (new): 2 [max 100]
Best number of clusters (old): 2 [max 100]
juicio
Best number of clusters (new): 3 [max 100]
Best number of clusters (old): 2 [max 100]
luces
Best number of clusters (new): 3 [max 100]
Best number of clusters (old): 2 [max 100]
miserable
Best number of clusters (new): 2 [max 100]
Best number of c

Unnamed: 0,index,word,orth,text,period,cluster_new,cluster_old
0,0,academia,academia,151 elipsis cuando se suprime el nombre ó el v...,old,0,1
1,1,academia,academia,"REDACTOR, JOSH L. CAMACHO La Ciencia, con su p...",old,0,1
2,2,academia,academia,"Yaro, por no tener partida en el Presupuesto d...",old,0,1
3,3,academia,academia,fué destinado al Convento de Murcia en cuya cé...,old,0,0
4,4,academia,academia,Las aptitudes físicas y morales que necesita q...,old,0,1
...,...,...,...,...,...,...,...
1686,1686,ventajosa,ventajosa,El BEI lanza una oferta de canje de deuda por ...,new,1,28
1687,1687,ventajosa,ventajosa,La resolución de las controversias entre la ad...,new,0,2
1688,1688,ventajosa,ventajosa,223 h) aprobará en un plazo de treinta días la...,new,0,2
1689,1689,ventajosa,ventajosa,La consulta es especialmente ventajosa cuando ...,new,1,8


In [15]:
# Name the output after the metric that was actually used, so that runs with a
# different CLUSTERING_METRIC neither overwrite nor mislabel each other.
result_df.to_csv(f'./output/ssd-clustered-{CLUSTERING_METRIC}.tsv', sep='\t', index=False)