### Imports

In [None]:
!pip install -U huggingface_hub
!pip install -U datasets
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import torch

from transformers import AutoTokenizer, AutoModelForMaskedLM

from datasets import Dataset, DatasetDict, load_from_disk, ClassLabel
import datasets

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Datasets

In [None]:
# Path selection: both CSV files live in the same Drive folder, so the folder is written once.
# Alternative folder (PATH: Luis): '/content/drive/My Drive/4º Carrera/NLP/Proyecto'
file_path_gl, file_path_pt = (
    f'/content/drive/MyDrive/4.Curso/NLP/Proyecto/{lang_code}_dataset.csv'  # PATH: Iker
    for lang_code in ('gl', 'pt')
)

### Funciones para tokenizar y usar los modelos BERT con los datasets

In [None]:
language_label = ClassLabel(names=['gl', 'pt'])

def convertir_a_datasetDict(df):
  """Turn a DataFrame with a ``split`` column into a DatasetDict.

  The rows whose ``split`` value is 'train', 'val' or 'test' are each
  converted to a ``Dataset`` and their 'language' column is cast to the
  module-level ``language_label`` feature.

  Args:
    df: pandas DataFrame containing at least the columns ``split`` and
      ``language``.

  Returns:
    A ``DatasetDict`` with the keys 'train', 'val' and 'test' (in that order).
  """
  splits = {}
  for split_name in ('train', 'val', 'test'):
    rows = df[df["split"] == split_name]
    splits[split_name] = Dataset.from_pandas(rows).cast_column('language', language_label)

  return DatasetDict(splits)


In [None]:
def tokenize(batch, max_length=512):
  """Tokenize the ``text`` column of a batch with the module-level ``tokenizer``.

  ``truncation=True`` only takes effect when a maximum length is known. A
  tokenizer that declares no model maximum length logs "Default to no
  truncation" and leaves long texts untouched, so the limit is passed
  explicitly here.

  Args:
    batch: mapping with a ``"text"`` entry holding the texts to encode.
    max_length: maximum number of tokens per sequence. Defaults to 512, the
      same limit ``tokenize_bertinho`` uses.

  Returns:
    Whatever ``tokenizer`` returns for the padded and truncated batch.
  """
  return tokenizer(batch["text"], padding=True, truncation=True, max_length=max_length)

In [None]:
# def extract_hidden_states(batch):
#     # Place model inputs on the GPU
#     inputs = {k:v.to("cuda") for k,v in batch.items()
#               if k in tokenizer.model_input_names}
#     # Extract last hidden states
#     with torch.no_grad():
#         outputs = model(**inputs)
#         last_hidden_state = outputs.hidden_states[-1]
#     # Return vector for [CLS] token
#     return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}

Tomamos un subconjunto de 2500 líneas por dataset (5000 en total)

In [None]:
# Load both datasets from the CSV paths held in file_path_pt and file_path_gl
df_pt = pd.read_csv(file_path_pt)
df_gl = pd.read_csv(file_path_gl)

# Select a subset of each dataset
def select_subset(df, train_size, test_size, val_size):
    """Keep the first rows of every split and stack them into one frame.

    Args:
        df: DataFrame with a ``split`` column whose values include
            'train', 'test' and 'val'.
        train_size: maximum number of 'train' rows to keep.
        test_size: maximum number of 'test' rows to keep.
        val_size: maximum number of 'val' rows to keep.

    Returns:
        DataFrame with the selected rows in the order train, test, val;
        the original index labels are preserved.
    """
    quotas = (('train', train_size), ('test', test_size), ('val', val_size))
    pieces = [df.loc[df['split'] == split_name].head(limit) for split_name, limit in quotas]
    return pd.concat(pieces)

# Select up to 1750 train, 375 test and 375 val rows for each language
df_pt_subset = select_subset(df_pt, train_size=1750, test_size=375, val_size=375)
df_gl_subset = select_subset(df_gl, train_size=1750, test_size=375, val_size=375)

# Check the results
print(f"Portugués: {len(df_pt_subset)} filas seleccionadas")
print(f"Gallego: {len(df_gl_subset)} filas seleccionadas")

Portugués: 2500 filas seleccionadas
Gallego: 2500 filas seleccionadas


In [None]:
# Stack the Portuguese and Galician subsets into one frame.
# No ignore_index is passed, so each row keeps the index label it had in its source frame.
df_subset_combined = pd.concat([df_pt_subset, df_gl_subset])
print(f"Total de filas en el conjunto combinado: {len(df_subset_combined)}")

Total de filas en el conjunto combinado: 5000


### Tokenización y uso de Bertinho solo con el texto en gallego

In [None]:
dataset_dict_gl = convertir_a_datasetDict(df_gl_subset)

Casting the dataset:   0%|          | 0/1750 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/375 [00:00<?, ? examples/s]

In [None]:
print(dataset_dict_gl)

DatasetDict({
    train: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 1750
    })
    val: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 375
    })
    test: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 375
    })
})


In [None]:
bertinho = "dvilares/bertinho-gl-base-cased"

In [None]:
# Load the Bertinho tokenizer (only the tokenizer is loaded in this cell)
tokenizer = AutoTokenizer.from_pretrained(bertinho)

tokenizer_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/220k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Bertinho needs its own tokenization function in which max_length is fixed to 512
def tokenize_bertinho(batch):
  """Encode ``batch["text"]`` with the module-level ``tokenizer``.

  Padding and truncation are enabled and the sequence length is capped at
  512 tokens.

  Args:
    batch: mapping with a ``"text"`` entry holding the texts to encode.

  Returns:
    The object returned by ``tokenizer`` for those texts.
  """
  texts = batch["text"]
  encoded = tokenizer(texts, padding=True, truncation=True, max_length=512)
  return encoded

In [None]:
# Make the DatasetDict return torch tensors when it is indexed
dataset_dict_gl.set_format(type="torch")

# Tokenize the whole dataset (all 3 splits) into token ids, 64 rows per tokenizer call.
df_encoded_gl = dataset_dict_gl.map(tokenize_bertinho, batched=True, batch_size=64)

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

In [None]:
print(df_encoded_gl)

DatasetDict({
    train: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1750
    })
    val: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 375
    })
    test: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 375
    })
})


Si queremos persistir el encoded dataset_dict lo guardamos

In [None]:
# df_encoded_gl.save_to_disk("df_encoded_gl")

Si ya lo tenemos guardado podemos cargar el encoded dataset_dict directamente

In [None]:
"""
df_encoded_gl = load_from_disk("df_encoded_gl")

print(df_encoded_gl)
"""

Uso de Bertinho con lo generado por el tokenizer

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# output_hidden_states=True makes the model output carry `hidden_states`, which extract_hidden_states reads
model = AutoModelForMaskedLM.from_pretrained(bertinho, output_hidden_states=True).to(device)

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at dvilares/bertinho-gl-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClas

In [None]:
def extract_hidden_states(batch):
    """Run the model on a batch and return the first-token vector of its last layer.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: mapping of column name to tensor. Only the entries whose key
            is listed in ``tokenizer.model_input_names`` are sent to the model.

    Returns:
        ``{"hidden_state": array}`` where ``array`` is the NumPy copy of the
        last hidden state at sequence position 0 (the [CLS] token).
    """
    accepted = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted:
            # Move each model input to the device the model lives on
            inputs[name] = tensor.to(device)

    # Inference only: no gradients are needed
    with torch.no_grad():
        cls_vectors = model(**inputs).hidden_states[-1][:, 0]

    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
df_encoded_gl.set_format("torch", columns=["input_ids", "attention_mask", "language"])

In [None]:
#hide_output
# Use the same batch size as the tokenization step (64). Each tokenized batch was padded to
# its own longest sequence, so a larger extraction batch can mix rows of different lengths;
# the torch formatter then yields a list instead of one stacked tensor and `.to(device)` fails.
df_hidden_gl = df_encoded_gl.map(extract_hidden_states, batched=True, batch_size=64)

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

In [None]:
print(df_hidden_gl["train"].column_names)

['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state']


In [None]:
del model
del tokenizer

### Tokenización y uso de BERTimbau solo con el texto en portugués

In [None]:
dataset_dict_pt = convertir_a_datasetDict(df_pt_subset)

Casting the dataset:   0%|          | 0/1750 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/375 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/375 [00:00<?, ? examples/s]

In [None]:
print(dataset_dict_pt)

DatasetDict({
    train: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 1750
    })
    val: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 375
    })
    test: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 375
    })
})


In [None]:
dataset_dict_pt['train'][0]

{'text': 'Deve a Ré ser condenada a pagar ao Autor a diferença entre os vencimentos pagos desde julho de 2011 e o vencimento que venha a ser determinado nos termos dos pedidos formulados em ou, aos quais deverão acrescer juros de mora desde a data de vencimento de cada uma das prestações até integral pagamento, a liquidar em execução de sentença ou após a entrega dos documentos requeridos a final;',
 'split': 'train',
 'language': 1,
 '__index_level_0__': 0}

In [None]:
bertimbau = "neuralmind/bert-base-portuguese-cased"

In [None]:
# Load the BERTimbau tokenizer (only the tokenizer is loaded in this cell)
tokenizer = AutoTokenizer.from_pretrained(bertimbau)

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Make the DatasetDict return torch tensors when it is indexed
dataset_dict_pt.set_format(type="torch")

# Tokenize the whole dataset (all 3 splits) into token ids.
df_encoded_pt = dataset_dict_pt.map(tokenize, batched=True, batch_size=128) # Tune batch_size so RAM does not run out

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

In [None]:
print(df_encoded_pt)

DatasetDict({
    train: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1750
    })
    val: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 375
    })
    test: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 375
    })
})


Si queremos persistir el encoded dataset_dict lo guardamos

In [None]:
# df_encoded_pt.save_to_disk("df_encoded_pt")

Si ya lo tenemos guardado podemos cargar el encoded dataset_dict directamente

In [None]:
"""
df_encoded_pt = load_from_disk("df_encoded_pt")

print(df_encoded_pt)
"""

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load BERTimbau with output_hidden_states=True so the output carries `hidden_states`
model = AutoModelForMaskedLM.from_pretrained(bertimbau, output_hidden_states=True).to(device)

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
df_encoded_pt.set_format("torch", columns=["input_ids", "attention_mask", "language"])

In [None]:
#hide_output
df_hidden_pt = df_encoded_pt.map(extract_hidden_states, batched=True, batch_size=64)

Map:   0%|          | 0/1750 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

Map:   0%|          | 0/375 [00:00<?, ? examples/s]

In [None]:
print(df_hidden_pt['train'].column_names)

['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state']


### Combinar datasets con diferentes BERTs

In [None]:
# For every split, append the Portuguese rows after the Galician rows
combined_df = DatasetDict({
    split_name: datasets.concatenate_datasets([df_hidden_gl[split_name], df_hidden_pt[split_name]])
    for split_name in ("train", "val", "test")
})

In [None]:
print(combined_df)

DatasetDict({
    train: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 3500
    })
    val: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 750
    })
    test: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 750
    })
})


### Clasificación

In [None]:
def create_sets(df):
  """Build NumPy feature and label arrays for the train/val/test splits.

  Features come from the ``hidden_state`` column and labels from the
  ``language`` column of each split. The three feature shapes are printed.

  Args:
    df: mapping with the keys "train", "val" and "test"; each value must
      expose the columns "hidden_state" and "language".

  Returns:
    Tuple ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``.
  """
  split_names = ("train", "val", "test")

  def as_array(split_name, column):
    return np.array(df[split_name][column])

  features = [as_array(name, "hidden_state") for name in split_names]
  labels = [as_array(name, "language") for name in split_names]

  print(*(matrix.shape for matrix in features))
  return (*features, *labels)

In [None]:
X_train, X_valid, X_test, y_train, y_valid, y_test = create_sets(combined_df)

(3500, 768) (750, 768) (750, 768)


In [None]:
def classify_text(X_train, y_train, X_valid, y_valid, X_test, y_test, feature_name="features"):
  """Fit three classifiers on the training set and print their accuracies.

  Logistic regression, a random forest (max depth 8) and a decision tree are
  trained in that order. All three use ``random_state=42`` so repeated runs
  give the same numbers (the decision tree previously had no seed).

  Args:
    X_train, y_train: training features and labels.
    X_valid, y_valid: validation features and labels (reported as "eval").
    X_test, y_test: test features and labels.
    feature_name: text inserted in the printed messages to identify the
      features being evaluated.

  Returns:
    None. The accuracies, rounded to 4 decimals, are printed.
  """
  classifiers = (
      ("LogisticRegression", LogisticRegression(random_state=42)),
      ("RandomForestClassifier", RandomForestClassifier(max_depth=8, random_state=42)),
      ("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42)),
  )
  evaluation_sets = (("eval", X_valid, y_valid), ("test", X_test, y_test))

  for clf_name, clf in classifiers:
    fitted = clf.fit(X_train, y_train)
    for set_name, X, y in evaluation_sets:
      print(f'The accuracy of the {clf_name} for {set_name} set using {feature_name} is {round(fitted.score(X, y), 4)}')

In [None]:
# Usar la función con bertinho y bertimbau
classify_text(X_train, y_train, X_valid, y_valid, X_test, y_test, feature_name="bertinho for galician and bertimbau for portuguese")

The accuracy of the LogisticRegression for eval set using bertinho for galician and bertimbau for portuguese is 1.0
The accuracy of the LogisticRegression for test set using bertinho for galician and bertimbau for portuguese is 1.0
The accuracy of the RandomForestClassifier for eval set using bertinho for galician and bertimbau for portuguese is 1.0
The accuracy of the RandomForestClassifier for test set using bertinho for galician and bertimbau for portuguese is 1.0
The accuracy of the DecisionTreeClassifier for eval set using bertinho for galician and bertimbau for portuguese is 0.9987
The accuracy of the DecisionTreeClassifier for test set using bertinho for galician and bertimbau for portuguese is 0.9987


## Preparación, tokenización y uso de DistilBERT con dataset combinado.

### Preparación del dataset combinado en un DatasetDict

In [None]:
subset_dict = convertir_a_datasetDict(df_subset_combined)

Casting to class labels:   0%|          | 0/3500 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/750 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/750 [00:00<?, ? examples/s]

In [None]:
print(subset_dict)

DatasetDict({
    train: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 3500
    })
    val: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 750
    })
    test: Dataset({
        features: ['text', 'split', 'language', '__index_level_0__'],
        num_rows: 750
    })
})


### Tokenización del dataset

In [None]:
# Modelo que vamos a utilizar
model_name = "distilbert/distilbert-base-multilingual-cased"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Make the DatasetDict return torch tensors when it is indexed
subset_dict.set_format(type="torch")

# Tokenize the whole dataset (all 3 splits) into token ids.
# batch_size=None hands each split to the tokenizer as a single batch.
df_encoded = subset_dict.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [None]:
print(df_encoded['train'].column_names)

['text', 'split', 'language', '__index_level_0__', 'input_ids', 'attention_mask']


### Extracción del último hidden_state usando DistilBERT

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the multilingual DistilBERT with output_hidden_states=True so the output carries `hidden_states`
model = AutoModelForMaskedLM.from_pretrained(model_name, output_hidden_states=True).to(device)

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

In [None]:
def extract_hidden_states(batch):
    """Return the last-layer vector at sequence position 0 for every row of a batch.

    Uses the module-level ``tokenizer``, ``model`` and ``device`` that are
    current when the function is called.

    Args:
        batch: mapping of column name to tensor; entries whose key is not in
            ``tokenizer.model_input_names`` are ignored.

    Returns:
        Dict with the single key ``"hidden_state"`` holding a NumPy array
        taken from ``hidden_states[-1][:, 0]`` (the [CLS] token vector).
    """
    # Keep only what the model accepts and place it on the model's device
    model_inputs = dict(
        (key, value.to(device))
        for key, value in batch.items()
        if key in tokenizer.model_input_names
    )
    # Forward pass without gradient tracking
    with torch.no_grad():
        final_layer = model(**model_inputs).hidden_states[-1]
    first_token = final_layer[:, 0]
    return {"hidden_state": first_token.cpu().numpy()}

In [None]:
df_encoded.set_format("torch", columns=["input_ids", "attention_mask", "language"])

{'language': tensor(1),
 'input_ids': tensor([   101,    152,  18094,  45430,    169,    130,    120,  10186,  10104,
          10437,  11639,  77401,  28334,    117,  87838,  11153,  11573,  50699,
          10398,  73298,  10107,    117,  10431,  11573,  50699,  10149,  11938,
            119,  29084,    117,    182,    124,  10149,  45513,    117,    169,
          14641,  10104,  78842,  10161,  10149,  13987, 100975,  16994,  10104,
          36024,    117,  26043,  18678,  10104, 106227,  10310,  72071,  10291,
            117,  23020,  11181,  11392,  10244,    120,  11035,    120,  10302,
          13168,  37940,  85946,    132,    102,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,

In [None]:
#hide_output
# batch_size=1: extract_hidden_states receives one example per call
df_hidden = df_encoded.map(extract_hidden_states, batched=True, batch_size=1)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [None]:
df_hidden["train"].column_names

['text',
 'split',
 'language',
 '__index_level_0__',
 'input_ids',
 'attention_mask',
 'hidden_state']

Si queremos persistir el dataset_dict con los hidden_state lo guardamos

In [None]:
# df_hidden.save_to_disk("df_hidden")

Saving the dataset (0/1 shards):   0%|          | 0/3500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/750 [00:00<?, ? examples/s]

Si ya lo tenemos guardado podemos cargar el hidden_state dataset_dict directamente

In [None]:
"""
df_hidden = load_from_disk("df_hidden")

print(df_hidden)
"""

## Clasificación

Creación sets ya codificados

In [None]:
# Reuse create_sets instead of repeating its body: it builds the six arrays from the
# "hidden_state" and "language" columns and prints the three feature shapes.
X_train, X_valid, X_test, y_train, y_valid, y_test = create_sets(df_hidden)

((3500, 768), (750, 768), (750, 768))

Usamos diferentes clasificadores

In [None]:
def classify_text(X_train, y_train, X_valid, y_valid, X_test, y_test, feature_name="features"):
  """Train three classifiers and print validation and test accuracy for each.

  The classifiers are logistic regression, a random forest limited to depth
  8 and a decision tree. Every one is seeded with ``random_state=42``; the
  decision tree used to be unseeded, which made its scores vary between runs.

  Args:
    X_train, y_train: training features and labels.
    X_valid, y_valid: validation features and labels (printed as "eval").
    X_test, y_test: test features and labels.
    feature_name: label for the feature source, inserted in each message.

  Returns:
    None. Accuracies are printed rounded to 4 decimals.
  """
  evaluation_sets = (("eval", X_valid, y_valid), ("test", X_test, y_test))

  def report(clf_name, fitted):
    # One line per evaluation set, same wording for every classifier
    for set_name, X, y in evaluation_sets:
      print(f'The accuracy of the {clf_name} for {set_name} set using {feature_name} is {round(fitted.score(X, y), 4)}')

  # Logistic Regression
  report("LogisticRegression", LogisticRegression(random_state=42).fit(X_train, y_train))

  # Random Forest
  report("RandomForestClassifier", RandomForestClassifier(max_depth=8, random_state=42).fit(X_train, y_train))

  # Decision Tree (seeded like the other two for reproducibility)
  report("DecisionTreeClassifier", DecisionTreeClassifier(random_state=42).fit(X_train, y_train))

In [None]:
# Usar la función con distilBERT
classify_text(X_train, y_train, X_valid, y_valid, X_test, y_test, feature_name="distilBERT")

The accuracy of the LogisticRegression for eval set using distilBERT is 0.9947
The accuracy of the LogisticRegression for test set using distilBERT is 0.992
The accuracy of the RandomForestClassifier for eval set using distilBERT is 0.9933
The accuracy of the RandomForestClassifier for test set using distilBERT is 0.9867
The accuracy of the DecisionTreeClassifier for eval set using distilBERT is 0.964
The accuracy of the DecisionTreeClassifier for test set using distilBERT is 0.9627
