In [None]:
from modeling_distillemb import BertModel, BertForSequenceClassification, BertForEmbeddingLM
from distill_emb import DistillEmbSmall, DistillEmb
from config import DistillModelConfig, DistillEmbConfig
import torch
from transformers import AutoTokenizer, RwkvConfig, RwkvModel, AutoModel
from tokenizer import CharTokenizer
from knn_classifier import KNNTextClassifier
from data_loader import load_sentiment, load_ner_dataset, load_pos_dataset
from data_loader import load_news_dataset
import pandas as pd
from retrieval import build_json_pairs, top1_accuracy
import os
from transformers import GPT2LMHeadModel

In [None]:
# Load the sentiment dataset for a quick look.
# NOTE(review): `df` and `classes` are rebound by the POS-loading cell that
# follows, and `df` again when the MasakhaNEWS parquet file is read, so this
# result only feeds the display cell directly below.
df, classes = load_sentiment()

In [None]:
# Display the sentiment data loaded in the previous cell.
df

In [None]:
# Load the POS dataset; this rebinds `df` and `classes` from the sentiment cell.
# NOTE(review): `classes` is not read by any later cell shown in this notebook.
df, classes = load_pos_dataset()

In [None]:
# Display the POS data loaded in the previous cell.
df

In [None]:
# Number of characters per token for the character-level inputs; forwarded to
# DistillModelConfig(num_input_chars=...) when the model config is built.
num_input_chars = 12

In [None]:
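# One-time setup (left commented out): clone the pretrained checkpoint so that the
# "distil-emb-base" directory referenced by the following cells exists locally.
# !git clone https://huggingface.co/leobitz/distil-emb-base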

In [None]:
# Load the character tokenizer, the DistillEmb config and the DistillEmb model,
# all from "distil-emb-base".
# NOTE(review): `distill_model` is not referenced by any later cell shown here;
# only `tokenizer` and `distill_config` are used when the model config is built.
tokenizer = CharTokenizer.from_pretrained(pretrained_directory="distil-emb-base")
distill_config = DistillEmbConfig.from_pretrained(pretrained_model_name_or_path="distil-emb-base")
distill_model = DistillEmb.from_pretrained(pretrained_model_name_or_path="distil-emb-base")

In [None]:
# Display the DistillEmb configuration that was just loaded.
distill_config

In [None]:
# Model configuration; it is passed to BertForSequenceClassification further down.
config = DistillModelConfig(
    vocab_size=30522,
    hidden_size=768,
    num_hidden_layers=3,
    num_attention_heads=8,
    intermediate_size=3072,
    max_position_embeddings=512,
    type_vocab_size=2,
    pad_token_id=0,
    position_embedding_type="absolute",
    use_cache=True,
    classifier_dropout=None,
    # NOTE(review): the earlier inline note listed 'distilemb' and 'fasttext' as
    # embedding types, yet the value passed is "distill" — confirm the accepted names.
    embedding_type="distill",
    encoder_type='lstm',
    num_input_chars=num_input_chars,  # number of characters in each token
    char_vocab_size=tokenizer.char_vocab_size,  # read from the loaded CharTokenizer
    distill_config=distill_config,  # DistillEmbConfig loaded from "distil-emb-base"
    distill_pretrained_model_name="distil-emb-base",
)


In [None]:
# Load the MasakhaNEWS data used for the classification run below.
# This rebinds `df` (previously the POS dataset); later cells read its
# 'headline', 'label' and 'split' columns.
df = pd.read_parquet("downstream-data/masakhanews.parquet")

In [None]:
# Display the news data read from the parquet file.
df

In [None]:
# Size the classification head to the number of distinct values in the 'label'
# column (dropna=False keeps the count equal to len(unique())), then build the
# classifier from `config`.
num_labels = df["label"].nunique(dropna=False)
config.num_labels = num_labels
model = BertForSequenceClassification(config)

In [None]:
# Smoke test: one forward pass on random character ids of shape (B, S, N) =
# (batch, tokens per sequence, characters per token).
# NOTE: the upper bound for the random ids is the character vocabulary size, not
# `config.num_input_chars` — the latter is the per-token width N and would restrict
# the test to the first N character ids only.
char_input = torch.randint(0, tokenizer.char_vocab_size, (1, 10, config.num_input_chars))
print("char_input shape:", char_input.shape)
inputs = {
    "input_ids": char_input,
    # One mask value / segment id per token, i.e. shape (B, S). Both are derived from
    # the input's own leading dimensions so they stay valid if B or S is changed.
    "attention_mask": torch.ones(char_input.shape[:2], dtype=torch.long),
    "token_type_ids": torch.zeros(char_input.shape[:2], dtype=torch.long),
}
outputs = model(**inputs)

In [None]:
# Sanity check: run the character tokenizer on a short sentence and show its output.
tokenizer("Hello world!")

In [None]:
# Shape of the first element of the smoke-test output
# (presumably the classification logits — TODO confirm).
outputs[0].shape

In [None]:
# Write the classifier to the local "distil-emb-seqcls-lstm" directory.
model.save_pretrained("distil-emb-seqcls-lstm")

In [None]:
# Reload the classifier from the directory written by save_pretrained, rebinding
# `model`; this reloaded instance is the one handed to Trainer later.
model = BertForSequenceClassification.from_pretrained("distil-emb-seqcls-lstm")

In [None]:
from datasets import Dataset, DatasetDict

# The classifier is trained on headlines; expose them under the generic "text"
# column that preprocess_function reads.
df['text'] = df['headline']

# Split on the dataset's own 'split' column, keeping only the input text and label.
train_df = df[df['split'] == 'train'][['text', 'label']]
test_df = df[df['split'] == 'test'][['text', 'label']]

# Create HuggingFace datasets. The filtered frames keep their original,
# non-contiguous row index, which from_pandas would otherwise store as an extra
# "__index_level_0__" column; preserve_index=False leaves only 'text' and 'label'.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Display the training split built in the previous cell.
train_dataset

In [None]:
from typing import Dict, Any

def preprocess_function(examples: Dict[str, Any]):
    """Tokenize a batch of examples and attach their labels.

    Args:
        examples: Batch mapping with a "text" entry, which is passed to the
            module-level ``tokenizer``, and a "label" entry.

    Returns:
        The tokenizer's output with an added "labels" entry copied from
        ``examples["label"]``.
    """
    # Padding is deliberately off here (padding=False).
    # NOTE(review): max_length=512 is passed without an explicit truncation flag —
    # confirm that CharTokenizer truncates on max_length alone.
    tokenizer_kwargs = {
        "padding": False,
        "max_length": 512,
        "return_attention_mask": True,
    }
    encoded = tokenizer(examples["text"], **tokenizer_kwargs)
    encoded["labels"] = examples["label"]
    return encoded



# Tokenize both splits with the same settings. Every original column of a split is
# removed, so the result holds only what preprocess_function returns.
tokenized_train, tokenized_test = (
    split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, test_dataset)
)

In [None]:
# Number of whitespace-separated words in the first training headline.
len(train_dataset[0]['text'].split())

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
class CustomDataCollator:
    """Callable collator that delegates batch padding to a tokenizer's ``pad``.

    Args:
        tokenizer: Object exposing ``pad(features, **options)``.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, features):
        """Pad ``features`` to the longest item and return the tokenizer's result.

        Args:
            features: The per-example items to collate; forwarded unchanged as the
                first positional argument of ``tokenizer.pad``.

        Returns:
            Whatever ``tokenizer.pad`` returns for the options below.
        """
        pad_options = dict(
            padding="longest",
            max_length=512,
            return_tensors="pt",
            return_attention_mask=True,
        )
        return self.tokenizer.pad(features, **pad_options)

# Collator instance handed to Trainer; it pads with the same character tokenizer
# that preprocess_function uses.
data_collator = CustomDataCollator(tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Fine-tuning settings: outputs are written under ./results and report_to=[]
# requests no logging integrations.
training_args = TrainingArguments(
    output_dir="./results",
    report_to=[],
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# NOTE(review): an eval_dataset is supplied, but no evaluation schedule or metric
# function is configured in this cell — confirm whether evaluation happens elsewhere.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()