In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime
from google.colab import drive
drive.mount('/content/drive')
# Work from the TRAINING folder: the relative paths used elsewhere in this notebook
# (e.g. "../00_data/...") are resolved against this directory
%cd /content/drive/MyDrive/TRAINING

In [None]:
# Install required libraries for dataset handling and transformers
%%capture
!pip install datasets
!pip install transformers

In [None]:
# Import necessary libraries and modules
import srsly
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import Trainer
from transformers import TrainingArguments
import numpy as np
from datasets import load_metric
import pandas as pd
import datasets

In [None]:
# Read every record of the JSONL annotation export into memory
# (the path is relative to the current working directory)
annotations_raw = list(srsly.read_jsonl(r"../00_data/Classifier/annots_n400.jsonl"))
# Show the first record to check its structure
annotations_raw[0]

In [None]:
# Process annotations to mark relevant paragraphs
def _segment_annotation(annotation):
    """
    Split one annotated document into consecutive labelled text segments.

    The document text is cut at every span boundary found in
    ``annotation["label"]`` (entries of the form ``[start, end, name]``).
    A segment whose boundaries coincide exactly with a ``"Relevant"`` span
    gets label 1; every other segment (the text between spans, before the
    first span and after the last one) gets label 0.

    Args:
        annotation (dict): A record with a ``"text"`` string and a ``"label"``
            list of ``[start, end, name]`` spans.

    Returns:
        list[dict]: One ``{"text": ..., "label": ...}`` record per segment,
        in document order. Segments without any alphanumeric character are
        dropped; newlines and tabs inside a segment are replaced by spaces.
    """
    text = annotation["text"]
    spans = annotation["label"]

    # Sort the spans by offset so the cut points are monotonically increasing
    # regardless of the order in which the spans were annotated/exported.
    # Without this, out-of-order spans yield empty or duplicated segments.
    boundaries = [0]
    for span in sorted(spans, key=lambda s: (s[0], s[1])):
        boundaries.append(span[0])
        boundaries.append(span[1])

    records = []
    # Pair each boundary with the next one; the trailing None makes the last
    # slice run to the end of the text.
    for start_idx, end_idx in zip(boundaries, boundaries[1:] + [None]):
        segment = text[start_idx:end_idx]
        # Skip empty, whitespace-only or punctuation-only segments
        if not any(c.isalnum() for c in segment):
            continue
        records.append({
            "text": segment.replace("\n", " ").replace("\t", " "),
            "label": int([start_idx, end_idx, "Relevant"] in spans),
        })
    return records


processed_annotations = [
    record
    for annotation in annotations_raw
    for record in _segment_annotation(annotation)
]
# Convert processed annotations to a DataFrame
annotations_df = pd.DataFrame(processed_annotations)

In [None]:
# Display how many segments carry each label (1 = relevant, 0 = not relevant)
# to get a feeling for the class balance before training
annotations_df["label"].value_counts()

In [None]:
# Display the processed annotations DataFrame (columns: "text" and "label")
annotations_df

In [None]:
# Model and tokenizer are loaded from the same pre-trained checkpoint identifier
PRETRAINED_CHECKPOINT = "agne/jobGBERT"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
# First cut: hold out 25 % of the rows (scikit-learn's default share, spelled out here)
df_train, df_test_val = train_test_split(
    annotations_df,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)
# Second cut: divide the held-out rows evenly into a test and a validation set
df_test, df_val = train_test_split(
    df_test_val,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Row counts of the train, test and validation splits (in that order)
print(*(len(split) for split in (df_train, df_test, df_val)))

In [None]:
# Define functions for tokenizing the dataset
def tokenize_dataset(dataframe):
    """
    Apply ``tokenize_row`` to every row of a dataset.

    Args:
        dataframe: An object exposing ``map(function, remove_columns=...)``;
            in this notebook it is called with a ``datasets.Dataset`` built
            from a pandas DataFrame (despite the parameter name).

    Returns:
        The result of ``dataframe.map``: the mapped dataset without the
        raw ``"text"`` column.
    """
    columns_to_drop = ["text"]
    tokenized_dataset = dataframe.map(tokenize_row, remove_columns=columns_to_drop)
    return tokenized_dataset

def tokenize_row(row):
    """
    Encode the text of one example with the notebook-level ``tokenizer``.

    Args:
        row (dict): A mapping with a ``'text'`` entry (the string to encode)
            and a ``'label'`` entry (the class of the example).

    Returns:
        The tokenizer's output for ``row['text']`` with the example's label
        stored under the ``'label'`` key.
    """
    # Inputs are truncated to at most 512 tokens
    tokenizer_options = dict(padding=True, truncation=True, max_length=512)
    encoding = tokenizer(row['text'], **tokenizer_options)
    encoding['label'] = row['label']
    return encoding

In [None]:
# Convert each pandas split to a datasets.Dataset and tokenize it;
# the order train -> test -> validation is kept
dataset_train, dataset_test, dataset_val = (
    tokenize_dataset(datasets.Dataset.from_pandas(split_frame))
    for split_frame in (df_train, df_test, df_val)
)

In [None]:
# Hyper-parameters and checkpointing policy for fine-tuning
training_config = {
    # Checkpoints are written to the local runtime disk, not to the Drive folder
    "output_dir": "/content/model_checkpoints",
    # Evaluate and save once per epoch
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "learning_rate": 3e-5,
    "num_train_epochs": 10,
    # Reload the best checkpoint when training finishes and keep only one on disk
    "load_best_model_at_end": True,
    "save_total_limit": 1,
}
training_args = TrainingArguments(**training_config)

In [None]:
# Define a function to compute evaluation metrics
def compute_metrics(eval_predictions):
    """
    Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_predictions (tuple): A ``(logits, labels)`` pair. The predicted
            class of each example is the arg-max over the last axis of
            ``logits``.

    Returns:
        dict: ``{"f1": <weighted F1 as float>}``.
    """
    # Local import keeps this function self-contained. Computing the score
    # directly with scikit-learn avoids re-loading a metric object on every
    # evaluation and does not rely on the deprecated `datasets.load_metric`.
    from sklearn.metrics import f1_score

    logits, labels = eval_predictions
    predictions = np.argmax(logits, axis=-1)
    # scikit-learn expects the ground truth first, then the predictions
    return {"f1": float(f1_score(labels, predictions, average="weighted"))}

In [None]:
# Wire model, hyper-parameters, data and metric together.
# The validation split is used for the per-epoch evaluation; the test split is not passed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model; evaluation and checkpointing follow the settings in training_args
trainer.train()

In [None]:
# Import additional metrics for evaluation
from sklearn.metrics import classification_report, f1_score

In [None]:
# Evaluate the model on the held-out test dataset
predictions = trainer.predict(dataset_test)
# Predicted class = index of the largest logit of each example
predicted_labels = predictions.predictions.argmax(axis=1)
# scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
# swapped, the report would exchange precision and recall and show the
# predicted counts as "support".
print(f1_score(predictions.label_ids, predicted_labels))
print(classification_report(predictions.label_ids, predicted_labels))

In [None]:
# Import seaborn for visualization
import seaborn as sns

In [None]:
# One row per test example: the predicted class next to the true class
evaluation_df = pd.DataFrame({
    "predictions": predictions.predictions.argmax(axis=1),
    "labels": predictions.label_ids,
})

In [None]:
# Confusion matrix as a heatmap: rows are predicted classes, columns are true classes
confusion_counts = pd.crosstab(evaluation_df["predictions"], evaluation_df["labels"])
sns.heatmap(confusion_counts, annot=True, fmt="")

In [None]:
# Save the trained model held by the Trainer; the target path is relative to the
# current working directory
trainer.save_model("../00_data/Classifier/model_classification_jobgbert")