In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

ModuleNotFoundError: No module named 'pandas'

In [None]:
## Load the incident reports from CSV
FILE_PATH = "../airline_incidents.csv"
raw_df = pd.read_csv(FILE_PATH)

# Normalise to the column names the rest of the notebook relies on.
df = raw_df.rename(columns={"report": "text", "part failure": "label"})

# DataFrame.rename silently ignores source columns that do not exist, so
# check explicitly that both required columns are present afterwards.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{FILE_PATH} is missing expected column(s): {sorted(missing_columns)}"
    )

# Rows without a report text or a label cannot be tokenized or encoded
# downstream; drop them here and say so instead of failing later.
rows_before = len(df)
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
if len(df) != rows_before:
    print(f"Dropped {rows_before - len(df)} row(s) with missing text or label")

In [None]:
## Extract the distinct label names in sorted order
unique_labels = list(df["label"].unique())
unique_labels.sort()
print(f"Unique labels found: {unique_labels}")

## Class-label feature whose integer ids follow the sorted name order
class_label = ClassLabel(names=unique_labels)

In [None]:
# Step 3: Convert Labels to Integer Format
# Only encode while the column still holds label names: once it is
# integer-typed the conversion has already happened, so re-running this
# cell becomes a no-op instead of feeding integers back into str2int.
if not pd.api.types.is_integer_dtype(df["label"]):
    # str2int accepts an iterable of names, so the whole column is
    # converted in one call rather than through a per-row lambda.
    df["label"] = class_label.str2int(df["label"].tolist())

In [None]:
# Step 4: Split Dataset into Train/Validation
SPLIT_SEED = 42        # fixed so the split is reproducible across runs
TRAIN_FRACTION = 0.8   # share of rows used for training

# Shuffle before slicing: a plain head/tail split inherits whatever order
# the CSV happens to have (e.g. sorted by label or date), which can leave
# whole classes out of one of the splits.
# reset_index gives the shuffled frame a clean 0..n-1 index so the slices
# below are defined by the shuffled order, not by the old row ids.
shuffled_df = df.sample(frac=1.0, random_state=SPLIT_SEED).reset_index(drop=True)

train_size = int(TRAIN_FRACTION * len(shuffled_df))
train_df = shuffled_df.iloc[:train_size]
val_df = shuffled_df.iloc[train_size:]

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, then group the two
# under the "train" / "validation" keys of a DatasetDict.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

splits = {"train": train_dataset, "validation": val_dataset}
dataset_dict = DatasetDict(splits)

In [None]:
# Step 5: Tokenize Text Data Using the DistilBERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize_function(example):
    """Tokenize a batch of reports, truncating each to at most 512 tokens.

    Args:
        example: A batch from ``Dataset.map(batched=True)``; its ``"text"``
            entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch (unpadded).

    No padding is applied here. Padding every row to 512 tokens up front
    makes each batch pay for the maximum length even when the reports are
    short. The Trainer below is given the tokenizer, so it pads each batch
    to the longest sequence in that batch at collation time instead.
    """
    return tokenizer(example["text"], truncation=True, max_length=512)

dataset_dict = dataset_dict.map(tokenize_function, batched=True)

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Step 6: Define Model with Correct Number of Labels
# Store the id <-> name mapping in the model config so the saved checkpoint
# reports real label names instead of generic LABEL_0, LABEL_1, ...
# Ids follow the position of each name in `unique_labels`, the same order
# used to build `class_label` and encode the label column.
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
# Training configuration.
# Mixed precision is tied to CUDA availability, so both fp16 switches are
# driven by a single flag.
# NOTE(review): newer transformers releases rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
use_fp16 = torch.cuda.is_available()

training_config = dict(
    # where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # optimisation: 32 samples per device, accumulated over 2 steps
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    # half precision for both training and evaluation when on GPU
    fp16=use_fp16,
    fp16_full_eval=use_fp16,
)

training_args = TrainingArguments(**training_config)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: The prediction object the Trainer passes in; its
            ``predictions`` hold the model logits and ``label_ids`` the
            integer targets.

    Returns:
        A dict with a single ``"accuracy"`` entry (fraction of examples
        whose highest-scoring class equals the target).
    """
    logits = eval_pred.predictions
    # Some models return extra outputs alongside the logits; in that case
    # the logits are assumed to come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted_ids = logits.argmax(axis=-1)
    return {"accuracy": float((predicted_ids == eval_pred.label_ids).mean())}


# Without compute_metrics the evaluation reports only the loss, which says
# little about how often the classifier is actually right.
# NOTE(review): newer transformers releases prefer `processing_class` over
# `tokenizer` here — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model; the returned training summary is kept and shown as the
# cell output.
train_output = trainer.train()
train_output

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so both can
# be reloaded from one directory.
SAVE_DIR = "./trained_model"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

In [None]:
# Evaluate on the validation split; the metrics dict is kept and shown as
# the cell output.
eval_metrics = trainer.evaluate()
eval_metrics