In [1]:
import pandas as pd
import torch
import os
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

ModuleNotFoundError: No module named 'torch'

In [None]:
# Pick the compute device: the first visible CUDA GPU when one is available,
# otherwise the CPU. A single explicit device avoids DataParallel issues.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using device: {}".format(device))

In [None]:
# 1. Load and preprocess dataset
def load_dataset(file_path):
    """Read a sentiment CSV and return its validated 'text' and 'label' columns.

    Rows whose label is NaN are dropped (a warning is printed when that happens).

    Args:
        file_path: Path to a CSV file readable by ``pd.read_csv`` with defaults.

    Returns:
        A new, independent DataFrame holding only the 'text' and 'label' columns.

    Raises:
        FileNotFoundError: ``file_path`` does not exist.
        ValueError: the file cannot be parsed, a required column is missing,
            no labelled rows remain, or fewer than two distinct labels are present.
    """
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file {file_path} does not exist. Please check the path and file name.")
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Chain the original exception so the parser's traceback is not lost.
        raise ValueError(
            f"Failed to read {file_path}. Check delimiter or encoding (e.g., try delimiter=';' or encoding='utf-8'). Error: {str(e)}"
        ) from e

    if 'text' not in df.columns or 'label' not in df.columns:
        # str() guards against non-string column names.
        possible_label_cols = [
            col for col in df.columns
            if 'label' in str(col).lower() or 'sentiment' in str(col).lower()
        ]
        message = (
            f"The dataset at {file_path} must contain 'text' and 'label' columns. "
            f"Found: {list(df.columns)}. Possible label columns: {possible_label_cols}."
        )
        # Only suggest a rename when there is a candidate column; indexing an
        # empty candidate list would raise IndexError and hide this error.
        if possible_label_cols:
            message += (
                f" Consider renaming the label column (e.g., df.rename(columns={{'{possible_label_cols[0]}': 'label'}}))."
            )
        raise ValueError(message)

    # Check for NaN labels
    nan_count = df['label'].isna().sum()
    if nan_count > 0:
        print(f"Warning: Found {nan_count} rows with NaN labels in {file_path}.")
        if nan_count == len(df):
            raise ValueError(
                f"All labels in {file_path} are NaN. Expected labels: 0, 1, 2 for positive, negative, neutral. "
                f"Check the CSV content or regenerate the dataset."
            )
        print("Removing rows with NaN labels.")
        df = df.dropna(subset=['label'])

    # Check if dataset is empty (also true for a header-only CSV)
    if df.empty:
        raise ValueError(f"After removing NaN labels, no data remains in {file_path}. Please provide a dataset with valid labels.")

    # A classifier needs at least two classes to learn anything
    unique_labels = df['label'].unique()
    if len(unique_labels) < 2:
        raise ValueError(
            f"Dataset at {file_path} has fewer than 2 unique labels: {unique_labels}. "
            f"Expected at least 2 of [0, 1, 2] for positive, negative, neutral."
        )

    # Explicit copy: callers assign to columns of the result, which would
    # otherwise be a slice of the frame read from disk.
    return df[['text', 'label']].copy()

# Dataset locations: both CSV files are expected directly in the user's home directory
home_dir = os.path.expanduser("~")
amharic_data_path = os.path.join(home_dir, "amharic_data.csv")
tigrinya_data_path = os.path.join(home_dir, "tigrinya_data.csv")

# Load both corpora. On failure, print the error and a matching hint, then
# re-raise so the notebook stops here instead of continuing without data.
try:
    amharic_df = load_dataset(amharic_data_path)
    tigrinya_df = load_dataset(tigrinya_data_path)
except (FileNotFoundError, ValueError) as e:
    if isinstance(e, FileNotFoundError):
        hint = "Please ensure the dataset files are in the home directory (~) and run again."
    else:
        hint = "Please check the dataset for valid 'text' and 'label' columns and non-NaN labels."
    print(e)
    print(hint)
    raise

In [None]:
# 2. Inspect dataset statistics
# Report the row count and the per-label counts for each language in turn.
for language_name, language_df in (("Amharic", amharic_df), ("Tigrinya", tigrinya_df)):
    print(f"{language_name} dataset size:", len(language_df))
    print(f"{language_name} label distribution:\n", language_df['label'].value_counts())

In [None]:
# 3. Encode string labels to numerical values
def encode_labels(df, label_column='label'):
    """Map raw sentiment labels to contiguous class ids usable by a 3-class classifier.

    Class ids are 0 = neutral, 1 = positive, 2 = negative. They must be
    contiguous and start at 0 because a sequence-classification head with
    ``num_labels=3`` trains with cross-entropy, whose targets have to lie in
    ``0 .. num_labels - 1``; a target of -1 is rejected at training time.

    Accepted raw values: 'positive' / 'negative' / 'neutral', the strings
    '1' / '-1' / '0', and the integers 1 / -1 / 0. The integer 2 is accepted
    as well, only because it is this function's own output for 'negative':
    that makes a second call on already-encoded data a no-op, so re-running
    the notebook cell does not corrupt the labels.

    Args:
        df: DataFrame containing ``label_column``. It is not modified.
        label_column: Name of the column holding the raw labels.

    Returns:
        Tuple ``(encoded_df, label2id, id2label, num_labels)`` where
        ``encoded_df`` is a copy of ``df`` whose 'label' column holds int64
        class ids, ``label2id`` maps class name -> id, ``id2label`` maps
        id -> class name, and ``num_labels`` is 3.

    Raises:
        ValueError: if any value in ``label_column`` is not an accepted raw label.
    """
    # Raw value -> class id. neutral/positive keep the ids 0/1 they already
    # had as raw integers; only negative moves from -1 to 2.
    raw_to_id = {
        'neutral': 0, 'positive': 1, 'negative': 2,
        '0': 0, '1': 1, '-1': 2,
        0: 0, 1: 1, -1: 2,
        2: 2,  # already-encoded negative (see docstring)
    }
    id2label = {0: 'neutral', 1: 'positive', 2: 'negative'}
    label2id = {name: class_id for class_id, name in id2label.items()}
    num_labels = len(id2label)

    # Values missing from the mapping become NaN
    encoded = df[label_column].map(raw_to_id)

    # Report the offending raw values, read from the untouched input column
    unmapped_mask = encoded.isna()
    if unmapped_mask.any():
        unmapped = df.loc[unmapped_mask, label_column].unique()
        raise ValueError(
            f"Found unmapped labels: {unmapped}. Expected: [-1, 0, 1] or ['positive', 'negative', 'neutral']."
        )

    # Every value is now one of 0/1/2; store them as plain integers in a new frame
    encoded_df = df.assign(label=encoded.astype('int64'))

    return encoded_df, label2id, id2label, num_labels

# Encode the labels of both corpora; on failure show the error with a hint and re-raise
try:
    (amharic_df,
     amharic_label2id,
     amharic_id2label,
     amharic_num_labels) = encode_labels(amharic_df)
    (tigrinya_df,
     tigrinya_label2id,
     tigrinya_id2label,
     tigrinya_num_labels) = encode_labels(tigrinya_df)
except ValueError as e:
    print(
        e,
        "Please ensure all labels are valid (-1, 0, 1 or 'positive', 'negative', 'neutral').",
        sep="\n",
    )
    raise

# Show the label mappings first, then the class counts
for caption, value in (
    ("Amharic label mapping:", amharic_label2id),
    ("Tigrinya label mapping:", tigrinya_label2id),
    ("Amharic num_labels:", amharic_num_labels),
    ("Tigrinya num_labels:", tigrinya_num_labels),
):
    print(caption, value)

# Wrap each encoded frame in a Hugging Face Dataset for tokenization
amharic_dataset, tigrinya_dataset = (
    Dataset.from_pandas(amharic_df),
    Dataset.from_pandas(tigrinya_df),
)

In [None]:
# 4. Initialize tokenizer
# Loads the tokenizer for the "xlm-roberta-base" checkpoint, the same checkpoint
# name the model cell passes to from_pretrained. The first call may need network
# access to fetch the files.
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the notebook-level tokenizer.

    Every sequence is padded to, and truncated at, 128 tokens.

    Args:
        examples: Mapping with a "text" key, as passed by ``Dataset.map``.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **encode_options)

# Tokenize datasets (batched so the tokenizer receives lists of texts)
amharic_tokenized = amharic_dataset.map(tokenize_function, batched=True)
tigrinya_tokenized = tigrinya_dataset.map(tokenize_function, batched=True)

# Set format for PyTorch: expose only the columns used for training
amharic_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tigrinya_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Split Amharic dataset for training and validation (fixed seed for a repeatable split)
amharic_split = amharic_tokenized.train_test_split(test_size=0.2, seed=42)
amharic_train = amharic_split["train"]
amharic_val = amharic_split["test"]

# Validate labels in training and validation sets
train_labels = np.unique(amharic_train['label'])
val_labels = np.unique(amharic_val['label'])
print("Amharic train labels:", train_labels)
print("Amharic val labels:", val_labels)
if len(train_labels) != amharic_num_labels:
    # A class missing from the training split cannot be learned. A larger
    # test_size would shrink the training split further, so the advice is to
    # reduce it, stratify the split, or add data.
    print(
        f"Warning: Only {len(train_labels)}/{amharic_num_labels} labels in training set. "
        f"Consider decreasing test_size, stratifying the split, or collecting more data."
    )

In [None]:
# 5. Initialize model
# A warning about uninitialized weights is expected here: the classification
# head is new and is trained in the cells below.
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    label2id=amharic_label2id,
    id2label=amharic_id2label,
    num_labels=amharic_num_labels,
)
# Move the weights to the selected device
model = model.to(device)

In [None]:
# 6. Define compute_metrics function
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        Dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # NOTE(review): zero_division=1 scores an undefined precision/recall as 1,
    # which can look optimistic for classes that are never predicted - confirm
    # this is intended.
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=1
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": weighted_f1,
        "precision": weighted_precision,
        "recall": weighted_recall,
    }

In [None]:
# 7. Fine-tune on Amharic data
# Checkpoints and the final model/tokenizer share one output directory
amharic_model_dir = os.path.join(home_dir, "AM_Tig_Transfer_Learning", "amharic_model")

training_args = TrainingArguments(
    output_dir=amharic_model_dir,
    logging_dir=os.path.join(home_dir, "AM_Tig_Transfer_Learning", "logs"),
    logging_steps=10,
    # optimisation settings
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate and checkpoint once per epoch; both strategies are set to the
    # same value alongside load_best_model_at_end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=amharic_train,
    eval_dataset=amharic_val,
)

print("Fine-tuning on Amharic data...")
trainer.train()

# Persist the fine-tuned model, then the tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(amharic_model_dir)


In [None]:
# 8. Transfer learning to Tigrinya
# Continue training the Amharic-tuned model on the Tigrinya data.
tigrinya_model_dir = os.path.join(home_dir, "AM_Tig_Transfer_Learning", "tigrinya_model")

# Hold out 20% of the Tigrinya data for validation (fixed seed)
tigrinya_split = tigrinya_tokenized.train_test_split(test_size=0.2, seed=42)
tigrinya_train, tigrinya_val = tigrinya_split["train"], tigrinya_split["test"]

# Show which label ids ended up in each split
tigrinya_train_labels = np.unique(tigrinya_train['label'])
tigrinya_val_labels = np.unique(tigrinya_val['label'])
print("Tigrinya train labels:", tigrinya_train_labels)
print("Tigrinya val labels:", tigrinya_val_labels)

# Reuse the existing TrainingArguments object, overriding selected fields.
# Every setting not listed here keeps the value given when it was created.
# NOTE(review): fields assigned after construction may skip the constructor's
# own validation/normalisation - confirm this is acceptable for the installed
# transformers version.
for field_name, field_value in (
    ("output_dir", tigrinya_model_dir),
    ("num_train_epochs", 3),
    ("per_device_train_batch_size", 8),
    ("per_device_eval_batch_size", 8),
    ("learning_rate", 2e-5),
    ("eval_strategy", "epoch"),
):
    setattr(training_args, field_name, field_value)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tigrinya_train,
    eval_dataset=tigrinya_val,
)

print("Fine-tuning on Tigrinya data...")
trainer.train()

# Save the final model, then the tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(tigrinya_model_dir)

In [None]:
# 9. Evaluate on the Tigrinya validation split
# The trainer from the previous cell was built with eval_dataset=tigrinya_val,
# so evaluate() scores that held-out split.
print("Evaluating on Tigrinya validation set...")
eval_results = trainer.evaluate()
print("Evaluation results: " + str(eval_results))