In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

# Function to load data with different encodings
def load_data(file_path, encodings=('utf-8', 'latin1', 'ISO-8859-1')):
    """Read a CSV file, trying several text encodings in order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.
    encodings : iterable of str, optional
        Encoding names to try, in order. The first one that decodes the file
        without a ``UnicodeDecodeError`` wins. The default keeps the original
        hard-coded list; note that 'latin1' and 'ISO-8859-1' name the same
        codec, so the third entry only repeats the second.

    Returns
    -------
    pandas.DataFrame
        The parsed file, decoded with the first encoding that worked.

    Raises
    ------
    ValueError
        If none of the encodings could decode the file. The last
        ``UnicodeDecodeError`` is attached as the exception's cause.
    """
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as exc:
            # Remember why this attempt failed and move on to the next encoding.
            last_error = exc
    # Chain the last decode error so the failing byte/offset is not lost.
    raise ValueError("Failed to decode the file with tried encodings.") from last_error

# Load the dataset (falls back through several encodings, see load_data)
df_all = load_data('df_all_group3.csv')

# Ensure labels are integers
df_all['label'] = df_all['label'].astype(int)

# Perform group-aware split: test_size=0.2 holds out roughly 20% of the *groups*
# (not of the rows), and all rows of a group land on the same side of the split.
gss = GroupShuffleSplit(test_size=0.2, random_state=42)
train_indices, test_indices = next(gss.split(df_all, groups=df_all['group']))
# split() yields positional row indices, so select with .iloc; label-based .loc
# only gives the same rows while the frame carries a default 0..n-1 index.
df_train = df_all.iloc[train_indices]
df_test = df_all.iloc[test_indices]

In [None]:
# Write the test split to 'df_test_group.csv'; index=False leaves the DataFrame index out of the file.
df_test.to_csv('df_test_group.csv', index=False)

In [None]:
# Write the training split to 'df_train_group.csv'; index=False leaves the DataFrame index out of the file.
df_train.to_csv('df_train_group.csv', index=False)

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

# Load the dataset
# NOTE(review): this reads with pandas' default decoding, whereas the first cell
# goes through load_data() with encoding fallbacks - confirm the file is UTF-8.
df_all = pd.read_csv('df_all_group3.csv')

# Ensure labels are integers
df_all['label'] = df_all['label'].astype(int)

# Perform group-aware split: test_size=0.2 holds out roughly 20% of the *groups*
# (not of the rows), and all rows of a group land on the same side of the split.
gss = GroupShuffleSplit(test_size=0.2, random_state=42)
train_indices, test_indices = next(gss.split(df_all, groups=df_all['group']))
# split() yields positional row indices, so select with .iloc; label-based .loc
# only gives the same rows while the frame carries a default 0..n-1 index.
df_train = df_all.iloc[train_indices]
df_test = df_all.iloc[test_indices]

# Define metrics function
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Parameters
    ----------
    p : object with ``predictions`` and ``label_ids`` attributes
        ``predictions`` holds the model scores with the class dimension last,
        either as a single array or as a tuple whose first element is that
        array. ``label_ids`` holds the true class ids.

    Returns
    -------
    dict
        Keys 'accuracy', 'f1', 'precision' and 'recall'; the last three are
        support-weighted averages over the classes.
    """
    # Some models hand back a tuple of outputs; the logits come first.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # The predicted class is the index of the highest score on the last axis.
    preds = logits.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='weighted')
    accuracy = accuracy_score(p.label_ids, preds)
    return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

# Model names
# Checkpoint identifiers handed to AutoTokenizer / AutoModelForSequenceClassification
# below; every entry is cross-validated in turn.
model_names = [
    "w11wo/javanese-bert-small-imdb-classifier",
    "w11wo/javanese-gpt2-small-imdb-classifier",
    "w11wo/javanese-distilbert-small-imdb-classifier"
]

# Fixed hyperparameters
# The same values are used for every cross-validation fold and for the final training runs.
learning_rate = 5e-5
batch_size = 16  # applied to both the per-device train and eval batch size
num_epochs = 3

# Number of cross-validation folds; GroupKFold splits by group so that a group's
# rows are never shared between a fold's training and validation parts.
k_folds = 5
gkf = GroupKFold(n_splits=k_folds)

# Store cross-validation results for each model
# (one dict of fold-averaged metrics is appended per model)
cross_val_results = []

# Perform cross-validation for each model
for model_name in model_names:
    print(f"\nCross-validating model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some tokenizers (e.g. GPT-2 style ones) define no padding token, in which
    # case padding='max_length' below would raise; reuse the end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenization depends only on the tokenizer, so define it once per model
    # instead of once per fold.
    def preprocess_function(examples):
        """Tokenize the 'sentence' column, truncated/padded to 128 tokens."""
        return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=128)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    fold_metrics = []

    # Prepare datasets for each fold; groups keep related rows in the same fold
    for fold, (train_index, val_index) in enumerate(gkf.split(df_train['sentence'], df_train['label'], groups=df_train['group'])):
        print(f"Fold {fold+1}")

        # GroupKFold yields positional indices, hence .iloc
        train_data = df_train.iloc[train_index]
        val_data = df_train.iloc[val_index]

        # Convert to Hugging Face datasets; preserve_index=False avoids carrying the
        # pandas index along as an extra '__index_level_0__' column.
        train_dataset = Dataset.from_pandas(train_data, preserve_index=False)
        val_dataset = Dataset.from_pandas(val_data, preserve_index=False)

        # Preprocess data
        tokenized_train = train_dataset.map(preprocess_function, batched=True)
        tokenized_val = val_dataset.map(preprocess_function, batched=True)

        # Load a fresh model for every fold so no weights leak between folds.
        # ignore_mismatched_sizes lets the 4-class head replace the checkpoint's own head.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
        # Sequence classifiers that locate the last non-padding token need the
        # padding id in the model config to handle batches larger than one.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        # Checkpoints and the final fold model share this directory
        fold_dir = f'/scratch/lf93/iw/group_result/{model_name}/fold_{fold}'

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=fold_dir,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="epoch",  # Save the model after each epoch
            load_best_model_at_end=False,
            push_to_hub=False
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()

        # Save the model after each fold
        trainer.save_model(fold_dir)

        # Evaluate on validation set (the weights after the last epoch are used,
        # since load_best_model_at_end is False)
        val_results = trainer.predict(tokenized_val)
        fold_metrics.append(compute_metrics(val_results))

    # Average metrics for the current model (unweighted mean over the folds)
    avg_metrics = {
        'Model': model_name,
        'Accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
        'F1 Score': np.mean([m['f1'] for m in fold_metrics]),
        'Precision': np.mean([m['precision'] for m in fold_metrics]),
        'Recall': np.mean([m['recall'] for m in fold_metrics])
    }
    cross_val_results.append(avg_metrics)

# Gather the per-model averages (one dict per model) into a single table
cross_val_results_df = pd.DataFrame(data=cross_val_results)

# Show the table under its heading; the heading starts with a blank line
print("\nCross-Validation Results:", cross_val_results_df, sep="\n")

# Persist the table to disk, leaving the DataFrame index out of the file
cross_val_results_df.to_csv(path_or_buf='cross_val_results.csv', index=False)

# Select the best model for each model type
# NOTE(review): cross_val_results_df gets one row per entry of model_names, so each
# 'Model' group holds a single row and this selection currently keeps every model
# (ordered by the sorted group keys) - confirm whether a single winner was intended.
best_models = cross_val_results_df.loc[cross_val_results_df.groupby('Model')['F1 Score'].idxmax()]

# Train and evaluate each best model on the test set
test_results = []

for model_name in best_models['Model']:
    print(f"\nTraining and evaluating {model_name} on the test set.")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Some tokenizers (e.g. GPT-2 style ones) define no padding token, in which
    # case padding='max_length' below would raise; reuse the end-of-sequence token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # One tokenization routine shared by the training and the test data, so both
    # are guaranteed to be preprocessed identically.
    def preprocess_function(examples):
        """Tokenize the 'sentence' column, truncated/padded to 128 tokens."""
        return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=128)

    # df_train is the full training split, i.e. all cross-validation train and
    # validation rows together. preserve_index=False avoids carrying the pandas
    # index along as an extra '__index_level_0__' column.
    train_dataset = Dataset.from_pandas(df_train, preserve_index=False)
    tokenized_train = train_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Load the model; ignore_mismatched_sizes lets the 4-class head replace the
    # checkpoint's own classification head.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
    # Sequence classifiers that locate the last non-padding token need the
    # padding id in the model config to handle batches larger than one.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Define training arguments for final training
    training_args = TrainingArguments(
        output_dir=f'/scratch/lf93/iw/group_result/{model_name}/final',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        push_to_hub=False
    )

    # Initialize Trainer (no eval_dataset: the test set is only touched after training)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train on the full combined dataset
    trainer.train()

    # Save the final trained model
    trainer.save_model(f'/scratch/lf93/iw/group_result/{model_name}/final_model')

    # Preprocess test data with the same routine as the training data
    test_dataset = Dataset.from_pandas(df_test, preserve_index=False)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    # Evaluate on the test set
    test_results_output = trainer.predict(tokenized_test)
    test_metrics = compute_metrics(test_results_output)

    # Store the test results
    test_results.append({
        'Model': model_name,
        'Test Accuracy': test_metrics['accuracy'],
        'Test F1 Score': test_metrics['f1'],
        'Test Precision': test_metrics['precision'],
        'Test Recall': test_metrics['recall']
    })

# Gather the per-model test metrics (one dict per model) into a single table
test_results_df = pd.DataFrame(data=test_results)

# Show the table under its heading; the heading starts with a blank line
print("\nTest Set Evaluation Results:", test_results_df, sep="\n")
