In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

# Read the three CSV splits from the working directory
df_train = pd.read_csv('df_train4.csv')
df_valid = pd.read_csv('df_valid4.csv')
df_test = pd.read_csv('df_test4.csv')

# Stack train on top of validation (fresh 0..n-1 index) and cast labels to int
df_combined = pd.concat([df_train, df_valid], ignore_index=True).assign(
    label=lambda frame: frame['label'].astype(int)
)

# Same integer cast for the held-out test labels
df_test = df_test.assign(label=df_test['label'].astype(int))

# Define metrics function
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        p: Object exposing ``predictions`` (the predicted class is the argmax
            over the last axis) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    preds = p.predictions.argmax(-1)
    # zero_division=0 yields the same value as the default "warn" mode for a
    # class that is never predicted, but without an UndefinedMetricWarning on
    # every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        p.label_ids, preds, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(p.label_ids, preds)
    return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

# Checkpoint identifiers to compare
model_names = [
    f"w11wo/javanese-{architecture}-small-imdb-classifier"
    for architecture in ("bert", "gpt2", "distilbert")
]

# Hyperparameters shared by every training run below
learning_rate, batch_size, num_epochs = 5e-5, 16, 3

# Stratified, shuffled splitter with a fixed seed
k_folds = 5
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# One summary dict per model is appended here by the cross-validation loop
cross_val_results = list()

# Perform cross-validation for each model
for model_name in model_names:
    print(f"\nCross-validating model: {model_name}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 style tokenizers may come without a padding token, and padding to
    # max_length needs one. Fall back to EOS; this is a no-op when the
    # checkpoint already defines a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Tokenization only depends on the tokenizer, so define it once per model
    # rather than once per fold.
    def preprocess_function(examples):
        return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=128)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    fold_metrics = []

    # Prepare datasets for each fold
    for fold, (train_index, val_index) in enumerate(skf.split(df_combined['sentence'], df_combined['label'])):
        print(f"Fold {fold+1}")

        train_data = df_combined.iloc[train_index]
        val_data = df_combined.iloc[val_index]

        # Convert to Hugging Face datasets
        train_dataset = Dataset.from_pandas(train_data)
        val_dataset = Dataset.from_pandas(val_data)

        # Preprocess data
        tokenized_train = train_dataset.map(preprocess_function, batched=True)
        tokenized_val = val_dataset.map(preprocess_function, batched=True)

        # Load a fresh model for every fold so no weights leak between folds
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
        # Per the transformers docs, GPT-2 sequence classification uses
        # config.pad_token_id to find the last non-padding token and cannot
        # handle batches larger than one without it.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=f'./results/{model_name}/fold_{fold}',
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="no",
            load_best_model_at_end=False,
            push_to_hub=False
        )

        # Initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        # Train the model
        trainer.train()

        # Evaluate on validation set
        val_results = trainer.predict(tokenized_val)
        fold_metrics.append(compute_metrics(val_results))

    # Average metrics for the current model
    avg_metrics = {
        'Model': model_name,
        'Accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
        'F1 Score': np.mean([m['f1'] for m in fold_metrics]),
        'Precision': np.mean([m['precision'] for m in fold_metrics]),
        'Recall': np.mean([m['recall'] for m in fold_metrics])
    }
    cross_val_results.append(avg_metrics)

# Create a DataFrame for the cross-validation results
cross_val_results_df = pd.DataFrame(cross_val_results)

# Display the cross-validation results table
print("\nCross-Validation Results:")
print(cross_val_results_df)

# Train on combined data and evaluate on the test set
test_results = []

for model_name in model_names:
    print(f"\nTraining and evaluating {model_name} on the test set.")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 style tokenizers may come without a padding token, and padding to
    # max_length needs one. Fall back to EOS; this is a no-op when the
    # checkpoint already defines a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Single tokenization routine shared by the training and the test split
    def preprocess_function(examples):
        return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=128)

    # Combine the train and validation data for the model
    train_dataset = Dataset.from_pandas(df_combined)

    # Preprocess the combined dataset
    tokenized_train = train_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
    # Per the transformers docs, GPT-2 sequence classification uses
    # config.pad_token_id to find the last non-padding token and cannot handle
    # batches larger than one without it.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Define training arguments for final training
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}/final',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        push_to_hub=False
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train on the full combined dataset
    trainer.train()

    # Preprocess test data with the same routine used for training
    test_dataset = Dataset.from_pandas(df_test)
    tokenized_test = test_dataset.map(preprocess_function, batched=True)

    # Evaluate on the test set
    test_results_output = trainer.predict(tokenized_test)
    test_metrics = compute_metrics(test_results_output)

    # Store the test results
    test_results.append({
        'Model': model_name,
        'Test Accuracy': test_metrics['accuracy'],
        'Test F1 Score': test_metrics['f1'],
        'Test Precision': test_metrics['precision'],
        'Test Recall': test_metrics['recall']
    })

# Create a DataFrame for the final results
test_results_df = pd.DataFrame(test_results)

# Display the final test results table
print("\nTest Set Evaluation Results:")
print(test_results_df)

In [None]:
# Bare last expression: the notebook renders the cross-validation summary table.
cross_val_results_df

In [None]:
# Bare last expression: the notebook renders the test-set summary table.
test_results_df

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def load_data():
    """Read the three CSV splits and return (train+validation, test).

    The train and validation frames are concatenated with a fresh index, and
    the ``label`` column of both returned frames is cast to ``int``.

    Returns:
        tuple: ``(df_combined, df_test)`` as pandas DataFrames.
    """
    train_frame = pd.read_csv('df_train4.csv')
    valid_frame = pd.read_csv('df_valid4.csv')
    test_frame = pd.read_csv('df_test4.csv')

    combined = pd.concat([train_frame, valid_frame], ignore_index=True)
    combined = combined.assign(label=combined['label'].astype(int))
    test_frame = test_frame.assign(label=test_frame['label'].astype(int))
    return combined, test_frame

def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    Args:
        p: Object exposing ``predictions`` (the predicted class is the argmax
            over the last axis) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)
    # zero_division=0 yields the same value as the default "warn" mode for a
    # class that is never predicted, but without an UndefinedMetricWarning on
    # every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(y_true, y_pred)
    return {'accuracy': accuracy, 'f1': f1, 'precision': precision, 'recall': recall}

def preprocess_function(examples, tokenizer):
    """Tokenize the ``sentence`` field of ``examples`` with ``tokenizer``.

    The tokenizer is called with truncation enabled and padding to a fixed
    length of 128; its return value is passed through unchanged.
    """
    tokenizer_options = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
    sentences = examples['sentence']
    return tokenizer(sentences, **tokenizer_options)

def train_and_evaluate_model(model_name, df_combined, skf):
    """Fine-tune one checkpoint on every fold of ``skf`` and average the fold metrics.

    Args:
        model_name: Checkpoint identifier handed to the ``from_pretrained`` loaders.
        df_combined: DataFrame providing the ``sentence`` and ``label`` columns.
        skf: Splitter whose ``split(X, y)`` yields ``(train_index, val_index)`` pairs.

    Returns:
        dict with the keys 'Model', 'Accuracy', 'F1 Score', 'Precision' and
        'Recall'; each metric is the mean over all folds.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 style tokenizers may come without a padding token, and padding to
    # max_length needs one. Fall back to EOS; this is a no-op when the
    # checkpoint already defines a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # The collator depends only on the tokenizer, so build it once per model.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    fold_metrics = []
    for fold, (train_index, val_index) in enumerate(skf.split(df_combined['sentence'], df_combined['label'])):
        train_data = df_combined.iloc[train_index]
        val_data = df_combined.iloc[val_index]
        train_dataset = Dataset.from_pandas(train_data)
        val_dataset = Dataset.from_pandas(val_data)
        tokenized_train = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
        tokenized_val = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

        # Fresh model per fold so no weights leak between folds.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
        # Per the transformers docs, GPT-2 sequence classification uses
        # config.pad_token_id to find the last non-padding token and cannot
        # handle batches larger than one without it.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        training_args = TrainingArguments(
            output_dir=f'/scratch/lf93/iw/cv_results/{model_name}/fold_{fold}',
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="no",
            load_best_model_at_end=False,
            push_to_hub=False
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )
        trainer.train()
        val_results = trainer.predict(tokenized_val)
        fold_metrics.append(compute_metrics(val_results))
    avg_metrics = {
        'Model': model_name,
        'Accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
        'F1 Score': np.mean([m['f1'] for m in fold_metrics]),
        'Precision': np.mean([m['precision'] for m in fold_metrics]),
        'Recall': np.mean([m['recall'] for m in fold_metrics])
    }
    return avg_metrics

def cross_validate_models(model_names, df_combined, k_folds):
    """Cross-validate every checkpoint and collect one summary row per model.

    A single shuffled ``StratifiedKFold`` splitter (``random_state=42``) with
    ``k_folds`` splits is shared by all models.

    Returns:
        pandas.DataFrame built from the per-model summary dicts.
    """
    splitter = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
    summary_rows = []
    for checkpoint in model_names:
        print(f"\nCross-validating model: {checkpoint}")
        summary_rows.append(train_and_evaluate_model(checkpoint, df_combined, splitter))
    return pd.DataFrame(summary_rows)

def train_final_models(model_names, df_combined, df_test):
    """Train each checkpoint on the combined data and score it on the test set.

    Args:
        model_names: Iterable of checkpoint identifiers.
        df_combined: Training DataFrame with ``sentence`` and ``label`` columns.
        df_test: Test DataFrame with the same columns.

    Returns:
        pandas.DataFrame with one row per model and the columns 'Model',
        'Test Accuracy', 'Test F1 Score', 'Test Precision' and 'Test Recall'.
    """
    # The untokenized datasets do not depend on the model; build them once.
    train_dataset = Dataset.from_pandas(df_combined)
    test_dataset = Dataset.from_pandas(df_test)

    test_results = []
    for model_name in model_names:
        print(f"\nTraining and evaluating {model_name} on the test set.")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # GPT-2 style tokenizers may come without a padding token, and padding
        # to max_length needs one. Fall back to EOS; this is a no-op when the
        # checkpoint already defines a pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        tokenized_train = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
        # Per the transformers docs, GPT-2 sequence classification uses
        # config.pad_token_id to find the last non-padding token and cannot
        # handle batches larger than one without it.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        training_args = TrainingArguments(
            output_dir=f'/scratch/lf93/iw/cv_results/{model_name}/final',
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            push_to_hub=False
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )
        trainer.train()
        tokenized_test = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
        test_results_output = trainer.predict(tokenized_test)
        test_metrics = compute_metrics(test_results_output)
        test_results.append({
            'Model': model_name,
            'Test Accuracy': test_metrics['accuracy'],
            'Test F1 Score': test_metrics['f1'],
            'Test Precision': test_metrics['precision'],
            'Test Recall': test_metrics['recall']
        })
    return pd.DataFrame(test_results)

# Main execution: load the data, cross-validate each checkpoint, then score on the test set
df_combined, df_test = load_data()

model_names = [
    f"w11wo/javanese-{architecture}-small-imdb-classifier"
    for architecture in ("bert", "gpt2", "distilbert")
]

cross_val_results_df = cross_validate_models(model_names, df_combined, k_folds=5)
print("\nCross-Validation Results:", cross_val_results_df, sep="\n")

test_results_df = train_final_models(model_names, df_combined, df_test)
print("\nTest Set Evaluation Results:", test_results_df, sep="\n")


In [None]:
# Bare last expression: the notebook renders the test-set summary table.
test_results_df

In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np

def load_data():
    """Load the CSV splits; return the merged train/validation frame and the test frame.

    Train and validation rows are stacked under a new 0..n-1 index, and the
    ``label`` column of both returned frames is converted to ``int``.

    Returns:
        tuple: ``(df_combined, df_test)`` as pandas DataFrames.
    """
    csv_names = ('df_train4.csv', 'df_valid4.csv', 'df_test4.csv')
    train_part, valid_part, held_out = (pd.read_csv(name) for name in csv_names)

    merged = pd.concat([train_part, valid_part], ignore_index=True)
    return merged.astype({'label': int}), held_out.astype({'label': int})

def compute_metrics(p):
    """Compute accuracy, weighted precision/recall/F1 and the confusion matrix.

    Args:
        p: Object exposing ``predictions`` (the predicted class is the argmax
            over the last axis) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision', 'recall' and
        'confusion_matrix' (the value returned by sklearn's ``confusion_matrix``).
    """
    y_true = p.label_ids
    y_pred = p.predictions.argmax(-1)
    accuracy = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same value as the default "warn" mode for a
    # class that is never predicted, but without an UndefinedMetricWarning on
    # every evaluation pass.
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted', zero_division=0
    )
    conf_matrix = confusion_matrix(y_true, y_pred)
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'confusion_matrix': conf_matrix
    }

def preprocess_function(examples, tokenizer):
    """Run ``tokenizer`` over ``examples['sentence']`` and return its output.

    Sequences are truncated and padded to a fixed length of 128.
    """
    text_batch = examples['sentence']
    encoded = tokenizer(
        text_batch,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    return encoded

def train_and_evaluate_model(model_name, df_combined, skf):
    """Fine-tune one checkpoint on every fold of ``skf`` and summarise the fold metrics.

    Args:
        model_name: Checkpoint identifier handed to the ``from_pretrained`` loaders.
        df_combined: DataFrame providing the ``sentence`` and ``label`` columns.
        skf: Splitter whose ``split(X, y)`` yields ``(train_index, val_index)`` pairs.

    Returns:
        dict with the keys 'Model', 'Accuracy', 'F1 Score', 'Precision' and
        'Recall' (each the mean over all folds) plus 'Confusion Matrices',
        a list holding one confusion matrix per fold.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # GPT-2 style tokenizers may come without a padding token, and padding to
    # max_length needs one. Fall back to EOS; this is a no-op when the
    # checkpoint already defines a pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # The collator depends only on the tokenizer, so build it once per model.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    fold_metrics = []
    for fold, (train_index, val_index) in enumerate(skf.split(df_combined['sentence'], df_combined['label'])):
        train_data = df_combined.iloc[train_index]
        val_data = df_combined.iloc[val_index]
        train_dataset = Dataset.from_pandas(train_data)
        val_dataset = Dataset.from_pandas(val_data)
        tokenized_train = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
        tokenized_val = val_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

        # Fresh model per fold so no weights leak between folds.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
        # Per the transformers docs, GPT-2 sequence classification uses
        # config.pad_token_id to find the last non-padding token and cannot
        # handle batches larger than one without it.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        training_args = TrainingArguments(
            output_dir=f'/scratch/lf93/iw/cv_results/{model_name}/fold_{fold}',
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy="no",
            load_best_model_at_end=False,
            push_to_hub=False
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_val,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )
        trainer.train()
        val_results = trainer.predict(tokenized_val)
        fold_metrics.append(compute_metrics(val_results))
    avg_metrics = {
        'Model': model_name,
        'Accuracy': np.mean([m['accuracy'] for m in fold_metrics]),
        'F1 Score': np.mean([m['f1'] for m in fold_metrics]),
        'Precision': np.mean([m['precision'] for m in fold_metrics]),
        'Recall': np.mean([m['recall'] for m in fold_metrics]),
        'Confusion Matrices': [m['confusion_matrix'] for m in fold_metrics]
    }
    return avg_metrics

def cross_validate_models(model_names, df_combined, k_folds):
    """Run k-fold cross-validation for each checkpoint and tabulate the summaries.

    All models share one shuffled ``StratifiedKFold`` splitter
    (``n_splits=k_folds``, ``random_state=42``).

    Returns:
        pandas.DataFrame with one row per entry of ``model_names``.
    """
    fold_splitter = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

    def _evaluate(name):
        # Announce the model before its folds start training.
        print(f"\nCross-validating model: {name}")
        return train_and_evaluate_model(name, df_combined, fold_splitter)

    return pd.DataFrame([_evaluate(name) for name in model_names])

def train_final_models(model_names, df_combined, df_test):
    """Train each checkpoint on the combined data, then report test-set metrics.

    Args:
        model_names: Iterable of checkpoint identifiers.
        df_combined: Training DataFrame with ``sentence`` and ``label`` columns.
        df_test: Test DataFrame with the same columns.

    Returns:
        pandas.DataFrame with one row per model and the columns 'Model',
        'Test Accuracy', 'Test F1 Score', 'Test Precision', 'Test Recall' and
        'Test Confusion Matrix'. The same metrics are also printed per model.
    """
    # The untokenized datasets do not depend on the model; build them once.
    train_dataset = Dataset.from_pandas(df_combined)
    test_dataset = Dataset.from_pandas(df_test)

    test_results = []
    for model_name in model_names:
        print(f"\nTraining and evaluating {model_name} on the test set.")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # GPT-2 style tokenizers may come without a padding token, and padding
        # to max_length needs one. Fall back to EOS; this is a no-op when the
        # checkpoint already defines a pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        tokenized_train = train_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4, ignore_mismatched_sizes=True)
        # Per the transformers docs, GPT-2 sequence classification uses
        # config.pad_token_id to find the last non-padding token and cannot
        # handle batches larger than one without it.
        if model.config.pad_token_id is None:
            model.config.pad_token_id = tokenizer.pad_token_id

        training_args = TrainingArguments(
            output_dir=f'/scratch/lf93/iw/cv_results/{model_name}/final',
            learning_rate=5e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=3,
            weight_decay=0.01,
            push_to_hub=False
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )
        trainer.train()
        tokenized_test = test_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
        test_results_output = trainer.predict(tokenized_test)
        test_metrics = compute_metrics(test_results_output)
        print(f"\nEvaluation for model {model_name} on the test set:")
        print(f"Accuracy: {test_metrics['accuracy']:.2f}")
        print(f"Precision: {test_metrics['precision']:.2f}")
        print(f"Recall: {test_metrics['recall']:.2f}")
        print(f"F1 Score: {test_metrics['f1']:.2f}")
        print("Confusion Matrix:")
        print(test_metrics['confusion_matrix'])
        test_results.append({
            'Model': model_name,
            'Test Accuracy': test_metrics['accuracy'],
            'Test F1 Score': test_metrics['f1'],
            'Test Precision': test_metrics['precision'],
            'Test Recall': test_metrics['recall'],
            'Test Confusion Matrix': test_metrics['confusion_matrix']
        })
    return pd.DataFrame(test_results)




In [None]:
# Main execution: load the data, run cross-validation, then the final test-set evaluation
df_combined, df_test = load_data()

model_names = [
    "w11wo/javanese-" + family + "-small-imdb-classifier"
    for family in ("bert", "gpt2", "distilbert")
]

# Per-model averages over 5 stratified folds
cross_val_results_df = cross_validate_models(model_names, df_combined, k_folds=5)
print("\nCross-Validation Results:", cross_val_results_df, sep="\n")

# Retrain on all of df_combined and score on df_test
test_results_df = train_final_models(model_names, df_combined, df_test)
print("\nTest Set Evaluation Results:", test_results_df, sep="\n")

In [None]:
# Bare last expression: the notebook renders the cross-validation summary table.
cross_val_results_df

In [None]:
# Bare last expression: the notebook renders the test-set summary table.
test_results_df