In [None]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import StratifiedKFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import classification_report
import numpy as np

In [None]:
def evaluate_on_unseen_data(model_name, unseen_data_path, label_column='label', model_dir=None):
    """Evaluate a saved sequence-classification model on a labelled CSV file.

    The CSV is read, tokenized, run through the model, and the predicted class
    ids are compared with the true labels. A classification report and a
    confusion matrix are printed.

    Parameters
    ----------
    model_name : str
        Identifier supplied by the caller. It is NOT used to pick the
        checkpoint: unless ``model_dir`` is given, the same saved model is
        evaluated whatever value is passed here.
    unseen_data_path : str
        Path of the CSV file to evaluate on; read with ISO-8859-1 encoding.
    label_column : str, default 'label'
        Name of the CSV column holding the true class labels.
    model_dir : str, optional
        Directory to load the tokenizer and model from. Defaults to the
        checkpoint path that was previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The loaded data with an added ``'predictions'`` column, so the caller
        can inspect it or save it (e.g. with ``DataFrame.to_csv``).

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of the CSV file.
    """
    if model_dir is None:
        # Previously hard-coded checkpoint; kept as the default so existing
        # calls behave as before.
        model_dir = '/scratch/lf93/iw/group_result/javanese-distilbert-small-imdb-classifier/final_model'

    # Read and validate the data first so a bad path or a missing label column
    # fails before the slow model load and prediction pass.
    unseen_data = pd.read_csv(unseen_data_path, encoding='ISO-8859-1')
    if label_column not in unseen_data.columns:
        raise KeyError(
            f"label column {label_column!r} not found in {unseen_data_path!r}; "
            f"available columns: {list(unseen_data.columns)}"
        )

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSequenceClassification.from_pretrained(model_dir)

    # Create a dataset from the pandas dataframe and tokenize it.
    # preprocess_function is defined elsewhere in the notebook.
    unseen_dataset = Dataset.from_pandas(unseen_data)
    tokenized_unseen = unseen_dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)

    # The Trainer is used for inference only; no TrainingArguments are passed,
    # so the library defaults apply.
    trainer = Trainer(model=model, tokenizer=tokenizer)

    # Predicted class id = index of the largest score along the last axis.
    unseen_results_output = trainer.predict(tokenized_unseen)
    y_pred = unseen_results_output.predictions.argmax(-1)

    # Extract true labels
    y_true = unseen_data[label_column]

    # Add predictions to the dataframe that is returned to the caller
    unseen_data['predictions'] = y_pred

    # Generate a classification report using sklearn
    print("Sklearn Classification Report:")
    report = classification_report(y_true, y_pred)
    print(report)

    # Generate and print a confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:")
    print(cm)

    return unseen_data

In [None]:
# Load the evaluation CSV into a DataFrame for manual inspection.
# NOTE(review): unseen_data_path is only assigned in later cells of this view;
# on a fresh kernel this cell raises NameError unless the variable is defined
# earlier in the notebook -- confirm and move the assignment above this cell.
unseen_data = pd.read_csv(unseen_data_path, encoding='ISO-8859-1')

In [None]:
# Display the loaded evaluation data as the cell output.
unseen_data

In [None]:
# Evaluate on the news dataset, once per entry in model_names
# (model_names is defined elsewhere in the notebook).
# NOTE(review): evaluate_on_unseen_data loads a fixed checkpoint by default and
# does not use model_name to choose it, so every iteration scores the same model.
unseen_data_path = 'news.csv'
for model_name in model_names:
    evaluate_on_unseen_data(
        model_name=model_name,
        unseen_data_path=unseen_data_path,
    )

In [None]:
# Evaluate on the magazine dataset, once per entry in model_names
# (model_names is defined elsewhere in the notebook).
# NOTE(review): evaluate_on_unseen_data loads a fixed checkpoint by default and
# does not use model_name to choose it, so every iteration scores the same model.
unseen_data_path = 'magz.csv'
for model_name in model_names:
    evaluate_on_unseen_data(
        model_name=model_name,
        unseen_data_path=unseen_data_path,
    )