In [3]:
import pandas as pd
import numpy as np
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
import os

# File paths
DATASET_PATH = "./fake/"
test_bodies_path = os.path.join(DATASET_PATH, 'test_bodies.csv')
test_stances_path = os.path.join(DATASET_PATH, 'test_stances_unlabeled.csv')

# Load test data
test_bodies = pd.read_csv(test_bodies_path)
test_stances = pd.read_csv(test_stances_path)

# Ensure 'Body ID' column is of the same type in both dataframes
test_stances['Body ID'] = test_stances['Body ID'].astype(str)
test_bodies['Body ID'] = test_bodies['Body ID'].astype(str)

# Merge the test datasets on 'Body ID'
merged_test_data = pd.merge(test_stances, test_bodies[['Body ID', 'articleBody']], on='Body ID', how='left')

# Extract headlines and bodies
headlines_test = merged_test_data['Headline'].fillna('')
bodies_test = merged_test_data['articleBody'].fillna('')

# Tokenizer settings (same as used for training)
max_features = 2000
MAX_SEQUENCE_LENGTH_HEADLINE = 16
MAX_SEQUENCE_LENGTH_BODY = 48
embedding_dim = 50

# Tokenize and pad the headlines and bodies
tokenizer_headline = Tokenizer(num_words=max_features, split=' ')
tokenizer_body = Tokenizer(num_words=max_features, split=' ')
tokenizer_headline.fit_on_texts(headlines_test)
tokenizer_body.fit_on_texts(bodies_test)

encoded_docs_headline_test = tokenizer_headline.texts_to_sequences(headlines_test)
padded_docs_headline_test = pad_sequences(encoded_docs_headline_test, maxlen=MAX_SEQUENCE_LENGTH_HEADLINE, padding='post')

encoded_docs_body_test = tokenizer_body.texts_to_sequences(bodies_test)
padded_docs_body_test = pad_sequences(encoded_docs_body_test, maxlen=MAX_SEQUENCE_LENGTH_BODY, padding='post')

# Initialize variables to keep track of the best model
best_model_path = None
best_accuracy = 0
best_predictions = None

# Iterate over the model files
for model_file in os.listdir(DATASET_PATH):
    if model_file.endswith('.hdf5'):
        model_path = os.path.join(DATASET_PATH, model_file)
        
        # Load the model
        model = load_model(model_path)
        
        # Predict
        predictions = model.predict([padded_docs_headline_test, padded_docs_body_test])
        predicted_classes = np.argmax(predictions, axis=1)

        # Calculate accuracy (optional: if you have the true labels)
        # accuracy = np.mean(predicted_classes == true_labels) # If true_labels are available
        
        # For now, let's assume the latest model is the best based on the validation accuracy
        current_accuracy = float(model_file.split('-')[1].replace('.hdf5', ''))
        if current_accuracy > best_accuracy:
            best_accuracy = current_accuracy
            best_model_path = model_path
            best_predictions = predicted_classes

# Map predictions to stance categories if necessary
stance_mapping = {0: 'agree', 1: 'disagree', 2: 'discuss', 3: 'unrelated'}
predicted_stances = [stance_mapping[class_id] for class_id in best_predictions]

# Save predictions to CSV
merged_test_data['Predicted_Stance'] = predicted_stances
merged_test_data.to_csv(os.path.join(DATASET_PATH, 'test_predictions.csv'), index=False)

print(f"Testing complete with the best model '{best_model_path}'. Predictions saved to 'test_predictions.csv'.")


TypeError: 'NoneType' object is not iterable