In [2]:
!pip install pycocotools

Collecting pycocotools
  Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (427 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.8/427.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: pycocotools
Successfully installed pycocotools-2.0.8


In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Embedding, LSTM, Input, Concatenate
from tensorflow.keras.models import Model
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from pycocotools.coco import COCO
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [4]:
# Parameters
IMG_HEIGHT = 224
IMG_WIDTH = 224
TEXT_SEQUENCE_LENGTH = 100
VOCAB_SIZE = 20000
EMBEDDING_DIM = 128

# Paths to datasets
liar_data_path = "/kaggle/input/liar-fake-news-dataset/train.tsv"
coco_train_image_dir = "/kaggle/input/coco-2017-dataset/coco2017/train2017"
coco_val_image_dir = "/kaggle/input/coco-2017-dataset/coco2017/val2017"
coco_train_annotation_file = "/kaggle/input/coco-2017-dataset/coco2017/annotations/captions_train2017.json"
coco_val_annotation_file = "/kaggle/input/coco-2017-dataset/coco2017/annotations/captions_val2017.json"


In [5]:
def create_misinformation_model():
    # Image branch (CNN)
    image_input = Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3), name="image_input")
    x = Conv2D(32, (3, 3), activation='relu')(image_input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    
    # Text branch (LSTM)
    text_input = Input(shape=(TEXT_SEQUENCE_LENGTH,), name="text_input")
    text_embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=TEXT_SEQUENCE_LENGTH)(text_input)
    text_lstm = LSTM(128)(text_embedding)
    
    # Combine both branches
    combined = Concatenate()([x, text_lstm])
    combined = Dense(128, activation='relu')(combined)
    output = Dense(1, activation='sigmoid')(combined)
    
    # Build and compile the model
    model = Model(inputs=[image_input, text_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

# Instantiate and print the model summary
model = create_misinformation_model()
model.summary()



In [6]:
# Function to preprocess LIAR dataset
def preprocess_liar_data(filepath):
    column_names = [
        "ID", "label", "statement", "subjects", "speaker", 
        "speaker_job_title", "state_info", "party_affiliation",
        "barely_true_counts", "false_counts", "half_true_counts", 
        "mostly_true_counts", "pants_on_fire_counts", "context"
    ]
    liar_data = pd.read_csv(filepath, sep='\t', names=column_names, header=0)
    
    texts = liar_data["statement"].values
    labels = liar_data["label"].values

    # Simplify labels (0 for real, 1 for fake)
    def simplify_label(label):
        return 0 if label in ['true', 'mostly-true'] else 1

    labels = [simplify_label(label) for label in labels]

    tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    text_data = pad_sequences(sequences, maxlen=TEXT_SEQUENCE_LENGTH)

    return np.array(text_data), np.array(labels)

# Preprocess LIAR data
text_data, labels = preprocess_liar_data(liar_data_path)

In [7]:
def preprocess_coco_data(image_dir, annotation_file, max_images=5000):
    coco = COCO(annotation_file)
    image_ids = coco.getImgIds()[:max_images]
    
    image_data, caption_data = [], []
    
    for img_id in image_ids:
        img_info = coco.loadImgs(img_id)[0]
        img_path = os.path.join(image_dir, img_info['file_name'])
        
        if os.path.exists(img_path):
            img = load_img(img_path, target_size=(IMG_HEIGHT, IMG_WIDTH))
            img = img_to_array(img) / 255.0
            image_data.append(img)
            
            ann_ids = coco.getAnnIds(imgIds=img_id)
            anns = coco.loadAnns(ann_ids)
            caption = anns[0]['caption']
            caption_data.append(caption)
    
    return np.array(image_data), caption_data

# Load and preprocess COCO train and validation data
train_image_data, train_captions = preprocess_coco_data(coco_train_image_dir, coco_train_annotation_file)
val_image_data, val_captions = preprocess_coco_data(coco_val_image_dir, coco_val_annotation_file)

loading annotations into memory...
Done (t=1.87s)
creating index...
index created!
loading annotations into memory...
Done (t=0.08s)
creating index...
index created!


In [8]:
# Tokenize and pad train captions
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_captions)
train_caption_data = pad_sequences(tokenizer.texts_to_sequences(train_captions), maxlen=TEXT_SEQUENCE_LENGTH)

# Tokenize and pad validation captions
val_caption_data = pad_sequences(tokenizer.texts_to_sequences(val_captions), maxlen=TEXT_SEQUENCE_LENGTH)


In [9]:
# Align dataset sizes by taking the minimum number of samples
min_samples = min(len(text_data), len(labels), len(train_image_data), len(train_caption_data))
text_data = text_data[:min_samples]
labels = labels[:min_samples]
train_image_data = train_image_data[:min_samples]
train_caption_data = train_caption_data[:min_samples]

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices(((train_image_data, train_caption_data), labels))
train_dataset = train_dataset.batch(32).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices(((val_image_data, val_caption_data), labels[:len(val_caption_data)]))
val_dataset = val_dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [10]:
# Load the model
model = create_misinformation_model()

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=[early_stopping]
)

Epoch 1/10




[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 79ms/step - accuracy: 0.6202 - loss: 0.7844 - val_accuracy: 0.6406 - val_loss: 0.6552
Epoch 2/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 61ms/step - accuracy: 0.6384 - loss: 0.6420 - val_accuracy: 0.5902 - val_loss: 0.7022
Epoch 3/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.7314 - loss: 0.5373 - val_accuracy: 0.5514 - val_loss: 0.8282
Epoch 4/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 60ms/step - accuracy: 0.8102 - loss: 0.4108 - val_accuracy: 0.4994 - val_loss: 1.0288
Epoch 5/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.8324 - loss: 0.3668 - val_accuracy: 0.5358 - val_loss: 0.9353
Epoch 6/10
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - accuracy: 0.8364 - loss: 0.3548 - val_accuracy: 0.5728 - val_loss: 1.0308


In [11]:
# Evaluate the model
test_loss, test_accuracy = model.evaluate(val_dataset)

# Display the results
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.6346 - loss: 0.6600
Test Loss: 0.6551756262779236
Test Accuracy: 0.6406000256538391
