In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Concatenate, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder, StandardScaler

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
def load_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None,
                     names=['label','file_name','label_text','statement','topic',
                            'speaker','speaker_job','state','party','barely_true_counts',
                            'false_counts','half_true_counts','mostly_true_counts',
                            'pants_on_fire_counts','venue','extracted_context'])
    return df

train_df = load_data('dataset/train2.tsv')
val_df = load_data('dataset/val2.tsv')
test_df = load_data('dataset/test2.tsv')

In [4]:
# There are 2 null values for speaker in the training data. Here we drop them
print("Before: ", train_df['speaker'].isnull().sum())
train_df = train_df.dropna(subset=['speaker'])
print("After: ", train_df['speaker'].isnull().sum())

Before:  2
After:  0


In [5]:
import regex as re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

def preprocess_data(data):

    data['statement'] = data['statement'].fillna('')  # Fill empty values as string
    data['statement'] = data['statement'].str.lower()  # Convert to lowercase
    data['statement'] = data['statement'].apply(lambda x: re.sub(r'[^\w\s]', '', x))  # Remove punctuation
    data['statement'] = data['statement'].apply(lambda x: re.sub(r'\d+', '', x))  # Remove numbers

    # len(word) > 2 might be unncessary. Let's try
    data['statement'] = data['statement'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words and len(word) > 2]))  # Remove stopwords

    data['processed_statement'] = data['statement'].apply(lambda x: word_tokenize(x))    # Tokenize

    lemmatizer = WordNetLemmatizer()  # Initialize lemmatizer
    data['processed_statement'] = data['processed_statement'].apply(
        lambda token_list: [lemmatizer.lemmatize(word) for word in token_list])

    return data

train_df = preprocess_data(train_df)
val_df = preprocess_data(val_df)
test_df = preprocess_data(test_df)
train_df["statement"]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,statement
0,says annies list political group supports thir...
1,decline coal start started natural gas took st...
2,hillary clinton agrees john mccain voting give...
3,health care reform legislation likely mandate ...
4,economic turnaround started end term
...,...
10235,larger number shark attacks florida cases vote...
10236,democrats become party atlanta metro area blacks
10237,says alternative social security operates galv...
10238,lifting cuban embargo allowing travel cuba


In [6]:
# Convert processed_statement from list of words to strings
train_texts = train_df['processed_statement'].apply(lambda x: ' '.join(x))
val_texts = val_df['processed_statement'].apply(lambda x: ' '.join(x))
test_texts = test_df['processed_statement'].apply(lambda x: ' '.join(x))

# Encode the labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['label_text'])
val_labels = label_encoder.transform(val_df['label_text'])
test_labels = label_encoder.transform(test_df['label_text'])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences (i.e., list of integers corresponding to words)
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad Sequences to Ensure Consistent Length
max_len = 100
train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post')
val_sequences = pad_sequences(val_sequences, maxlen=max_len, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Load GloVe Embeddings
def load_glove_embeddings(glove_path, vocab):
    embeddings_index = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs

    # Create an embedding matrix
    embedding_matrix = np.zeros((len(vocab) + 1, 100))
    for word, i in vocab.items():
        if word in embeddings_index:
            embedding_matrix[i] = embeddings_index[word]

    return embedding_matrix

# Get vocabulary from tokenizer
vocab = tokenizer.word_index

# Load GloVe embeddings (replace with the path to your downloaded GloVe file)
embedding_matrix = load_glove_embeddings('glove.6B.100d.txt', vocab)


In [7]:
# Replicating the model from the paper:
# "Where is your Evidence: Improving Fact-checking by Justification Modeling"

# Define the BiLSTM Model
# Input Layer
statement_input = Input(shape=(max_len,), name='statement_input')

# Embedding Layer with GloVe embeddings
embedding_layer = Embedding(
    input_dim=len(vocab) + 1,  # Add 1 for padding
    output_dim=100,            # GloVe embedding dimension
    weights=[embedding_matrix],  # Set the pre-trained weights
    trainable=False,            # Freeze GloVe embeddings
    name='embedding_layer'
)

# Apply the embedding layer
embedding = embedding_layer(statement_input)

# BiLSTM Layer
bilstm = Bidirectional(LSTM(32), name='bilstm_layer')(embedding)

# Softmax Output Layer
output = Dense(len(label_encoder.classes_), activation='softmax', name='output_layer')(bilstm)

# Compile the Model
model_s = Model(inputs=statement_input, outputs=output)
model_s.compile(optimizer=Adam(), loss=CategoricalCrossentropy(), metrics=['accuracy'])

# Model summary
model_s.summary()


In [8]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the labels
train_labels_one_hot = to_categorical(train_labels, num_classes=len(label_encoder.classes_))
val_labels_one_hot = to_categorical(val_labels, num_classes=len(label_encoder.classes_))
test_labels_one_hot = to_categorical(test_labels, num_classes=len(label_encoder.classes_))

# Train the Model
model_s.fit(train_sequences, train_labels_one_hot, validation_data=(val_sequences, val_labels_one_hot), epochs=10, batch_size=32) # We train for 10 epochs like in the paper

Epoch 1/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 77ms/step - accuracy: 0.2127 - loss: 1.7537 - val_accuracy: 0.2562 - val_loss: 1.7214
Epoch 2/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 75ms/step - accuracy: 0.2546 - loss: 1.7101 - val_accuracy: 0.2586 - val_loss: 1.7064
Epoch 3/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 72ms/step - accuracy: 0.2753 - loss: 1.6881 - val_accuracy: 0.2500 - val_loss: 1.7076
Epoch 4/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 78ms/step - accuracy: 0.2858 - loss: 1.6666 - val_accuracy: 0.2445 - val_loss: 1.7082
Epoch 5/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 70ms/step - accuracy: 0.3032 - loss: 1.6471 - val_accuracy: 0.2445 - val_loss: 1.7238
Epoch 6/10
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 71ms/step - accuracy: 0.3260 - loss: 1.6288 - val_accuracy: 0.2375 - val_loss: 1.7427
Epoch 7/10
[1m3

<keras.src.callbacks.history.History at 0x7a8a9c8e2b60>

In [9]:
# Evaluate on Test Data
test_loss, test_accuracy = model_s.evaluate(test_sequences, test_labels_one_hot)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}") # %24.6
# The paper performs with %23 accuracy on the test set while our model performs with %24.6

[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.1976 - loss: 1.7984
Test Loss: 1.777524471282959
Test Accuracy: 0.21704813838005066


In [10]:
# Train a logistic regression model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train the Logistic Regression model
lr_model = LogisticRegression(max_iter=2000, multi_class='ovr')
lr_model.fit(train_sequences, train_labels)

# Make predictions
train_preds = lr_model.predict(train_sequences)
val_preds = lr_model.predict(val_sequences)
test_preds = lr_model.predict(test_sequences)

# Evaluate the model
print(f'Training Accuracy: {accuracy_score(train_labels, train_preds)}') # %21.5
print(f'Validation Accuracy: {accuracy_score(val_labels, val_preds)}') # %18.2
print(f'Test Accuracy: {accuracy_score(test_labels, test_preds)}')  # %20.6

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training Accuracy: 0.21488571986716157
Validation Accuracy: 0.1822429906542056
Test Accuracy: 0.20599842146803474


In [12]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Train the SVM model
svm_model = LinearSVC(random_state=42)
svm_model.fit(train_sequences, train_labels)

# Make predictions
train_preds_svm = svm_model.predict(train_sequences)
val_preds_svm = svm_model.predict(val_sequences)
test_preds_svm = svm_model.predict(test_sequences)

# Evaluate the model
print(f'Training Accuracy (SVM): {accuracy_score(train_labels, train_preds_svm)}') # %21
print(f'Validation Accuracy (SVM): {accuracy_score(val_labels, val_preds_svm)}') # %19.9
print(f'Test Accuracy (SVM): {accuracy_score(test_labels, test_preds_svm)}') # 19.9


Training Accuracy (SVM): 0.21058800546981832
Validation Accuracy (SVM): 0.19937694704049844
Test Accuracy (SVM): 0.19889502762430938


