In [None]:
import random
import tensorflow as tf
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Session options: allow_growth makes TensorFlow enlarge its GPU memory
# allocation on demand. The options are built in one expression and then
# used to open a TF1-style session.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
sess = tf.compat.v1.Session(config=config)

In [None]:
# Seed for the notebook-level Python RNG so that anything drawn from `r`
# is repeatable after Restart Kernel -> Run All.
SEED = 42
r = random.Random(SEED)

# Vocabulary cap; passed as `max_features` to every text vectorizer.
lim_unigram = 5000
# Number of output units of the classifier (one per stance class).
target_size = 4
# Number of units in the hidden dense layer.
hidden_size = 100

# Training hyperparameters. Presumably consumed by the training loop, which
# is not part of the cells shown here -- TODO confirm.
train_keep_prob = 0.6
l2_alpha = 0.00001
learn_rate = 0.01
clip_ratio = 5
batch_size_train = 500
epochs = 90

In [None]:
def load_data(file_train_instances, file_train_bodies, file_test_instances, file_test_bodies):
    """Read the four CSV inputs of the notebook into DataFrames.

    Args:
        file_train_instances: Path of the training instances CSV.
        file_train_bodies: Path of the training bodies CSV.
        file_test_instances: Path of the test instances CSV.
        file_test_bodies: Path of the test bodies CSV.

    Returns:
        A 4-tuple of DataFrames in the same order as the arguments:
        ``(train_instances, train_bodies, test_instances, test_bodies)``.
    """
    csv_paths = (
        file_train_instances,
        file_train_bodies,
        file_test_instances,
        file_test_bodies,
    )
    # Files are read in argument order; the first unreadable one raises.
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

In [None]:
def concatenate_text(data, bodies):
    """Attach body columns to each instance row via a left join on 'Body ID'.

    'Body ID' is compared as a string on both sides so that the join works
    even when the two frames store the key with different dtypes. The
    conversion is done on copies: neither ``data`` nor ``bodies`` is modified,
    so the caller's raw frames keep their original 'Body ID' dtype.

    Args:
        data: DataFrame with a 'Body ID' column (one row per instance).
        bodies: DataFrame with a 'Body ID' column plus the body columns.

    Returns:
        A new DataFrame with the rows of ``data`` in their original order and
        the columns of ``bodies`` joined on; 'Body ID' is a string column.
        Rows whose 'Body ID' has no match in ``bodies`` get NaN in the joined
        columns.
    """
    # assign() returns a new frame, leaving the caller's column untouched.
    data_keyed = data.assign(**{'Body ID': data['Body ID'].astype(str)})
    bodies_keyed = bodies.assign(**{'Body ID': bodies['Body ID'].astype(str)})

    # Left join keeps every instance row, in order, even without a body.
    return data_keyed.merge(bodies_keyed, how='left', on='Body ID')

In [None]:
def pipeline_train(train_instances, train_bodies, lim_unigram=lim_unigram):
    """Fit the text vectorizers on the training data and build dense features.

    Headlines and bodies are vectorized independently, each with its own
    CountVectorizer and TfidfVectorizer capped at ``lim_unigram`` features.
    For each text field the dense count matrix and the dense TF-IDF matrix
    are placed side by side (counts first).

    Args:
        train_instances: DataFrame with 'Headline', 'Body ID' and 'Stance'.
        train_bodies: DataFrame with 'Body ID' and 'articleBody'.
        lim_unigram: ``max_features`` for every vectorizer.

    Returns:
        Tuple ``(train_set_headline, train_set_body, train_stances,
        bow_vectorizer_headline, bow_vectorizer_body,
        tfidf_vectorizer_headline, tfidf_vectorizer_body,
        feature_size_headline, feature_size_body)``. ``train_stances`` is the
        raw 'Stance' column as an array; each feature size is the column count
        of the matching feature matrix.
    """
    merged = concatenate_text(train_instances, train_bodies)

    # Cast to str so missing text does not break the vectorizers.
    headline_text = merged['Headline'].astype(str)
    body_text = merged['articleBody'].astype(str)

    bow_vectorizer_headline = CountVectorizer(max_features=lim_unigram)
    bow_vectorizer_body = CountVectorizer(max_features=lim_unigram)
    tfidf_vectorizer_headline = TfidfVectorizer(max_features=lim_unigram)
    tfidf_vectorizer_body = TfidfVectorizer(max_features=lim_unigram)

    # Fit each vectorizer on its field and densify: [counts, tf-idf].
    headline_parts = [
        vectorizer.fit_transform(headline_text).toarray()
        for vectorizer in (bow_vectorizer_headline, tfidf_vectorizer_headline)
    ]
    body_parts = [
        vectorizer.fit_transform(body_text).toarray()
        for vectorizer in (bow_vectorizer_body, tfidf_vectorizer_body)
    ]

    feature_size_headline = sum(part.shape[1] for part in headline_parts)
    feature_size_body = sum(part.shape[1] for part in body_parts)

    train_set_headline = np.hstack(headline_parts)
    train_set_body = np.hstack(body_parts)
    train_stances = merged['Stance'].values

    print("Train set headline shape:", train_set_headline.shape)
    print("Train set body shape:", train_set_body.shape)

    return (
        train_set_headline,
        train_set_body,
        train_stances,
        bow_vectorizer_headline,
        bow_vectorizer_body,
        tfidf_vectorizer_headline,
        tfidf_vectorizer_body,
        feature_size_headline,
        feature_size_body,
    )

In [None]:
def pipeline_test(test_instances, test_bodies, bow_vectorizer_headline, bow_vectorizer_body, tfidf_vectorizer_headline, tfidf_vectorizer_body):
    """Build dense test features with vectorizers that are already fitted.

    Mirrors the training layout: for each text field the dense count matrix
    comes first, followed by the dense TF-IDF matrix. The vectorizers are
    only used through ``transform``; they are not refitted.

    Args:
        test_instances: DataFrame with 'Headline' and 'Body ID'.
        test_bodies: DataFrame with 'Body ID' and 'articleBody'.
        bow_vectorizer_headline: Fitted count vectorizer for headlines.
        bow_vectorizer_body: Fitted count vectorizer for bodies.
        tfidf_vectorizer_headline: Fitted TF-IDF vectorizer for headlines.
        tfidf_vectorizer_body: Fitted TF-IDF vectorizer for bodies.

    Returns:
        Tuple ``(test_set_headline, test_set_body)`` of dense 2-D arrays.
    """
    def _dense_features(text, bow_vectorizer, tfidf_vectorizer):
        # Side-by-side dense matrix: counts first, then TF-IDF weights.
        return np.hstack([
            bow_vectorizer.transform(text).toarray(),
            tfidf_vectorizer.transform(text).toarray(),
        ])

    merged = concatenate_text(test_instances, test_bodies)

    test_set_headline = _dense_features(
        merged['Headline'].astype(str),
        bow_vectorizer_headline,
        tfidf_vectorizer_headline,
    )
    test_set_body = _dense_features(
        merged['articleBody'].astype(str),
        bow_vectorizer_body,
        tfidf_vectorizer_body,
    )

    print("Test set headline shape:", test_set_headline.shape)
    print("Test set body shape:", test_set_body.shape)

    return test_set_headline, test_set_body

In [None]:
def attention(inputs, attention_size, debug=False):
    """Attention pooling: collapse axis 1 of ``inputs`` into a weighted sum.

    Every position is projected with ``tanh(x . w_omega + b_omega)``, scored
    against the context vector ``u_omega``, the scores are softmax-normalised
    over the last axis of the score tensor, and the inputs are summed over
    axis 1 using those weights.

    Three new, unnamed ``tf.Variable`` objects are created on every call, in
    the order w_omega, b_omega, u_omega.

    Args:
        inputs: Tensor laid out as [B, T, D]; D is read from
            ``inputs.shape[-1]``.
        attention_size: Width of the intermediate projection.
        debug: When True, print the shape and values of the attention weights
            each time the output is evaluated. Off by default: the previous
            unconditional ``tf.print`` calls built ops that nothing depended
            on, so they never ran in graph mode, and in eager mode they would
            dump the whole weight tensor on every call.

    Returns:
        Tensor of shape [B, D]: the attention-weighted sum of ``inputs``.
    """
    input_dim = inputs.shape[-1]  # D value - size of the last input axis

    # Weight matrix that projects each input vector to attention_size.
    w_omega = tf.Variable(tf.random.normal([input_dim, attention_size], stddev=0.1))
    # Bias vector added to the projected input.
    b_omega = tf.Variable(tf.random.normal([attention_size], stddev=0.1))
    # Context vector used to compute the importance score of each position.
    u_omega = tf.Variable(tf.random.normal([attention_size], stddev=0.1))

    v = tf.tanh(tf.tensordot(inputs, w_omega, axes=1) + b_omega)
    vu = tf.tensordot(v, u_omega, axes=1, name='vu')  # [B, T, attention_size] * [attention_size] -> [B, T]
    alphas = tf.nn.softmax(vu, name='alphas')  # (B, T) normalised attention weights

    if debug:
        # tf.print returns an op while a graph is being built (None when
        # executing eagerly, where it prints immediately). In graph mode the
        # op only fires if something that gets evaluated depends on it, so
        # the weights are routed through an identity guarded by the print ops.
        print_ops = [
            tf.print("Alphas shape:", tf.shape(alphas)),
            tf.print("Alphas values:", alphas),
        ]
        print_ops = [op for op in print_ops if op is not None]
        if print_ops:
            with tf.control_dependencies(print_ops):
                alphas = tf.identity(alphas)

    # Weighted sum of the input vectors over axis 1.
    output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), axis=1)
    return output

In [None]:
def build_model(feature_size_headline, feature_size_body, hidden_size, attention_size, target_size, learning_rate):
    """Build the TF1-style stance classification graph.

    Headline and body features are each given a trailing axis of size 1,
    pooled by ``attention``, concatenated, passed through a ReLU dense layer
    with dropout and then through a dense output layer, which also has
    dropout applied. The loss is the summed sparse softmax cross-entropy and
    is minimised with Adam.

    NOTE(review): because each input is expanded to [batch, features, 1],
    ``attention`` sums over the feature axis and returns [batch, 1] per input,
    so the dense layers see only two values per example -- confirm that this
    is intended.

    Args:
        feature_size_headline: Width of the headline feature vector.
        feature_size_body: Width of the body feature vector.
        hidden_size: Units of the hidden dense layer.
        attention_size: Projection width handed to ``attention``.
        target_size: Units of the output layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        Tuple ``(headline_pl, body_pl, stances_pl, keep_prob_pl, logits_flat,
        loss, train_op)``. Feeding ``keep_prob_pl`` = 1.0 gives a dropout
        rate of 0.
    """
    v1 = tf.compat.v1

    # Graph inputs
    headline_pl = v1.placeholder(tf.float32, [None, feature_size_headline], 'headline')
    body_pl = v1.placeholder(tf.float32, [None, feature_size_body], 'body')
    stances_pl = v1.placeholder(tf.int64, [None], 'stances')
    keep_prob_pl = v1.placeholder(tf.float32)

    # Add the trailing axis to both inputs first, then pool headline before
    # body, so variables are created in the same order as before.
    headline_seq, body_seq = (
        tf.expand_dims(features_pl, -1) for features_pl in (headline_pl, body_pl)
    )
    pooled = [attention(seq, attention_size) for seq in (headline_seq, body_seq)]
    combined_representation = tf.concat(pooled, axis=1)

    # Hidden layer: dense -> ReLU -> dropout
    hidden_linear = v1.layers.dense(combined_representation, hidden_size)
    hidden_activated = tf.nn.relu(hidden_linear)
    hidden_layer = tf.nn.dropout(hidden_activated, rate=1 - keep_prob_pl)

    # Output layer: dense -> dropout (dropout is applied to the logits too)
    logits_linear = v1.layers.dense(hidden_layer, target_size)
    logits_flat = tf.nn.dropout(logits_linear, rate=1 - keep_prob_pl)

    # Summed (not averaged) per-example cross-entropy
    per_example_loss = v1.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_flat, labels=stances_pl
    )
    loss = tf.reduce_sum(per_example_loss)

    # Optimizer
    train_op = v1.train.AdamOptimizer(learning_rate).minimize(loss)

    return headline_pl, body_pl, stances_pl, keep_prob_pl, logits_flat, loss, train_op

In [None]:
def load_model(sess, checkpoint_dir):
    """Restore the latest checkpoint found in ``checkpoint_dir`` into ``sess``.

    If the restore fails with ``NotFoundError`` (for example because the
    variable names in the graph do not match those in the checkpoint), the
    error is printed, any variables that are still uninitialised are
    initialised, and a warning is printed. This keeps the original
    best-effort behaviour of not raising, but the caller can now detect the
    failure through the return value.

    Args:
        sess: Session whose graph holds the variables to restore.
        checkpoint_dir: Directory searched for the latest checkpoint.

    Returns:
        True when the checkpoint was restored, False when the restore failed
        with ``NotFoundError``.

    Raises:
        ValueError: If ``checkpoint_dir`` contains no checkpoint.
    """
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if not latest_checkpoint:
        raise ValueError(f"No valid checkpoint found in the directory: {checkpoint_dir}")

    saver = tf.compat.v1.train.Saver()
    try:
        saver.restore(sess, latest_checkpoint)
    except tf.errors.NotFoundError as e:
        print(f"Error loading checkpoint: {e}")

        # report_uninitialized_variables() evaluates to an array of variable
        # NAMES (bytes), not variable objects. Testing that array for truth
        # is ambiguous, and variables_initializer() needs the variables
        # themselves, so map the names back to the graph's variables first.
        uninitialized_names = {
            name.decode('utf-8') if isinstance(name, bytes) else str(name)
            for name in sess.run(tf.compat.v1.report_uninitialized_variables())
        }
        graph_variables = (
            sess.graph.get_collection(tf.compat.v1.GraphKeys.GLOBAL_VARIABLES)
            + sess.graph.get_collection(tf.compat.v1.GraphKeys.LOCAL_VARIABLES)
        )
        uninitialized_vars = [
            var for var in graph_variables if var.op.name in uninitialized_names
        ]
        if uninitialized_vars:
            sess.run(tf.compat.v1.variables_initializer(uninitialized_vars))
            print("Initialized missing variables.")

        print("WARNING: checkpoint weights were NOT loaded; variables keep "
              "their current (non-checkpoint) values.")
        return False

    print("Model restored from", latest_checkpoint)
    return True


In [None]:
if __name__ == '__main__':
    mode = 'load'  # or any other mode you have
    model_dir = '/content/model'
    attention_size = 50
    # hidden_size and target_size come from the hyperparameter cell; the
    # learning rate reuses that cell's value instead of re-declaring 0.01.
    learning_rate = learn_rate

    # Load and process data
    file_train_instances = "train_stances.csv"
    file_train_bodies = "train_bodies.csv"
    file_test_instances = "competition_test_stances_unlabeled.csv"
    file_test_bodies = "test_bodies.csv"

    raw_train, raw_train_bodies, raw_test, raw_test_bodies = load_data(file_train_instances, file_train_bodies, file_test_instances, file_test_bodies)
    train_set_headline, train_set_body, train_stances, bow_vectorizer_headline, bow_vectorizer_body, tfidf_vectorizer_headline, tfidf_vectorizer_body, feature_size_headline, feature_size_body = pipeline_train(raw_train, raw_train_bodies, lim_unigram=lim_unigram)
    test_set_headline, test_set_body = pipeline_test(raw_test, raw_test_bodies, bow_vectorizer_headline, bow_vectorizer_body, tfidf_vectorizer_headline, tfidf_vectorizer_body)

    if mode == 'load':
        # Build the model in its own fresh graph. Building it in the shared
        # default graph means every re-run of this cell adds another copy of
        # each variable under a suffixed name (e.g. `dense_1/kernel`); those
        # names no longer match the checkpoint, the restore fails, and the
        # predictions come from freshly initialised weights.
        # NOTE(review): assumes the checkpoint itself was written from a
        # freshly built graph -- confirm against the training run.
        inference_graph = tf.Graph()
        # Reuse the GPU options from the session-config cell (allow_growth).
        with inference_graph.as_default(), tf.compat.v1.Session(graph=inference_graph, config=config) as sess:
            headline_pl, body_pl, stances_pl, keep_prob_pl, logits_flat, loss, train_op = build_model(feature_size_headline, feature_size_body, hidden_size, attention_size, target_size, learning_rate)
            sess.run(tf.compat.v1.global_variables_initializer())  # Initialize all variables first
            load_model(sess, model_dir)

            # Predict (keep_prob 1.0 -> dropout rate 0)
            test_feed_dict = {headline_pl: test_set_headline, body_pl: test_set_body, keep_prob_pl: 1.0}
            test_pred = sess.run(logits_flat, feed_dict=test_feed_dict)
            print("Predictions:", test_pred)


Train set headline shape: (11998, 6568)
Train set body shape: (11998, 10000)
Test set headline shape: (558, 6568)
Test set body shape: (558, 10000)


  hidden_layer = tf.nn.dropout(tf.nn.relu(tf.compat.v1.layers.dense(combined_representation, hidden_size)), rate=1-keep_prob_pl)
  logits_flat = tf.nn.dropout(tf.compat.v1.layers.dense(hidden_layer, target_size), rate=1-keep_prob_pl)


Error loading checkpoint: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

Graph execution error:

Detected at node 'save_3/RestoreV2' defined at (most recent call last):
    File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    File "/usr/lib/pyth

  if uninitialized_vars:


Predictions: [[ 0.00085893 -0.00212668  0.00698408 -0.0041001 ]
 [ 0.00052901 -0.00132071  0.0043053  -0.00253585]
 [ 0.00079664 -0.00196361  0.00650147 -0.00380023]
 ...
 [ 0.0017212  -0.0042304   0.01426833 -0.00826148]
 [ 0.00030932 -0.00080613  0.00255376 -0.00151637]
 [ 0.0007027  -0.00171776  0.0058007  -0.00335982]]


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Predicted class id per test row = column index of the largest logit.
predicted_stances = np.asarray(test_pred).argmax(axis=1)

In [None]:
# Load the labelled test stances (also provides Body IDs and headlines).
# NOTE(review): the predictions were computed from
# "competition_test_stances_unlabeled.csv" while the labels are read from
# this file; only the row COUNT is checked below -- confirm both files list
# the same rows in the same order.
test_stances_df = pd.read_csv('competition_test_stances_subset.csv')

# Map the stances to integers; labels missing from the mapping become NaN
stance_to_int = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}
test_stances_df['Stance'] = test_stances_df['Stance'].map(stance_to_int)

# Extract the true stances, Body IDs and headlines
true_test_stances = test_stances_df['Stance'].values
body_ids = test_stances_df['Body ID'].values
headline = test_stances_df['Headline'].values

# Ensure you have the correct number of predictions
assert len(predicted_stances) == len(test_stances_df), "Mismatch in number of predictions and test stances"

# Pair labels with predictions and drop rows whose label could not be mapped.
# dropna() returns a new frame rather than mutating in place.
result_df = pd.DataFrame({
    'True Stance': true_test_stances,
    'Predicted Stance': predicted_stances
}).dropna()

# Report how many rows were excluded instead of discarding them silently.
n_dropped = len(test_stances_df) - len(result_df)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) whose stance label is not in stance_to_int")

# NaN labels upcast the column to float; cast back to integer class ids now
# that the NaN rows are gone.
true_test_stances_clean = result_df['True Stance'].astype(int).values
predicted_stances_clean = result_df['Predicted Stance'].astype(int).values


# Compute accuracy
accuracy = accuracy_score(true_test_stances_clean, predicted_stances_clean)
print("Accuracy on the test set:", accuracy)

Accuracy on the test set: 0.17625899280575538
