In [None]:
import random
import tensorflow as tf
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Shared random generator (unseeded) and hyper-parameters used by the cells below.
r = random.Random()

# Feature extraction: forwarded as `max_features` to each vectorizer.
lim_unigram = 5_000

# Network shape: hidden layer width and output layer width.
hidden_size = 100
target_size = 4

# Optimisation and regularisation settings.
learn_rate = 1e-2
l2_alpha = 1e-5
train_keep_prob = 0.6
clip_ratio = 5

# Training schedule.
batch_size_train = 500
epochs = 90

In [None]:
def load_data(file_train_instances, file_train_bodies, file_test_instances, file_test_bodies):
    """Read the four CSV inputs into pandas DataFrames.

    Args:
        file_train_instances: Path of the training instances CSV.
        file_train_bodies: Path of the training bodies CSV.
        file_test_instances: Path of the test instances CSV.
        file_test_bodies: Path of the test bodies CSV.

    Returns:
        A 4-tuple of DataFrames, in the same order as the arguments.
    """
    csv_paths = (
        file_train_instances,
        file_train_bodies,
        file_test_instances,
        file_test_bodies,
    )
    # Files are read one after another, in argument order.
    return tuple(pd.read_csv(csv_path) for csv_path in csv_paths)

In [None]:
def concatenate_text(data, bodies):
    """Attach each instance's body text and build a combined text column.

    The input frames are left untouched: the 'Body ID' string conversion is
    done on copies, so calling this function does not change the dtype of
    the caller's DataFrames.

    Args:
        data: DataFrame with at least 'Headline' and 'Body ID' columns.
        bodies: DataFrame with at least 'Body ID' and 'articleBody' columns.

    Returns:
        A new DataFrame: `data` left-merged with `bodies` on 'Body ID'
        (compared as strings), plus a 'combined' column holding
        "<Headline> <articleBody>".
    """
    # Normalise the join key to str on copies rather than on the caller's frames.
    data_keyed = data.assign(**{'Body ID': data['Body ID'].astype(str)})
    bodies_keyed = bodies.assign(**{'Body ID': bodies['Body ID'].astype(str)})

    # Left merge keeps every instance row, in the original instance order.
    merged = data_keyed.merge(bodies_keyed, how='left', on='Body ID')

    # astype(str) turns a missing headline/body into the literal text 'nan'.
    merged['combined'] = merged['Headline'].astype(str) + ' ' + merged['articleBody'].astype(str)
    return merged

In [None]:
def pipeline_train(train_instances, train_bodies, lim_unigram=lim_unigram):
    """Fit the three vectorizers on the training text and build the feature matrix.

    Args:
        train_instances: Training instances DataFrame (needs 'Headline',
            'Body ID' and 'Stance' columns).
        train_bodies: Training bodies DataFrame (needs 'Body ID' and
            'articleBody' columns).
        lim_unigram: `max_features` limit given to every vectorizer.

    Returns:
        Tuple of (train_set, train_stances, bow_vectorizer, tfreq_vectorizer,
        tfidf_vectorizer, feature_size). `train_set` is the dense horizontal
        concatenation of the three feature matrices and `feature_size` is its
        column count.
    """
    merged = concatenate_text(train_instances, train_bodies)
    corpus = merged['combined']

    # NOTE(review): bow and tfreq are configured identically and fitted on the
    # same text, so their output columns are duplicates of each other.
    bow_vectorizer = CountVectorizer(max_features=lim_unigram)
    tfreq_vectorizer = CountVectorizer(max_features=lim_unigram)
    tfidf_vectorizer = TfidfVectorizer(max_features=lim_unigram)

    # Fit each vectorizer in turn: bow, tfreq, tfidf.
    fitted_matrices = [
        vectorizer.fit_transform(corpus)
        for vectorizer in (bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer)
    ]

    feature_size = sum(matrix.shape[1] for matrix in fitted_matrices)

    # Densify and stack side by side.
    train_set = np.concatenate([matrix.toarray() for matrix in fitted_matrices], axis=1)
    train_stances = merged['Stance'].values

    print("Train set shape:", train_set.shape)
    print("Feature size:", feature_size)

    return (
        train_set,
        train_stances,
        bow_vectorizer,
        tfreq_vectorizer,
        tfidf_vectorizer,
        feature_size,
    )

In [None]:
def pipeline_test(test_instances, test_bodies, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer):
    """Build the test feature matrix with already-fitted vectorizers.

    Args:
        test_instances: Test instances DataFrame (needs 'Headline' and
            'Body ID' columns).
        test_bodies: Test bodies DataFrame (needs 'Body ID' and
            'articleBody' columns).
        bow_vectorizer: Fitted vectorizer producing the first feature group.
        tfreq_vectorizer: Fitted vectorizer producing the second feature group.
        tfidf_vectorizer: Fitted vectorizer producing the third feature group.

    Returns:
        Dense array with the three feature groups concatenated column-wise.
    """
    corpus = concatenate_text(test_instances, test_bodies)['combined']

    # transform() only: the vocabularies come from the training fit.
    feature_groups = [
        vectorizer.transform(corpus)
        for vectorizer in (bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer)
    ]
    test_set = np.concatenate([group.toarray() for group in feature_groups], axis=1)

    print("Test set shape:", test_set.shape)

    return test_set

In [None]:
def build_model(feature_size, hidden_size, target_size, learning_rate):
    """Add a one-hidden-layer classifier to the current default TF1 graph.

    Args:
        feature_size: Width of the input feature vectors.
        hidden_size: Number of units in the hidden dense layer.
        target_size: Number of units in the output dense layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        Tuple of (features_pl, stances_pl, keep_prob_pl, logits_flat, loss,
        train_op).
    """
    # Inputs: features, integer labels, and the dropout keep probability.
    features_pl = tf.compat.v1.placeholder(tf.float32, [None, feature_size], 'features')
    stances_pl = tf.compat.v1.placeholder(tf.int64, [None], 'stances')
    keep_prob_pl = tf.compat.v1.placeholder(tf.float32)

    # Hidden layer: dense -> ReLU -> dropout.
    hidden_dense = tf.compat.v1.layers.dense(features_pl, hidden_size)
    hidden_activated = tf.nn.relu(hidden_dense)
    hidden_layer = tf.nn.dropout(hidden_activated, rate=1-keep_prob_pl)

    # Output layer: dense -> dropout (dropout is applied to the logits as well).
    output_dense = tf.compat.v1.layers.dense(hidden_layer, target_size)
    logits_flat = tf.nn.dropout(output_dense, rate=1-keep_prob_pl)

    # Per-example cross entropy, summed (not averaged) over the batch.
    per_example_loss = tf.compat.v1.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_flat, labels=stances_pl
    )
    loss = tf.reduce_sum(per_example_loss)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate)
    train_op = optimizer.minimize(loss)

    return features_pl, stances_pl, keep_prob_pl, logits_flat, loss, train_op

In [None]:
def load_model(sess, checkpoint_dir):
    """Restore the latest checkpoint in `checkpoint_dir` into `sess`.

    If the restore fails with `tf.errors.NotFoundError`, the error is printed
    and any variables that are still uninitialized are initialized, so the
    session stays usable. In that case the model weights do NOT come from the
    checkpoint, and the function returns False so callers can tell.

    Args:
        sess: A `tf.compat.v1.Session` whose graph already contains the model.
        checkpoint_dir: Directory searched for the most recent checkpoint.

    Returns:
        True if the checkpoint was restored, False if the restore failed and
        the fallback initialization path ran instead.

    Raises:
        ValueError: If no checkpoint is found in `checkpoint_dir`.
    """
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if not latest_checkpoint:
        raise ValueError(f"No valid checkpoint found in the directory: {checkpoint_dir}")

    saver = tf.compat.v1.train.Saver()
    try:
        saver.restore(sess, latest_checkpoint)
    except tf.errors.NotFoundError as e:
        print(f"Error loading checkpoint: {e}")

        # report_uninitialized_variables() yields variable *names* (bytes),
        # while variables_initializer() needs Variable objects, so map the
        # names back onto the graph's variables before initializing.
        uninitialized_names = {
            name.decode('utf-8') if isinstance(name, bytes) else str(name)
            for name in sess.run(tf.compat.v1.report_uninitialized_variables())
        }
        candidate_vars = tf.compat.v1.global_variables() + tf.compat.v1.local_variables()
        missing_vars = [
            var for var in candidate_vars
            if var.name.split(':')[0] in uninitialized_names
        ]
        # Test the list, not a NumPy array, to avoid an ambiguous truth value.
        if missing_vars:
            sess.run(tf.compat.v1.variables_initializer(missing_vars))
            print("Initialized missing variables.")
        return False

    print("Model restored from", latest_checkpoint)
    return True


In [None]:
# Main script execution
if __name__ == '__main__':
    mode = 'load'  # or any other mode you have
    model_dir = '/content/model'
    hidden_size = 100
    target_size = 4
    learning_rate = 0.01

    # Load and process data
    file_train_instances = "train_stances.csv"
    file_train_bodies = "train_bodies.csv"
    file_test_instances = "test_stances_unlabeled.csv"
    file_test_bodies = "test_bodies.csv"

    raw_train, raw_train_bodies, raw_test, raw_test_bodies = load_data(file_train_instances, file_train_bodies, file_test_instances, file_test_bodies)
    train_set, train_stances, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer, feature_size = pipeline_train(raw_train, raw_train_bodies, lim_unigram=lim_unigram)
    test_set = pipeline_test(raw_test, raw_test_bodies, bow_vectorizer, tfreq_vectorizer, tfidf_vectorizer)

    if mode == 'load':
        # Build the model in a fresh graph on every run. Building into the
        # global default graph means re-running this cell adds a second copy
        # of the layers under suffixed names (e.g. dense_2), and the Saver
        # then asks the checkpoint for variable names it does not contain.
        with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
            features_pl, stances_pl, keep_prob_pl, logits_flat, loss, train_op = build_model(feature_size, hidden_size, target_size, learning_rate)
            sess.run(tf.compat.v1.global_variables_initializer())  # Initialize all variables first
            load_model(sess, model_dir)

            # Predict with dropout disabled (keep probability 1.0).
            test_feed_dict = {features_pl: test_set, keep_prob_pl: 1.0}
            test_pred = sess.run(logits_flat, feed_dict=test_feed_dict)
            print("Predictions:", test_pred)


Train set shape: (11998, 15000)
Feature size: 15000
Test set shape: (25413, 15000)


  hidden_layer = tf.nn.dropout(tf.nn.relu(tf.compat.v1.layers.dense(features_pl, hidden_size)), rate=1-keep_prob_pl)
  logits_flat = tf.nn.dropout(tf.compat.v1.layers.dense(hidden_layer, target_size), rate=1-keep_prob_pl)


Error loading checkpoint: Restoring from checkpoint failed. This is most likely due to a Variable name or other graph key that is missing from the checkpoint. Please ensure that you have not altered the graph expected based on the checkpoint. Original error:

Graph execution error:

Detected at node 'save/RestoreV2' defined at (most recent call last):
    File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    File "/usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    File "/usr/local/lib/python3.10/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    File "/usr/local/lib/python3.10/dist-packages/ipykernel/kernelapp.py", line 619, in start
    File "/usr/local/lib/python3.10/dist-packages/tornado/platform/asyncio.py", line 195, in start
    File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
    File "/usr/lib/python

  if uninitialized_vars:


Predictions: [[-1.8189484  -0.12561679  0.8685674  -0.13777548]
 [-0.5439312   0.28486365  0.6298088   0.15852582]
 [-1.6489872   0.2266714   0.44347236  0.0439118 ]
 ...
 [-3.5816772   1.0491668   3.0996      0.7338665 ]
 [-2.872682    1.0878494   1.0877881  -0.7959989 ]
 [-2.5751283   1.6066628   1.9447032   0.515922  ]]


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [None]:
# Pick, for every test row, the index of the largest logit.
predicted_stances = np.asarray(test_pred).argmax(axis=1)

In [None]:
# Load the labelled test stances file to get Body IDs, headlines and true stances
test_stances_df = pd.read_csv('competition_test_stances.csv')

# Map the stances to integers; labels not in the mapping become NaN
stance_to_int = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}
test_stances_df['Stance'] = test_stances_df['Stance'].map(stance_to_int)

# Extract the true stances and Body IDs
true_test_stances = test_stances_df['Stance'].values
body_ids = test_stances_df['Body ID'].values
headline = test_stances_df['Headline'].values

# Ensure you have the correct number of predictions
assert len(predicted_stances) == len(test_stances_df), "Mismatch in number of predictions and test stances"

predictions_df = pd.DataFrame({
    'Body ID': body_ids,
    'Predicted Stance': predicted_stances,
    'True Stance': true_test_stances,
    'Headline': headline
})

# Export the DataFrame to a CSV file in the working directory. The previous
# target was the filesystem root ('/...'), which is only writable when the
# kernel runs as root.
output_file = 'predictions_with_body_ids.csv'
predictions_df.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


Predictions saved to /predictions_with_body_ids.csv


In [None]:
# Derive a NaN-free frame instead of dropping rows from `predictions_df` in
# place, so the frame built (and exported) by the previous cell is unchanged
# and this cell can be re-run safely.
predictions_clean = predictions_df.dropna()

# Extract the true and predicted stances after removing NaNs
true_test_stances_clean = predictions_clean['True Stance'].values
predicted_stances_clean = predictions_clean['Predicted Stance'].values

# Compute accuracy
accuracy = accuracy_score(true_test_stances_clean, predicted_stances_clean)
print("Accuracy on the test set:", accuracy)

Accuracy on the test set: 0.20473907358243848


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

# Uses `true_test_stances_clean` and `predicted_stances_clean` from the accuracy cell.
# Stance labels, listed in the order of their integer codes (0..3)
stance_labels = ['agree', 'disagree', 'discuss', 'unrelated']

# The cleaned arrays contain no NaN; cast to int so labels compare as class ids
# (the true-stance column may be float if the label mapping produced NaN earlier).
y_true = np.asarray(true_test_stances_clean).astype(int)
y_pred = np.asarray(predicted_stances_clean).astype(int)

# Pin the label set so the matrix is always 4x4, even if a class never occurs
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(stance_labels))))

# Convert the confusion matrix to a DataFrame for better readability
cm_df = pd.DataFrame(cm, index=stance_labels, columns=stance_labels)

# Calculate overall accuracy
overall_accuracy = accuracy_score(y_true, y_pred)

# Calculate accuracy per class; a class with no true samples gets NaN
# instead of triggering a division by zero
row_totals = cm.sum(axis=1)
accuracy_per_class = np.divide(
    cm.diagonal(),
    row_totals,
    out=np.full(len(stance_labels), np.nan),
    where=row_totals > 0,
)

# Add a totals row and accuracy column
cm_df['Overall'] = cm_df.sum(axis=1)
cm_df['% Accuracy'] = accuracy_per_class * 100
cm_df.loc['Overall'] = cm_df.sum()
cm_df.at['Overall', '% Accuracy'] = overall_accuracy * 100

# Display the confusion matrix
print(cm_df)

# Export the confusion matrix to a CSV file in the working directory
# ('/mnt/data' does not exist in this environment and made to_csv fail)
cm_df.to_csv('confusion_matrix.csv')

# Optionally, display the confusion matrix with ace_tools when that module is available
try:
    import ace_tools as tools
except ImportError:
    tools = None
if tools is not None:
    tools.display_dataframe_to_user(name="Confusion Matrix", dataframe=cm_df)


           agree  disagree  discuss  unrelated  Overall  % Accuracy
agree        1.0     115.0   1673.0      108.0   1897.0    0.052715
disagree     0.0      31.0    628.0       25.0    684.0    4.532164
discuss      2.0     225.0   3960.0      259.0   4446.0   89.068826
unrelated    1.0     791.0  16243.0     1175.0  18210.0    6.452499
Overall      4.0    1162.0  22504.0     1567.0  25237.0   20.473907


OSError: Cannot save file into a non-existent directory: '/mnt/data'