In [1]:
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.10.0


In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [6]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle as pkl

# Constants
PATH = './'
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Load the train data
def load_train_data():
    train_bodies = pd.read_csv(PATH + "train_bodies.csv", encoding='utf-8')
    train_headlines = pd.read_csv(PATH + "train_stances.csv", encoding='utf-8')
    train_data_set = pd.merge(train_bodies, train_headlines, how='left', on='Body ID')
    stances = {
        'Stance': {
          'agree': 0,
          'disagree': 1,
          'discuss': 2,
          'unrelated': 3,
        }
    }
    train_data_set.replace(stances, inplace=True)
    
    # Remove rows with missing 'articleBody' or 'Headline'
    train_data_set.dropna(subset=['articleBody', 'Headline'], inplace=True)
    
    # Sampling
    data_length = 8909
    unrelated_resampled = resample(train_data_set.loc[train_data_set['Stance'] == 3], replace=False, n_samples=data_length, random_state=RANDOM_SEED)
    discuss_resampled = resample(train_data_set.loc[train_data_set['Stance'] == 2], replace=False, n_samples=data_length, random_state=RANDOM_SEED)
    agree_resampled = resample(train_data_set.loc[train_data_set['Stance'] == 0], replace=True, n_samples=data_length, random_state=RANDOM_SEED)
    disagree_resampled = resample(train_data_set.loc[train_data_set['Stance'] == 1], replace=True, n_samples=data_length, random_state=RANDOM_SEED)
  
    all_resampled = [unrelated_resampled, discuss_resampled, agree_resampled, disagree_resampled]
    result = pd.concat(all_resampled).sample(frac=1)
    
    return result

# Load the test data
def load_test_data():     
    test_bodies = pd.read_csv(PATH + "competition_test_bodies.csv", encoding='utf-8')
    test_headlines = pd.read_csv(PATH + 'competition_test_stances.csv', encoding='utf-8')
    test_data_set = pd.merge(test_bodies, test_headlines, how='left', on='Body ID')
    stances = {
        'Stance': {
          'agree': 0,
          'disagree': 1,
          'discuss': 2,
          'unrelated': 3,
        }
    }
    test_data_set.replace(stances, inplace=True)
    
    # Remove rows with missing 'articleBody' or 'Headline'
    test_data_set.dropna(subset=['articleBody', 'Headline'], inplace=True)
    
    return test_data_set 

# Prepare the data for Passive-Aggressive Classifier
def prepare_data(data_set, vectorizer=None):
    X = data_set['articleBody'].fillna('') + " " + data_set['Headline'].fillna('')
    y = data_set['Stance']
    
    # Convert text to numerical features using TF-IDF
    if vectorizer is None:
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
        X_features = vectorizer.fit_transform(X)
        # Save the vectorizer
        with open('vectorizer.pkl', 'wb') as handle:
            pkl.dump(vectorizer, handle, protocol=pkl.HIGHEST_PROTOCOL)
    else:
        X_features = vectorizer.transform(X)
    
    return X_features, y, vectorizer

if __name__ == '__main__':
    train_data = load_train_data()
    test_data = load_test_data()
    
    # Prepare training data
    X_train, y_train, vectorizer = prepare_data(train_data)
    
    # Prepare test data with the same vectorizer
    X_test, y_test, _ = prepare_data(test_data, vectorizer)
    
    # Encode labels
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)
    
    # Train the Passive-Aggressive Classifier
    pac = PassiveAggressiveClassifier(max_iter=100000, random_state=RANDOM_SEED)
    pac.fit(X_train, y_train_encoded)

    # Evaluate the model
    y_pred = pac.predict(X_test)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    
    # Save the model
    with open('pac_model.pkl', 'wb') as handle:
        pkl.dump(pac, handle, protocol=pkl.HIGHEST_PROTOCOL)
    
    # Save the label encoder
    with open('label_encoder.pkl', 'wb') as handle:
        pkl.dump(label_encoder, handle, protocol=pkl.HIGHEST_PROTOCOL)


  train_data_set.replace(stances, inplace=True)
  test_data_set.replace(stances, inplace=True)


Accuracy: 0.2135
