In [1]:
import pandas as pd
from datasets import load_dataset

# Load the dataset from the TSV file
dataset = load_dataset('csv', data_files='./data/orientation/orientation-gb-train.tsv', delimiter='\t')

# Split the dataset into train and test sets (80-20 split).
# The split is shuffled, so a fixed seed is passed to keep the train/test
# membership identical across kernel restarts; without it every re-run
# evaluates on a different test set and the reported scores are not comparable.
split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class BasicNN(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    The input passes through a linear layer, a ReLU, and a second linear
    layer that emits one raw (unnormalised) score per class.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)    # input -> hidden
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)   # hidden -> class scores

    def forward(self, x):
        """Return the class scores computed for the batch ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Helper function to train and evaluate the neural network
def train_and_evaluate(train_vectors, train_labels, test_vectors, test_labels, input_size=None, hidden_size=50, num_classes=2, epochs=10, batch_size=16, lr=0.001, seed=None):
    """Train a ``BasicNN`` on the training data and score it on the test data.

    Args:
        train_vectors: Array-like of training feature vectors, indexed by
            sample along the first axis.
        train_labels: Integer class labels, one per training sample.
        test_vectors: Array-like of test feature vectors.
        test_labels: Integer class labels, one per test sample.
        input_size: Width of the model's input layer. When ``None`` it is
            taken from the last dimension of ``train_vectors``.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output classes.
        epochs: Number of full passes over the training data.
        batch_size: Mini-batch size used for both training and evaluation.
        lr: Learning rate passed to the Adam optimizer.
        seed: Optional integer. When given, ``torch.manual_seed(seed)`` is
            called first so that weight initialisation and batch shuffling
            start from a fixed state. Note that this reseeds PyTorch's
            global generator. Bit-exact repeatability on GPU is not
            guaranteed by seeding alone.

    Returns:
        The weighted-average F1 score of the predictions on the test data.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Convert data to PyTorch tensors and move them to the selected device
    train_vectors = torch.tensor(train_vectors, dtype=torch.float32).to(device)
    train_labels = torch.tensor(train_labels, dtype=torch.long).to(device)
    test_vectors = torch.tensor(test_vectors, dtype=torch.float32).to(device)
    test_labels = torch.tensor(test_labels, dtype=torch.long).to(device)

    # Size the input layer from the data unless the caller pinned it explicitly
    if input_size is None:
        input_size = train_vectors.shape[-1]

    # Create data loaders (only the training batches are shuffled)
    train_dataset = TensorDataset(train_vectors, train_labels)
    test_dataset = TensorDataset(test_vectors, test_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, loss function, and optimizer
    model = BasicNN(input_size, hidden_size, num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        for vectors, labels in train_loader:
            outputs = model(vectors)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for vectors, labels in test_loader:
            outputs = model(vectors)
            # Predicted class = index of the largest score in each row.
            # Gradients are already disabled here, so no ``.data`` detour is needed.
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds, average='weighted')
    return f1


In [3]:
# Pull the raw texts and their labels out of each split
train_texts, train_labels = (split_dataset['train'][column] for column in ('text', 'label'))
test_texts, test_labels = (split_dataset['test'][column] for column in ('text', 'label'))


In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Upper bound on the vocabulary size shared by both vectorizers. The fitted
# vocabulary can be smaller than this, so the network's input width is read
# from the resulting matrix rather than assumed to equal this cap.
MAX_FEATURES = 50000

# Bag-of-Words with max_features
bow_vectorizer = CountVectorizer(max_features=MAX_FEATURES)
train_vectors = bow_vectorizer.fit_transform(train_texts).toarray()
test_vectors = bow_vectorizer.transform(test_texts).toarray()
bow_f1 = train_and_evaluate(train_vectors, train_labels, test_vectors, test_labels, input_size=train_vectors.shape[1])
print(f'Bag-of-Words F1 Score: {bow_f1}')

# TF-IDF with max_features
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
train_vectors = tfidf_vectorizer.fit_transform(train_texts).toarray()
test_vectors = tfidf_vectorizer.transform(test_texts).toarray()
tfidf_f1 = train_and_evaluate(train_vectors, train_labels, test_vectors, test_labels, input_size=train_vectors.shape[1])
print(f'TF-IDF F1 Score: {tfidf_f1}')


Bag-of-Words F1 Score: 0.7554709447215134
TF-IDF F1 Score: 0.7409388745905656


In [30]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as ``float32`` components. Blank or
    whitespace-only lines (for example a stray trailing newline) are skipped
    instead of raising ``IndexError``.

    Args:
        glove_file: Path to the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each word to a 1-D ``numpy.ndarray`` of dtype
        ``float32``. If a word appears on several lines, the last one wins.
    """
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing to parse on an empty line; indexing it would fail.
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# Path to the GloVe vectors file (adjust to your local cache as needed)
glove_file = './.vector_cache/glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_file=glove_file)

# Tokenize and vectorize using GloVe
def glove_vectorize(texts, embeddings, embedding_dim=300):
    """Represent each text as the mean of its words' embedding vectors.

    Texts are tokenised with ``str.split()`` (whitespace only; no lowercasing
    or punctuation handling). Words missing from ``embeddings`` contribute a
    zero vector to the mean, so they are counted rather than skipped. A text
    with no tokens maps to an all-zero vector.

    Args:
        texts: Iterable of strings.
        embeddings: Mapping from word to its embedding vector.
        embedding_dim: Length of the embedding vectors; used for the zero
            vector and for the shape of an empty result.

    Returns:
        ``numpy.ndarray`` with one row per text and ``embedding_dim``
        columns. An empty ``texts`` yields shape ``(0, embedding_dim)``.
    """
    # One shared zero vector instead of allocating a new one per unknown word;
    # it is only read, never modified.
    zero_vector = np.zeros(embedding_dim)

    vectors = []
    for text in texts:
        word_vectors = [embeddings.get(word, zero_vector) for word in text.split()]
        vectors.append(np.mean(word_vectors, axis=0) if word_vectors else zero_vector)

    if not vectors:
        # np.array([]) would be 1-D; keep the result 2-D for downstream code.
        return np.zeros((0, embedding_dim))
    return np.array(vectors)

train_vectors = glove_vectorize(train_texts, glove_embeddings)
test_vectors = glove_vectorize(test_texts, glove_embeddings)
# Read the input width from the feature matrix so it cannot drift from the
# dimensionality of the embeddings that were actually loaded.
glove_f1 = train_and_evaluate(train_vectors, train_labels, test_vectors, test_labels, input_size=train_vectors.shape[1])
print(f'GloVe F1 Score: {glove_f1}')


GloVe F1 Score: 0.7009512004441185
