In [1]:
# Notebook setup: library imports, the seed constant and the compute device.
import re
# NOTE(review): RANDOM_SEED is only defined here; nothing in this cell seeds
# torch or numpy with it — confirm it is applied further down the notebook.
RANDOM_SEED = 577
import torch
import torch.nn as nn
import gensim.downloader as api
# Use the GPU when torch reports CUDA as available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
# NOTE(review): repeats the gensim.downloader import made earlier in this cell;
# harmless, but redundant.
import gensim.downloader as api

Using device: cuda


In [2]:
class TextDataset(Dataset):
    """Map-style dataset that pairs each text entry with its label.

    ``texts`` and ``labels`` are stored exactly as passed in and are indexed
    in parallel. The dataset length is taken from ``texts`` alone; no check is
    made that ``labels`` has the same number of entries.
    """

    def __init__(self, texts, labels):
        self.texts, self.labels = texts, labels

    def __len__(self):
        n_samples = len(self.texts)
        return n_samples

    def __getitem__(self, idx):
        # Look up the sample first, then its label, and hand both back as a pair.
        sample = self.texts[idx]
        target = self.labels[idx]
        return sample, target

# Word vectors loaded through gensim's downloader API under the model name
# "glove-twitter-25"; passed to preprocess_data as the token -> vector lookup.
# NOTE(review): the name suggests 25-dimensional vectors, matching the
# np.zeros(25) fallback used in preprocess_data — confirm.
glove_vectors = api.load("glove-twitter-25")

def tokenize(text, max_len=25):
    """Split ``text`` on whitespace and force the result to ``max_len`` tokens.

    Longer inputs are cut off after ``max_len`` tokens; shorter inputs are
    right-padded with the literal token ``'<pad>'``.

    Args:
        text: String to split (``str.split()`` with no separator).
        max_len: Target number of tokens.

    Returns:
        A list of tokens, truncated or padded as described above.
    """
    words = text.split()[:max_len]
    shortfall = max_len - len(words)
    if shortfall > 0:
        words.extend(['<pad>'] * shortfall)
    return words

def preprocess_data(data, glove_vectors, label_encoder, fit=None):
    """Convert one DataFrame split into embedded token sequences and integer labels.

    Every entry of ``data['text']`` is tokenized with ``tokenize`` and each
    token is replaced by its vector from ``glove_vectors``. Tokens missing from
    ``glove_vectors`` are represented by an all-zero vector.

    The label encoder is fitted at most once. Refitting it on every split (as
    an unconditional ``fit_transform`` would) lets the class -> integer mapping
    differ between train, validation and test whenever their label sets differ,
    and leaves the shared encoder fitted on whichever split was processed last.

    Args:
        data: DataFrame with a ``'text'`` column and an ``'emoji_id'`` column.
        glove_vectors: Lookup supporting ``token in glove_vectors`` and
            ``glove_vectors[token]``. If it exposes ``vector_size`` that value
            sets the width of the zero vector; otherwise 25 is used.
        label_encoder: Encoder providing ``fit_transform`` and ``transform``
            (e.g. ``sklearn.preprocessing.LabelEncoder``).
        fit: ``True`` to (re)fit the encoder on this split, ``False`` to only
            transform with the existing mapping. The default ``None`` fits only
            when the encoder has no ``classes_`` attribute yet, i.e. on the
            first split it sees.

    Returns:
        A tuple ``(texts, labels, label_encoder)`` where ``texts`` is the
        embedded sequences as a NumPy array, ``labels`` is the encoded
        ``'emoji_id'`` column, and ``label_encoder`` is the same encoder object
        that was passed in.

    Raises:
        ValueError: Propagated from ``label_encoder.transform`` when this split
            contains a label the already-fitted encoder has never seen.
    """
    if fit is None:
        # scikit-learn encoders gain ``classes_`` once fitted; use it to tell
        # the first (training) call apart from later evaluation-split calls.
        fit = not hasattr(label_encoder, "classes_")

    # Width of the out-of-vocabulary placeholder; fall back to the original 25.
    dim = getattr(glove_vectors, "vector_size", 25)

    token_lists = data['text'].apply(tokenize).tolist()
    texts = [
        [glove_vectors[token] if token in glove_vectors else np.zeros(dim) for token in tokens]
        for tokens in token_lists
    ]

    if fit:
        labels = label_encoder.fit_transform(data['emoji_id'])
    else:
        labels = label_encoder.transform(data['emoji_id'])
    return np.array(texts), labels, label_encoder

In [6]:
# One encoder object is handed to preprocess_data for every split.
label_encoder = LabelEncoder()

# Read the three CSV splits and drop rows that contain missing values.
# Training uses the augmented file; the non-augmented alternative is
# "./dataset/train_data.csv".
train_data, validate_data, test_data = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        "./dataset/train_data_augmented.csv",
        "./dataset/validate_data.csv",
        "./dataset/test_data.csv",
    )
)

# Embed each split; the call order (train, test, validate) is kept as-is.
train_texts, train_labels, label_encoder = preprocess_data(
    train_data, glove_vectors, label_encoder
)
test_texts, test_labels, _ = preprocess_data(
    test_data, glove_vectors, label_encoder
)
validate_texts, validate_labels, _ = preprocess_data(
    validate_data, glove_vectors, label_encoder
)

# Wrap the arrays so they can be batched.
train_dataset = TextDataset(texts=train_texts, labels=train_labels)
test_dataset = TextDataset(texts=test_texts, labels=test_labels)
validate_dataset = TextDataset(texts=validate_texts, labels=validate_labels)

# Only the training loader shuffles; evaluation loaders keep file order.
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=128)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=128)
validate_dataloader = DataLoader(dataset=validate_dataset, shuffle=False, batch_size=128)

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

def flatten_embeddings(texts):
    """Flatten every sample in ``texts`` to 1-D and stack the results.

    Args:
        texts: Iterable of array-likes, one per sample.

    Returns:
        A NumPy array built from the per-sample flattened rows.
    """
    rows = []
    for sample in texts:
        rows.append(np.asarray(sample).ravel())
    return np.array(rows)

# Flatten each sample's embedding matrix into a single feature row.
flat_train_texts, flat_test_texts = (
    flatten_embeddings(split) for split in (train_texts, test_texts)
)

# Linear-kernel SVM fitted on the flattened training features.
svm_model = SVC(kernel='linear')
svm_model.fit(flat_train_texts, train_labels)

# Predict on the data the model was fitted on and on the held-out test split.
train_predictions, test_predictions = (
    svm_model.predict(features) for features in (flat_train_texts, flat_test_texts)
)

train_accuracy, test_accuracy = (
    accuracy_score(truth, predicted)
    for truth, predicted in (
        (train_labels, train_predictions),
        (test_labels, test_predictions),
    )
)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

Train Accuracy: 0.8732
Test Accuracy: 0.1848


In [8]:
from sklearn.metrics import classification_report, f1_score
# Per-class precision/recall/F1 table for the SVM's test-set predictions.
f1_score_report = classification_report(test_labels, test_predictions)

# Emit the macro-averaged F1 first, then the heading and the full table,
# one item per line.
print(
    f1_score(test_labels, test_predictions, average="macro"),
    "F1 Score Report:",
    f1_score_report,
    sep="\n",
)

0.0916912014217708
F1 Score Report:
              precision    recall  f1-score   support

           0       0.09      0.09      0.09        22
           1       0.32      0.45      0.37       380
           2       0.14      0.09      0.11        35
           3       0.17      0.16      0.16        32
           4       0.06      0.05      0.05        22
           5       0.30      0.23      0.26        43
           6       0.06      0.20      0.10         5
           7       0.22      0.20      0.21       182
           8       0.06      0.03      0.04        37
           9       0.21      0.17      0.19        75
          10       0.28      0.30      0.29        82
          11       0.07      0.10      0.08        10
          12       0.19      0.18      0.19        82
          13       0.14      0.18      0.16        11
          14       0.11      0.08      0.10        12
          15       0.28      0.23      0.25       117
          16       0.24      0.23      0.23  