In [22]:
import re
RANDOM_SEED = 577
import torch
import torch.nn as nn
import gensim.downloader as api
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import gensim.downloader as api

Using device: cuda


In [31]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

glove_vectors = api.load("glove-twitter-25")

def tokenize(text, max_len=25):
    tokens = text.split()
    if len(tokens) < max_len:
        tokens += ['<pad>'] * (max_len - len(tokens))
    return tokens[:max_len]

def preprocess_data(data, glove_vectors, label_encoder):
    texts = data['text'].apply(tokenize).tolist()
    texts = [[glove_vectors[token] if token in glove_vectors else np.zeros(25) for token in text] for text in texts]
    labels = label_encoder.fit_transform(data['emoji_id'])
    return np.array(texts), labels, label_encoder

In [32]:
label_encoder = LabelEncoder()

train_data = pd.read_csv("dataset/train_data.csv").dropna()
# train_data = pd.read_csv("dataset/train_data_augmented.csv").dropna()
validate_data = pd.read_csv("dataset/validate_data.csv").dropna()
test_data = pd.read_csv("dataset/test_data.csv").dropna()

train_texts, train_labels, label_encoder = preprocess_data(train_data, glove_vectors, label_encoder)
test_texts, test_labels, _ = preprocess_data(test_data, glove_vectors, label_encoder)
validate_texts, validate_labels, _ = preprocess_data(validate_data, glove_vectors, label_encoder)


train_dataset = TextDataset(train_texts, train_labels)
test_dataset = TextDataset(test_texts, test_labels)
validate_dataset = TextDataset(validate_texts, validate_labels)


train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)
validate_dataloader = DataLoader(validate_dataset, batch_size=128, shuffle=False)

In [33]:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        out = self.fc(hidden.squeeze(0))
        return out
    
class BiLSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat((hidden[0, :, :], hidden[1, :, :]), dim=1)
        out = self.fc(hidden)
        return out

input_dim = 25
hidden_dim = 64
output_dim = 100
model = LSTMClassifier(input_dim, hidden_dim, output_dim)
# model = BiLSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [34]:
def train(model, dataloader, criterion, optimizer, device):
    model.train()
    model.to(device)
    total_loss = 0

    for texts, labels in dataloader:
        texts = texts.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

In [35]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    model.to(device)
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for texts, labels in dataloader:
            texts = texts.to(device, dtype=torch.float)
            labels = labels.to(device, dtype=torch.long)

            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()

    accuracy = correct_predictions / len(dataloader.dataset)
    return total_loss / len(dataloader), accuracy

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 40

for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    validate_loss, validate_accuracy = evaluate(model, validate_dataloader, criterion, device)
    test_loss, test_accuracy = evaluate(model, test_dataloader, criterion, device)


    print(f"Epoch {epoch + 1}/{num_epochs}:")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f}, Validate Accuracy: {validate_accuracy:.4f} Test Accuracy: {test_accuracy:.4f}")

Epoch 1/40:
Train Loss: 3.9201
Test Loss: 3.7651, Validate Accuracy: 0.1235 Test Accuracy: 0.1250
Epoch 2/40:
Train Loss: 3.7370
Test Loss: 3.6615, Validate Accuracy: 0.1524 Test Accuracy: 0.1582
Epoch 3/40:
Train Loss: 3.6032
Test Loss: 3.5177, Validate Accuracy: 0.1698 Test Accuracy: 0.1677
Epoch 4/40:
Train Loss: 3.4843
Test Loss: 3.4082, Validate Accuracy: 0.1784 Test Accuracy: 0.1838
Epoch 5/40:
Train Loss: 3.4003
Test Loss: 3.3165, Validate Accuracy: 0.2024 Test Accuracy: 0.2049
Epoch 6/40:
Train Loss: 3.3089
Test Loss: 3.2555, Validate Accuracy: 0.2014 Test Accuracy: 0.2095
Epoch 7/40:
Train Loss: 3.2488
Test Loss: 3.2099, Validate Accuracy: 0.2070 Test Accuracy: 0.2144
Epoch 8/40:
Train Loss: 3.1964
Test Loss: 3.1627, Validate Accuracy: 0.2089 Test Accuracy: 0.2256
Epoch 9/40:
Train Loss: 3.1352
Test Loss: 3.0969, Validate Accuracy: 0.2224 Test Accuracy: 0.2378
Epoch 10/40:
Train Loss: 3.0692
Test Loss: 3.0528, Validate Accuracy: 0.2260 Test Accuracy: 0.2486
Epoch 11/40:
Train 

In [37]:
from sklearn.metrics import classification_report, f1_score

model.eval()
model.to(device)
total_loss = 0
correct_predictions = 0
true_labels = []
predicted_labels = []

with torch.no_grad():
    for texts, labels in test_dataloader:
        texts = texts.to(device, dtype=torch.float)
        labels = labels.to(device, dtype=torch.long)

        outputs = model(texts)
        loss = criterion(outputs, labels)
        _, predicted = torch.max(outputs, 1)

        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())


f1_score_report = classification_report(true_labels, predicted_labels)
print(f1_score(true_labels, predicted_labels, average="macro"))
print("F1 Score Report:")
print(f1_score_report)

0.1345540889515316
F1 Score Report:
              precision    recall  f1-score   support

           0       0.11      0.09      0.10        22
           1       0.36      0.64      0.46       380
           2       0.25      0.06      0.09        35
           3       0.21      0.28      0.24        32
           4       0.35      0.27      0.31        22
           5       0.38      0.47      0.42        43
           6       0.17      0.20      0.18         5
           7       0.27      0.34      0.30       182
           8       0.47      0.19      0.27        37
           9       0.29      0.41      0.34        75
          10       0.53      0.54      0.53        82
          11       0.00      0.00      0.00        10
          12       0.23      0.20      0.21        82
          13       0.20      0.18      0.19        11
          14       0.00      0.00      0.00        12
          15       0.41      0.39      0.40       117
          16       0.30      0.61      0.40  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
def predict(tweet, model, glove_vectors, label_encoder):
    model.eval()
    model.to(device)
    tokenized_tweet = tokenize(tweet)
    tweet_embedding = [glove_vectors[token] if token in glove_vectors else np.zeros(25) for token in tokenized_tweet]
    tweet_tensor = torch.tensor([tweet_embedding], dtype=torch.float)
    tweet_tensor = tweet_tensor.to(device, dtype=torch.float)
    with torch.no_grad():
        output = model(tweet_tensor)
        _, predicted = torch.max(output, 1)
        print(sum(output))
        label = label_encoder.inverse_transform(predicted.cpu().numpy())[0]
    
    return label

# Example usage:
tweet = "good night"
predicted_label = predict(tweet, model, glove_vectors, label_encoder)
print(f"The predicted label for the tweet is: {predicted_label}")


tensor([ 0.3881,  0.7737, -2.6321, -0.6916,  1.8788, -0.6636, -4.2960,  2.1215,
         1.8819, -1.4105,  3.3556, -3.8281,  2.1295, -2.4174, -0.4658,  1.7660,
         0.4057,  0.4085, -2.2496,  1.5592, -3.1648,  1.8142,  1.5714,  1.6155,
        -5.0158,  3.1899,  4.3539,  3.1509,  1.1585, -1.4840, -2.4338,  3.0269,
        -2.9716,  0.5610, -3.6026,  1.4237, -1.6494, -0.4749,  3.6958, -0.4219,
        -6.3038,  0.4216,  1.2911, -1.9801, -2.8276,  1.6085, -1.4649,  4.5820,
        -5.3860,  0.7679,  1.4205,  1.7213, -2.6229,  1.9492, -3.1688,  2.2182,
        -4.6649, -3.4121, -3.6325,  6.3724, -0.3269, -0.9522, -1.6883, -0.7527,
        -2.8364,  1.3061,  2.4469,  2.6759, -1.3992, -0.5384, -1.3752, -4.1693,
        -0.9802, -3.7932,  1.0751, -1.2906, -3.1309,  4.3691, -5.7382,  0.9096,
         0.0356, -0.6946, -2.3927,  0.4154, -1.5777, -3.4246, -0.7763, -3.1403,
        -4.2069, -0.9246, -2.3919,  0.9757, -1.3427,  0.7693, -4.5236, -3.0440,
        -2.7286, -1.3618, -3.4593, -2.39

  tweet_tensor = torch.tensor([tweet_embedding], dtype=torch.float)


In [9]:
emoji_map = {}
for i, j in zip(train_data["emoji"], train_data["emoji_id"]):
  if i not in emoji_map:
    emoji_map[i] = j

In [10]:
reversed_emoji_map = {value: key for key, value in emoji_map.items()}
reversed_emoji_map[predicted_label][1]

'🌙'

In [56]:
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score

# def flatten_embeddings(texts):
#     return np.array([np.array(text).flatten() for text in texts])

# flat_train_texts = flatten_embeddings(train_texts)
# flat_test_texts = flatten_embeddings(test_texts)

# svm_model = SVC(kernel='linear')
# svm_model.fit(flat_train_texts, train_labels)

# train_predictions = svm_model.predict(flat_train_texts)
# test_predictions = svm_model.predict(flat_test_texts)

# train_accuracy = accuracy_score(train_labels, train_predictions)
# test_accuracy = accuracy_score(test_labels, test_predictions)

# print(f"Train Accuracy: {train_accuracy:.4f}")
# print(f"Test Accuracy: {test_accuracy:.4f}")