In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiments/val.csv
/kaggle/input/sentiments/train.csv
/kaggle/input/sentiments/test.csv


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from torch.nn import functional as F
from collections import Counter
from nltk.tokenize import word_tokenize

In [3]:
train_df = pd.read_csv('/kaggle/input/sentiments/train.csv')
val_df = pd.read_csv('/kaggle/input/sentiments/val.csv')
test_df = pd.read_csv('/kaggle/input/sentiments/test.csv')

def tokenize(text):
    return word_tokenize(text.lower())

train_texts = train_df['text'].tolist()
train_labels = train_df['label'].tolist()

val_texts = val_df['text'].tolist()
val_labels = val_df['label'].tolist()

test_texts = test_df['text'].tolist()
test_labels = test_df['label'].tolist()

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

In [4]:
all_texts = train_texts + val_texts + test_texts
tokenized_texts = [tokenize(text) for text in all_texts]

word_counts = Counter([word for sentence in tokenized_texts for word in sentence])
vocab = {word: idx+2 for idx, (word, _) in enumerate(word_counts.most_common())}
vocab['<PAD>'] = 0  # Padding token
vocab['<UNK>'] = 1  # Unknown token

def text_to_sequence(text, vocab):
    return [vocab.get(word, vocab['<UNK>']) for word in tokenize(text)]

train_sequences = [text_to_sequence(text, vocab) for text in train_texts]
val_sequences = [text_to_sequence(text, vocab) for text in val_texts]
test_sequences = [text_to_sequence(text, vocab) for text in test_texts]

def pad_sequences(sequences, max_len=100):
    return [seq[:max_len] + [0] * (max_len - len(seq)) if len(seq) < max_len else seq[:max_len] for seq in sequences]

train_sequences = pad_sequences(train_sequences)
val_sequences = pad_sequences(val_sequences)
test_sequences = pad_sequences(test_sequences)

X_train = torch.tensor(train_sequences, dtype=torch.long)
y_train = torch.tensor(train_labels, dtype=torch.long)
X_val = torch.tensor(val_sequences, dtype=torch.long)
y_val = torch.tensor(val_labels, dtype=torch.long)
X_test = torch.tensor(test_sequences, dtype=torch.long)
y_test = torch.tensor(test_labels, dtype=torch.long)


batch_size = 16
train_data = TensorDataset(X_train, y_train)
val_data = TensorDataset(X_val, y_val)
test_data = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)
test_loader = DataLoader(test_data, batch_size=batch_size)

In [5]:
class BiLSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size=100, hidden_size=128, num_classes=2):
        super(BiLSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.bilstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  
        
    def forward(self, x):
        x = self.embedding(x)  
        lstm_out, (hn, cn) = self.bilstm(x)  
        hidden_state = torch.cat((hn[0], hn[1]), dim=1)  
        out = self.fc(hidden_state)  
        return out

In [6]:
vocab_size = len(vocab)
model = BiLSTMModel(vocab_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [7]:
def train(model, train_loader, val_loader, criterion, optimizer, epochs=10):
    best_val_acc = 0.0
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.4f}")

        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                _, predicted = torch.max(outputs, 1)
                val_correct += (predicted == labels).sum().item()
                val_total += labels.size(0)

        val_acc = val_correct / val_total
        print(f"Validation Accuracy: {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), '/kaggle/working/best_model.pth')

train(model, train_loader, val_loader, criterion, optimizer, epochs=10)

Epoch 1, Loss: 0.3673, Train Accuracy: 0.8547
Validation Accuracy: 0.8689
Epoch 2, Loss: 0.2689, Train Accuracy: 0.8946
Validation Accuracy: 0.8873
Epoch 3, Loss: 0.2149, Train Accuracy: 0.9176
Validation Accuracy: 0.8888
Epoch 4, Loss: 0.1621, Train Accuracy: 0.9376
Validation Accuracy: 0.8910
Epoch 5, Loss: 0.1027, Train Accuracy: 0.9620
Validation Accuracy: 0.8802
Epoch 6, Loss: 0.0566, Train Accuracy: 0.9803
Validation Accuracy: 0.8789
Epoch 7, Loss: 0.0305, Train Accuracy: 0.9898
Validation Accuracy: 0.8805
Epoch 8, Loss: 0.0175, Train Accuracy: 0.9945
Validation Accuracy: 0.8846
Epoch 9, Loss: 0.0157, Train Accuracy: 0.9956
Validation Accuracy: 0.8767
Epoch 10, Loss: 0.0137, Train Accuracy: 0.9957
Validation Accuracy: 0.8816


In [8]:
model.load_state_dict(torch.load('/kaggle/working/best_model.pth'))
model.eval()

test_correct = 0
test_total = 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        test_correct += (predicted == labels).sum().item()
        test_total += labels.size(0)

test_acc = test_correct / test_total
print(f"Test Accuracy: {test_acc:.4f}")

  model.load_state_dict(torch.load('/kaggle/working/best_model.pth'))


Test Accuracy: 0.9267
