In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
import os

def load_data(path):
    texts = []
    labels = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(path, label_type)
        for fname in os.listdir(dir_name):
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), 'r', encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(0 if label_type == 'neg' else 1)
    return texts, labels

train_texts, train_labels = load_data('aclImdb/train')
test_texts, test_labels = load_data('aclImdb/test')

--2024-10-15 06:56:19--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-10-15 06:56:22 (28.3 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def preprocess(text):
    text = re.sub(r"<.*>", "", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens

train_tokens = [preprocess(text) for text in train_texts]
test_tokens = [preprocess(text) for text in test_texts]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from collections import Counter

all_tokens = [token for tokens in train_tokens for token in tokens]
word_counts = Counter(all_tokens)
vocab = ['<PAD>', '<UNK>'] + [word for word, count in word_counts.items() if count >= 5]
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

vocab_size = len(vocab)
print(vocab_size)

21338


In [4]:
max_seq_len = 200

def tokens_to_indices(tokens_list, word_to_idx, max_seq_len):
    sequences = []
    for tokens in tokens_list:
        seq = [word_to_idx.get(token, word_to_idx['<UNK>']) for token in tokens]
        if len(seq) < max_seq_len:
            seq += [word_to_idx['<PAD>']] * (max_seq_len - len(seq))
        else:
            seq = seq[:max_seq_len]
        sequences.append(seq)
    return sequences

train_sequences = tokens_to_indices(train_tokens, word_to_idx, max_seq_len)
test_sequences = tokens_to_indices(test_tokens, word_to_idx, max_seq_len)

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        super().__init__()
        self.sequences = sequences
        self.labels = labels
    def __len__(self):
        return len(self.sequences)
    def __getitem__(self, idx):
        sequence = torch.tensor(self.sequences[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        return sequence, label

In [6]:
train_dataset = TextDataset(train_sequences, train_labels)
test_dataset = TextDataset(test_sequences, test_labels)

batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [7]:
import torch.nn as nn
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, cell) = self.lstm(embedded)
        out = self.fc(hidden[-1])
        out = self.sigmoid(out)
        return out.squeeze()

In [8]:
embed_size=128
hidden_size=128
output_size=1
num_layers=2
num_epochs=5
learning_rate = 0.001

model = SentimentLSTM(vocab_size, embed_size, hidden_size, output_size, num_layers)

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [9]:
model.train()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    total_loss = 0
    for sequences, labels in train_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(sequences)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}')

Epoch 1/5, Loss: 0.6933
Epoch 2/5, Loss: 0.6873
Epoch 3/5, Loss: 0.6522
Epoch 4/5, Loss: 0.6431
Epoch 5/5, Loss: 0.6722


In [10]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for sequences, labels in test_loader:
        sequences, labels = sequences.to(device), labels.to(device)
        outputs = model(sequences)
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print(f'테스트 정확도: {100 * correct / total:.2f}%')

테스트 정확도: 53.10%


In [17]:
def predict_sentiment(text):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokens = preprocess(text)
    seq = [word_to_idx.get(token, word_to_idx['<UNK>']) for token in tokens]
    if len(seq) < max_seq_len:
        seq += [word_to_idx['<PAD>']]*(max_seq_len - len(seq))
    else:
        seq = seq[:max_seq_len]
    sequence = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        output = model(sequence)
        predicted = '긍정' if output.item() >= 0.5 else '부정'
        print(f'입력 문장: {text}')
        print(f'예측 확률: {output.item():.4f}')
        print(f'예측 결과: {predicted}')

In [18]:
test_sentence = "This movie was fantastic! I really enjoyed it."
predict_sentiment(test_sentence)

입력 문장: This movie was fantastic! I really enjoyed it.
예측 확률: 0.4828
예측 결과: 부정
