In [7]:
import torch
from torch import nn, tensor, randn
import pandas
# import nltk # See https://www.nltk.org/data.html
from nltk import tokenize
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [4]:
# Translate data from original CSV file into strings: one "d1" entry per output line.

data = pandas.read_csv("data/glados-portal2.original.csv", sep="|", index_col=0)["d1"]
# The context manager flushes and closes the handle when the block ends; without it
# buffered lines may not be on disk yet when a later cell re-opens the file for reading.
with open("data/glados-portal2.csv", "w") as file:
    for string in data:
        file.write(string + "\n")

In [5]:
def char_tokenize(text):
    """Split ``text`` into tokens with NLTK's ``CharTokenizer``.

    Args:
        text: String to tokenize.

    Returns:
        Whatever ``CharTokenizer().tokenize(text)`` returns for ``text``.
    """
    # Only `tokenize` is imported (`from nltk import tokenize`); the bare name
    # `nltk` is never bound in this notebook, so the tokenizer is reached
    # through `tokenize` to avoid a NameError at call time.
    return tokenize.simple.CharTokenizer().tokenize(text)
def word_tokenize(text):
    """Tokenize ``text`` with NLTK's ``WordPunctTokenizer`` and lowercase each token.

    Args:
        text: String to tokenize.

    Returns:
        List of lowercased token strings, in the order the tokenizer produced them.
    """
    pieces = tokenize.WordPunctTokenizer().tokenize(text)
    return [piece.lower() for piece in pieces]

# T (Time) -- context window size
def get_x(data, T, pos):
    """Return the input window: the ``T`` elements of ``data`` starting at ``pos``.

    Plain slicing semantics apply, so a window that runs past the end of
    ``data`` is silently truncated.
    """
    window = slice(pos, pos + T)
    return data[window]
def get_y(data, T, pos):
    """Return the target window: the ``T`` elements of ``data`` starting at ``pos + 1``.

    This is the input window of the same ``pos`` shifted one step to the right.
    Plain slicing semantics apply, so a window past the end is truncated.
    """
    start = pos + 1
    return data[start : start + T]
# get_x_y = lambda data, T, pos: data[pos : pos + T], data[pos + 1 : pos + T + 1]

# B -- batch size
def get_batch(data, B, T):
    """Sample ``B`` random (input, target) windows of length ``T`` from ``data``.

    Args:
        data: Indexable sequence whose slices can be passed to ``torch.stack``
            (assumed to be a 1-D tensor of token ids -- TODO confirm with caller).
        B: Number of windows in the batch.
        T: Length of each window.

    Returns:
        Tuple ``(batch_x, batch_y)`` of stacked windows; each row of ``batch_y``
        is the corresponding row of ``batch_x`` shifted one position to the right.

    Note:
        ``torch.randint`` rejects a non-positive upper bound, so ``len(data)``
        must be greater than ``T``.
    """
    # Start positions lie in [0, len(data) - T), so the target slice
    # (which ends at pos + T + 1) never runs past the end of `data`.
    positions = torch.randint(len(data) - T, (B,))
    batch_x = torch.stack([get_x(data, T, pos) for pos in positions])
    batch_y = torch.stack([get_y(data, T, pos) for pos in positions])
    # The batches were previously built and then discarded (implicit None return).
    return batch_x, batch_y

In [None]:
with open("data/glados-portal2.csv", "r") as f:
    tokens = word_tokenize(f.read())

# Sort the unique tokens: iteration order of a set of strings can differ between
# interpreter sessions (string hash randomization), which would assign different
# ids to the same token on every kernel restart.
vocab = sorted(set(tokens))
C = len(vocab)  # vocabulary size

# A list indexed by id doubles as the id -> token lookup.
id_to_token = vocab
token_to_id = { t:i for i, t in enumerate(vocab)}

torch.manual_seed(0)

class DigramLM(nn.Module):
    """Skeleton of a lookup-table language model.

    Holds a single ``len(vocab) x len(vocab)`` embedding table. No ``forward``
    is defined yet, so the module cannot be called on inputs.

    Args:
        vocab: Sized collection of vocabulary tokens; only its length is used.
    """

    def __init__(self, vocab):
        # `super` must be called to obtain the proxy object; `super.__init__()`
        # raised TypeError before nn.Module was ever initialised.
        super().__init__()
        # Size the table from the argument instead of a module-level global so
        # the class works for whatever vocabulary it is given.
        vocab_size = len(vocab)
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    # def forward(self, input, targets):

# class LSTMLM(nn.Module):

# class TransformerLM(nn.Module):


# embedding = nn.Embedding(len(token_set), 2)
# display(torch.LongTensor([1]))
# print(len(token_set))
# embed = embedding(torch.LongTensor([1]))
# # X = torch.tensor(dataX, dtype=torch.float32).reshape(n_patterns, seq_length, 1)
# # inputs = tensor()

# # ᵺ = 2
# input_size = len(token_set)
# hidden_size = 10
# # data_strings = file.readlines()
# layer_number = 2

# lstm = nn.LSTM(input_size, hidden_size, layer_number)

# print(lstm(randn(1, len(token_set))))

AttributeError: module 'torch.nn' has no attribute 'Model'

In [43]:
with open("data/glados-portal2.csv", "r") as f:
    text = f.read()

# Vocabulary layout: the two special tokens take ids 0 and 1, followed by the
# unique lowercased corpus tokens in sorted order.
tokens = word_tokenize(text.lower())
vocab = ['<PAD>', '<UNK>'] + sorted(set(tokens))
vocab_size = len(vocab)

# Forward and reverse lookups between tokens and their integer ids.
token_to_id = {token: index for index, token in enumerate(vocab)}
id_to_token = dict(enumerate(vocab))

torch.manual_seed(0)


def tokens_to_text(token_ids, id_to_token):
    """Render a sequence of token ids as a space-separated string.

    Args:
        token_ids: Iterable of ids to look up.
        id_to_token: Mapping with a ``.get`` method from id to token string.

    Returns:
        The looked-up tokens joined by single spaces; ids missing from the
        mapping are rendered as ``"<UNK>"``.
    """
    words = (id_to_token.get(token_id, "<UNK>") for token_id in token_ids)
    return " ".join(words)


class TextDataset(Dataset):
    """Sliding-window next-token dataset over a tokenized text.

    Sample ``i`` is the pair ``(token_ids[i : i + seq_length],
    token_ids[i + 1 : i + seq_length + 1])``, i.e. the target is the input
    shifted one token to the right.

    Args:
        text: Raw text; tokenized with ``word_tokenize``.
        token_to_id: Mapping from token to integer id. Must contain ``'<UNK>'``,
            which is used for tokens missing from the mapping.
        seq_length: Number of tokens in every input/target window.
    """

    def __init__(self, text, token_to_id, seq_length=20):
        unk_id = token_to_id['<UNK>']
        self.token_ids = [token_to_id.get(t, unk_id) for t in word_tokenize(text)]
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a text shorter than one window holds no samples, and a
        # negative value would make len() itself raise ValueError.
        return max(0, len(self.token_ids) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` LongTensor pair for window ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx = idx + size
        # Raising IndexError ends plain `for sample in dataset` iteration and
        # prevents silently returning truncated windows past the end.
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for TextDataset of length {size}")
        end = idx + self.seq_length
        input_seq = self.token_ids[idx:end]
        target_seq = self.token_ids[idx + 1:end + 1]
        return (
            torch.tensor(input_seq, dtype=torch.long),
            torch.tensor(target_seq, dtype=torch.long),
        )

# Wrap the corpus in sliding-window samples (default window length) and serve
# them as shuffled mini-batches of 32.
dataset = TextDataset(text=text, token_to_id=token_to_id)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)


In [46]:

class SimpleLSTM(nn.Module):
    """Token-level language model: embedding -> LSTM -> linear projection.

    Args:
        vocab_size: Number of distinct token ids; also the size of the output
            dimension produced by ``forward``.
        embedding_dim: Width of the token embeddings fed to the LSTM.
        hidden_size: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim=64, hidden_size=128):
        super().__init__()
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Run the model on a batch of token ids.

        Args:
            x: Token ids, batch dimension first (the LSTM is ``batch_first``).
            hidden: Optional LSTM state from a previous call; ``None`` lets the
                LSTM start from its default initial state.

        Returns:
            Tuple ``(logits, hidden)``: per-position scores over the vocabulary
            and the LSTM state after the last position.
        """
        embedded = self.embed(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        logits = self.fc(lstm_out)
        return logits, next_hidden
    
# Everything runs on the CPU; the model is moved there right after construction.
device = torch.device('cpu')
model = SimpleLSTM(vocab_size).to(device)
# Cross-entropy between predicted token scores and target token ids.
criterion = nn.CrossEntropyLoss()

In [None]:
# Loading a previously saved checkpoint.
# Requires 'text_lstm_model.pth' with the keys 'model_state', 'token_to_id' and
# 'id_to_token' to already exist on disk.
# NOTE(review): torch.load is pickle-based -- only load checkpoint files you trust.
# map_location remaps the stored tensors onto the device used in this notebook, so a
# checkpoint written from another device still loads here.
checkpoint = torch.load('text_lstm_model.pth', map_location=device)
model.load_state_dict(checkpoint['model_state'])
# Replace the in-memory vocab mappings with the ones the checkpoint was trained with.
token_to_id = checkpoint['token_to_id']
id_to_token = checkpoint['id_to_token']

In [47]:
# Training
# Adam over all model parameters with learning rate 0.001; train() reads this
# module-level `optimizer` rather than taking it as an argument.
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(model, dataloader, epochs=10):
    """Train ``model`` on ``dataloader`` and print the mean batch loss per epoch.

    Relies on the module-level ``optimizer``, ``criterion`` and ``device``.

    Args:
        model: Module returning ``(outputs, hidden)`` when called on a batch of inputs.
        dataloader: Iterable of ``(inputs, targets)`` batches; must support ``len()``.
        epochs: Number of full passes over ``dataloader``.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in dataloader:
            inputs, targets = (part.to(device) for part in batch)

            optimizer.zero_grad()
            logits, _ = model(inputs)
            # Flatten every dimension except the last so each position is scored
            # as an independent classification over the vocabulary.
            num_classes = logits.size(-1)
            flat_logits = logits.view(-1, num_classes)
            flat_targets = targets.view(-1)
            loss = criterion(flat_logits, flat_targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}')

# Run 20 training epochs over the dataloader; train() prints one mean-loss line per epoch.
train(model, dataloader, epochs=20)

Epoch 1/20, Loss: 5.2898
Epoch 2/20, Loss: 4.0992
Epoch 3/20, Loss: 3.3674
Epoch 4/20, Loss: 2.8076
Epoch 5/20, Loss: 2.3788
Epoch 6/20, Loss: 2.0358
Epoch 7/20, Loss: 1.7506
Epoch 8/20, Loss: 1.5136
Epoch 9/20, Loss: 1.3173
Epoch 10/20, Loss: 1.1527
Epoch 11/20, Loss: 1.0147
Epoch 12/20, Loss: 0.8983
Epoch 13/20, Loss: 0.7990
Epoch 14/20, Loss: 0.7151
Epoch 15/20, Loss: 0.6433
Epoch 16/20, Loss: 0.5824
Epoch 17/20, Loss: 0.5311
Epoch 18/20, Loss: 0.4875
Epoch 19/20, Loss: 0.4518
Epoch 20/20, Loss: 0.4217


In [53]:

# Text generation
def generate_text(model, start_text, token_to_id, id_to_token, max_length=50):
    """Sample a continuation of ``start_text`` from ``model``.

    The whole prompt is fed through the model first so the recurrent state
    reflects every prompt token; after that, one sampled token at a time is
    fed back in together with the carried-over state.

    Relies on the module-level ``word_tokenize`` and ``device``.

    Args:
        model: Module returning ``(scores, hidden)`` for ``(token_ids, hidden)``.
        start_text: Prompt; lowercased and tokenized with ``word_tokenize``.
        token_to_id: Mapping from token to id; must contain ``'<UNK>'``.
        id_to_token: Lookup from id back to token string.
        max_length: Number of tokens to sample after the prompt.

    Returns:
        The prompt tokens followed by the sampled tokens, joined by spaces.

    Raises:
        ValueError: If ``start_text`` yields no tokens.
    """
    model.eval()
    tokens = word_tokenize(start_text.lower())
    token_ids = [token_to_id.get(t, token_to_id['<UNK>']) for t in tokens]
    if not token_ids:
        raise ValueError("start_text must contain at least one token")

    hidden = None
    # First step consumes the full prompt; feeding only its last token (with an
    # empty state) would discard all earlier prompt context.
    step_input = list(token_ids)

    with torch.no_grad():
        for _ in range(max_length):
            input_tensor = torch.tensor([step_input]).to(device)
            output, hidden = model(input_tensor, hidden)

            # Scores for the last position of the single batch row.
            probs = torch.softmax(output[0, -1], dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
            token_ids.append(next_token)
            # Subsequent steps only need the new token; `hidden` carries the rest.
            step_input = [next_token]

    return ' '.join([id_to_token[t] for t in token_ids])

# Example usage: sample a continuation of the prompt "kill" and print it.
generated = generate_text(model, "kill", token_to_id, id_to_token)
print("\nСгенерированный текст:", generated)



Сгенерированный текст: kill you fast . with bullets . or neurotoxin . but if you ' re unqualified . impersonating a stalemate associate . i just added that to the list . it don ' t go anywhere . i ' d just finished building them before you had your , well ,


In [49]:
# Saving the model: weights plus both vocab mappings go into one checkpoint file
# so the same token ids can be restored together with the parameters.
torch.save(
    obj={
        'model_state': model.state_dict(),
        'token_to_id': token_to_id,
        'id_to_token': id_to_token,
    },
    f='text_lstm_model.pth',
)