In [1]:
import torch
import torch.nn as nn
import numpy as np
import logging
from torch.utils.data import Dataset, DataLoader
from utils import normalize, cleanTrumpData, Dataset_trump, id2lettre, string2code, EOS_IX
from textloader import TextDataset, collate_fn
from tp5 import RNN, GRU, LSTM, maskedCrossEntropy
from generate import generate, generate_beam

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)

In [2]:
with open("data/trump_full_speech.txt", 'r') as f:
    data = f.read()

In [3]:
cleanedData = cleanTrumpData(data)[:]

In [4]:
coefTrain = 0.8
nbTrain = int(len(cleanedData)*coefTrain)
trainData, testData = cleanedData[:nbTrain], cleanedData[nbTrain:]#Ici on sépare la données brut mais les phrases ayant des tailles différentes, on pourrait avoir une train/test ne respectant pas la proportion indiquer
BATCH_SIZE = 32
embedding = nn.Embedding(len(id2lettre), len(id2lettre))

In [5]:
num_epochs = 2
latent_size = 64
input_dim = len(id2lettre)
output_dim = len(id2lettre)
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = maskedCrossEntropy

In [6]:
def getOuputs(model, seqs):
    X = embedding(seqs[:, :-1])
    hidden_states = model(X.permute(1,0,2)).permute(1,0,2)
    return model.decode(hidden_states).view(-1, X.shape[2])

In [7]:
train_loader = DataLoader(TextDataset(trainData), collate_fn=collate_fn, batch_size=BATCH_SIZE, drop_last=True)
test_loader = DataLoader(TextDataset(testData), collate_fn=collate_fn, batch_size=BATCH_SIZE, drop_last=True)

In [8]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, sequences in enumerate(train_loader):

        optimizer.zero_grad()
        #import ipdb; ipdb.set_trace()
        outputs = getOuputs(model, sequences)
        y = sequences[:, 1:].reshape(-1)

        
        #import ipdb; ipdb.set_trace()
        train_loss = criterion(outputs, y)
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, sequences in enumerate(test_loader):
        with torch.no_grad():
            outputs = getOuputs(model, sequences)
            y = sequences[:, 1:].reshape(-1)

            test_loss = criterion(outputs, y)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 2.7300894260406494, test loss 2.787170171737671
Itérations 1: train loss 2.630636692047119, test loss 2.6956629753112793


In [9]:
generate(model, embedding, model.decode, EOS_IX, start="t", maxlen=0)

't'

In [10]:
generate_beam(model, embedding, model.decode, EOS_IX, 3, start="the", maxlen=3)

([[19, 6, 29], [19, 29, 21], [19, 29, 2]],
 [tensor(-6.1789, grad_fn=<AddBackward0>),
  tensor(-6.5342, grad_fn=<AddBackward0>),
  tensor(-6.5370, grad_fn=<AddBackward0>)])