# File utils

In [1]:
import torch
import torch.nn as nn
import logging
from torch.utils.data import Dataset, DataLoader
from utils import normalize, cleanTrumpData, read_temps, \
                Dataset_trump, Dataset_tempClassif, Dataset_tempSerie, RNN, \
                id2lettre, string2code

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)

# TME classification

In [2]:
temp_train = read_temps("data/tempAMAL_train.csv").unsqueeze(2)
temp_test = read_temps("data/tempAMAL_test.csv").unsqueeze(2)

In [3]:
nbClasse = 5
longueurData = 30
BATCH_SIZE = 6
longueurSeq = 4

In [4]:
temp_train = temp_train[:longueurData, :nbClasse]
temp_test = temp_test[:longueurData, :nbClasse]

In [5]:
train_loader = DataLoader(Dataset_tempClassif(temp_train, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_tempClassif(temp_test, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)

In [6]:
num_epochs = 10
latent_size = 10
input_dim = 1
output_dim = nbClasse
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [7]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()
        
        hidden_states = model(sequences.permute(1,0,2))
        outputs = model.decode(hidden_states[-1])

        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()
        #writer.add_scalar('Loss/train', train_loss, epoch)
    

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():

            hidden_states = model(sequences.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            test_loss = criterion(outputs, labels)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 1.6130918264389038, test loss 1.761473298072815
Itérations 1: train loss 1.7357286214828491, test loss 1.5339568853378296
Itérations 2: train loss 1.6905523538589478, test loss 1.6691385507583618
Itérations 3: train loss 1.5939311981201172, test loss 1.660255789756775
Itérations 4: train loss 1.5693974494934082, test loss 1.6154125928878784
Itérations 5: train loss 1.708225131034851, test loss 1.5710887908935547
Itérations 6: train loss 1.5808273553848267, test loss 1.6105331182479858
Itérations 7: train loss 1.5961437225341797, test loss 1.5861501693725586
Itérations 8: train loss 1.5846524238586426, test loss 1.6154108047485352
Itérations 9: train loss 1.6296206712722778, test loss 1.6009403467178345


# TME serie temporelle 

In [14]:
temp_train = read_temps("data/tempAMAL_train.csv").unsqueeze(2)
temp_test = read_temps("data/tempAMAL_test.csv").unsqueeze(2)

In [15]:
nbClasse = 1
longueurData = 33
BATCH_SIZE = 6
longueurSeq = 4

temp_train = temp_train[:longueurData, :nbClasse]
temp_test = temp_test[:longueurData, :nbClasse]

In [16]:
train_loader = DataLoader(Dataset_tempSerie(temp_train, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
test_loader = DataLoader(Dataset_tempSerie(temp_test, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE, drop_last=True)

In [17]:
model = RNN(latent_size, input_dim, output_dim, act_decode=None)

In [18]:
num_epochs = 1
latent_size = 5
input_dim = 1
output_dim = 1
lr=1e-4

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.MSELoss()

In [19]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()
        #import ipdb; ipdb.set_trace()
        X = sequences[:, :-1, :]
        y = sequences[:, 1:, :]

        hidden_states = model(X.permute(1,0,2))
        outputs = model.decode(hidden_states)
        
        train_loss = criterion(outputs, y.permute(1,0,2))
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            X = sequences[:, :-1, :]
            y = sequences[:, 1:, :]

            hidden_states = model(X.permute(1,0,2))
            outputs = model.decode(hidden_states)
            test_loss = criterion(outputs, y.permute(1,0,2))

        #writer.add_scalar('Loss/test', test_loss, epoch)
    #if(epoch%100==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 81193.7578125, test loss 86993.65625


# TME Generation

### Pré-traitement données trump

In [20]:
with open("data/trump_full_speech.txt", 'r') as f:
    data = f.read()

In [21]:
cleanedData = cleanTrumpData(data)
cleanedNormalizedData = string2code(cleanedData)
cleanedNormalizedData = cleanedNormalizedData[:300]#To have a little sample

In [22]:
coefTrain = 0.8
nbTrain = int(len(cleanedNormalizedData)*coefTrain)
trainData, testData = cleanedNormalizedData[:nbTrain], cleanedNormalizedData[nbTrain:]
BATCH_SIZE = 64

In [23]:
embedding = nn.Embedding(len(id2lettre), len(id2lettre))

In [24]:
num_epochs = 1
latent_size = 64
input_dim = len(id2lettre)
output_dim = len(id2lettre)
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [25]:
def getOuputs(model, seqs):
    X = embedding(seqs[:, :-1])
    hidden_states = model(X.permute(1,0,2)).permute(1,0,2)
    return model.decode(hidden_states).view(-1, X.shape[2])

In [26]:
train_loader = DataLoader(Dataset_trump(trainData, None), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_trump(testData, None), shuffle=True, batch_size=BATCH_SIZE)

# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, _) in enumerate(train_loader):

        optimizer.zero_grad()
        
        outputs = getOuputs(model, sequences)
        y = sequences[:, 1:].reshape(-1)

        
        
        #import ipdb; ipdb.set_trace()
        train_loss = criterion(outputs, y)
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            outputs = getOuputs(model, sequences)
            y = sequences[:, 1:].reshape(-1)

            test_loss = criterion(outputs, y)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 3.319438934326172, test loss 3.282474994659424


In [36]:
#generation
start = "thank y"
nbGenere = 20
sm = nn.LogSoftmax(dim=1)

xgens = []

for i in range(nbGenere):
    
    if(i==0):#Première fois on forward la sequence
        X = embedding(string2code(start).unsqueeze(0))
        hidden_states = model(X.permute(1,0,2))
        hgen = hidden_states[-1]
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    else:#Ensuite on génère en one step
        x = embedding(string2code(xgen))
        hgen = model.one_step(x,hgen)
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    xgens.append(xgen)

In [37]:
"".join(xgens)

'iqrzeeeeeeeeeeeeeeee'