# File utils

In [1]:
import torch
import torch.nn as nn
import logging
from torch.utils.data import Dataset, DataLoader
from utils import normalize, cleanTrumpData, read_temps, fill_na, \
                Dataset_trumpOld, Dataset_trump, Dataset_temp, RNN, strs2code, id2lettre 
import string

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)

# TME classification

In [2]:
temp_train = read_temps("data/tempAMAL_train.csv").unsqueeze(2)
temp_test = read_temps("data/tempAMAL_test.csv").unsqueeze(2)

In [3]:
nbClasse = 5
longueurData = 30
BATCH_SIZE = 6
longueurSeq = 4

In [4]:
temp_train = temp_train[:longueurData, :nbClasse]
temp_test = temp_test[:longueurData, :nbClasse]

In [5]:
train_loader = DataLoader(Dataset_temp(temp_train, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_temp(temp_test, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)

In [6]:
num_epochs = 10
latent_size = 10
input_dim = 1
output_dim = nbClasse
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [7]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        hidden_states = model(sequences.permute(1,0,2))
        outputs = model.decode(hidden_states[-1])

        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()
        #writer.add_scalar('Loss/train', train_loss, epoch)
    

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():

            hidden_states = model(sequences.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            test_loss = criterion(outputs, labels)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 1.6772527694702148, test loss 1.6328250169754028
Itérations 1: train loss 1.6704071760177612, test loss 1.4179824590682983
Itérations 2: train loss 1.6533112525939941, test loss 1.5643876791000366
Itérations 3: train loss 1.5116451978683472, test loss 1.5273733139038086
Itérations 4: train loss 1.71176016330719, test loss 1.6397552490234375
Itérations 5: train loss 1.665434718132019, test loss 1.6382112503051758
Itérations 6: train loss 1.6335707902908325, test loss 1.6256200075149536
Itérations 7: train loss 1.5858124494552612, test loss 1.6263929605484009
Itérations 8: train loss 1.6088508367538452, test loss 1.613486647605896
Itérations 9: train loss 1.61981201171875, test loss 1.5955265760421753


# TME serie temporelle 

In [8]:
#TODO

# TME Generation

### Pré-traitement données trump

In [2]:
with open("data/trump_full_speech.txt", 'r') as f:
    data = f.read()

In [3]:
#cleanedData = cleanTrumpData(data)
cleanedData = cleanTrumpData(data).lower()

In [4]:
cleanedNormalizedData = normalize(cleanedData)
cleanedNormalizedData = cleanedNormalizedData[:100]#To have a little sample

In [5]:
coefTrain = 0.8
nbTrain = int(len(cleanedNormalizedData)*coefTrain)
trainData, testData = cleanedNormalizedData[:nbTrain], cleanedNormalizedData[nbTrain:]
BATCH_SIZE = 32

In [6]:
embedding = nn.Embedding(len(id2lettre), len(id2lettre))

In [9]:
num_epochs = 10
latent_size = 10
input_dim = len(id2lettre)
output_dim = len(id2lettre)
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [8]:
#Training loop with error
train_loader = DataLoader(Dataset_trump(trainData, None), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_trump(testData, None), shuffle=True, batch_size=BATCH_SIZE)

# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        X = embedding(strs2code(sequences))
        y = strs2code(labels).squeeze(1)

        hidden_states = model(X.permute(1,0,2))
        outputs = model.decode(hidden_states)
        train_loss = criterion(outputs.view(-1, outputs.shape[2]), y.view(-1))
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            X = embedding(strs2code(sequences))
            y = strs2code(labels).squeeze(1)

            hidden_states = model(X.permute(1,0,2))
            outputs = model.decode(hidden_states)
            test_loss = criterion(outputs.view(-1, outputs.shape[2]), y.view(-1))

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 3.3564438819885254, test loss 3.4521713256835938
Itérations 1: train loss 3.3164329528808594, test loss 3.449589490890503
Itérations 2: train loss 3.3680641651153564, test loss 3.422910690307617
Itérations 3: train loss 3.3876781463623047, test loss 3.400259017944336
Itérations 4: train loss 3.2536683082580566, test loss 3.4162790775299072
Itérations 5: train loss 3.2927563190460205, test loss 3.4094583988189697
Itérations 6: train loss 3.3133275508880615, test loss 3.3913044929504395
Itérations 7: train loss 3.2898683547973633, test loss 3.381716012954712
Itérations 8: train loss 3.2362546920776367, test loss 3.3980188369750977
Itérations 9: train loss 3.2678513526916504, test loss 3.384019136428833


In [10]:
#Training loop without error but without exploiting all result
train_loader = DataLoader(Dataset_trumpOld(trainData, None), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_trumpOld(testData, None), shuffle=True, batch_size=BATCH_SIZE)

# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        X = embedding(strs2code(sequences))
        y = strs2code(labels).squeeze(1)

        hidden_states = model(X.permute(1,0,2))
        outputs = model.decode(hidden_states[-1])
        
        train_loss = criterion(outputs, y)
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            X = embedding(strs2code(sequences))
            y = strs2code(labels).squeeze(1)

            hidden_states = model(X.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            test_loss = criterion(outputs, y)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 3.5792267322540283, test loss 3.2277398109436035
Itérations 1: train loss 3.3816404342651367, test loss 3.218677043914795
Itérations 2: train loss 3.29781174659729, test loss 3.2091362476348877
Itérations 3: train loss 3.5039656162261963, test loss 3.1998813152313232
Itérations 4: train loss 3.503018379211426, test loss 3.1919660568237305
Itérations 5: train loss 3.547041893005371, test loss 3.1848390102386475
Itérations 6: train loss 3.386291265487671, test loss 3.1769325733184814
Itérations 7: train loss 3.1281824111938477, test loss 3.1691927909851074
Itérations 8: train loss 3.317228078842163, test loss 3.1618258953094482
Itérations 9: train loss 3.4206831455230713, test loss 3.1546719074249268


In [11]:
#Génération
debut = "thank y"
nbGenere = 20
sm = nn.Softmax(dim=1)

xgens = []

for i in range(nbGenere):
    
    if(i==0):#Première fois on forward la sequence
        X = embedding(strs2code([debut]))
        hidden_states = model(X.permute(1,0,2))
        hgen = hidden_states[-1]
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    else:#Ensuite on génère en one step
        x = embedding(strs2code([xgen])).squeeze(0)
        hgen = model.one_step(x,hgen)
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    xgens.append(xgen)

In [12]:
"".join(xgens)

'tatatatatatatatatata'