# File utils

Questions:
1) Pour l'entrainement du RNN pour la génération (tp4), on utilise des séquences fixes. Qu'est-ce que votre dataset vous renvoie comme X et y? 
Car de mon coté, mon dataset va par exemple me donner X="thank yo" et y="u". Mais de doit-on pas plutot renvoyer X="thank you" et pas de y puis entrainer le modèle en prenant:
- X="t" et y="h" puis 
- X="th" et y="a" puis
- X="tha" et y="n" etc ?

on va par exemple me donner la séquence "thank yo" et la taget associé sera "u". C'est ce que vous faites? (pcq je crois qu'on peut pour une séquence "thank you" , on peut faire autant d'entrainement que de longueur de séquence genre on prend "t" pour prédire "h" puis "th" pour prédire "a" puis "tha" pour prédire "n" etc, jusqu'au botu de la séquence?)

Vu le code du tp5, le loader peut ne renvoyer que un X (et pas un couple X,y) donc peut etre qu'il faut re,voyer une sequence et faire l'apprentissage sur toutes les sous-sequences...

In [2]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import logging
import csv
from torch.utils.data import Dataset, DataLoader
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)


def fill_na(mat):
    ix,iy = np.where(np.isnan(mat))
    for i,j in zip(ix,iy):
        if np.isnan(mat[i+1,j]):
            mat[i,j]=mat[i-1,j]
        else:
            mat[i,j]=(mat[i-1,j]+mat[i+1,j])/2.
    return mat


def read_temps(path):
    """Lit le fichier de températures"""
    data = []
    with open(path, "rt") as fp:
        reader = csv.reader(fp, delimiter=',')
        next(reader)
        for row in reader:
            if not row[1].replace(".","").isdigit():
                continue
            data.append([float(x) if x != "" else float('nan') for x in row[1:]])
    return torch.tensor(fill_na(np.array(data)), dtype=torch.float)

In [26]:
class RNN(nn.Module):
    #  TODO:  Implémenter comme décrit dans la question 1
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act_encode = torch.tanh
        self.act_decode = torch.tanh

        # Network parameters
        self.linearX = nn.Linear(input_dim, latent_dim, bias=True)
        self.linearH = nn.Linear(latent_dim, latent_dim, bias=False)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)
        

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        return self.act_encode(self.linearX(x) + self.linearH(h))

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = []
        res.append(self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)))

        for i in range(1,length):
            res.append(self.one_step(x[i], res[i-1]))

        return torch.stack(res)

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.act_decode(self.linearD(h))
    

In [30]:
class GRU(nn.Module):
    
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        self.sigmoid = torch.sigmoid
        self.tanh = torch.tanh


        # Network parameters
        self.linearZ = nn.Linear(input_dim+latent_dim, latent_dim, bias=False)
        self.linearR = nn.Linear(input_dim+latent_dim, input_dim+latent_dim, bias=False)
        self.linearH = nn.Linear(input_dim+latent_dim, latent_dim, bias=False)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)
        
        

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        concatHX = torch.cat((x, h), 1)
        zt = self.sigmoid(self.linearZ(concatHX))
        rt = self.sigmoid(self.linearR(concatHX))
        ht = (1-zt)*h + zt* self.tanh(self.linearH(rt*concatHX))
        return ht

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = []
        res.append(self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)))

        for i in range(1,length):
            res.append(self.one_step(x[i], res[i-1]))

        return torch.stack(res)

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.tanh(self.linearD(h))
    

In [55]:
class LSTMv1(nn.Module):
    
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        self.sigmoid = torch.sigmoid
        self.tanh = torch.tanh
        
        self.ct = torch.zeros((BATCH_SIZE, latent_dim))


        # Network parameters
        self.linearF = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        self.linearI = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        self.linearC = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        self.linearO = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        concatHX = torch.cat((x, h), 1)
        ft = self.sigmoid(self.linearF(concatHX))
        it = self.sigmoid(self.linearI(concatHX))
        newCt = ft*self.ct.clone() + it*self.tanh(self.linearC(concatHX))
        #self.ct = ft*self.ct.clone() + it*self.tanh(self.linearC(concatHX))
        ot = self.sigmoid(self.linearO(concatHX))
        ht = ot*self.tanh(newCt)
        self.ct = newCt
        
        return ht

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = []
        res.append(self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)))

        for i in range(1,length):
            res.append(self.one_step(x[i], res[i-1]))

        return torch.stack(res)

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.tanh(self.linearD(h))
    

In [60]:
class LSTM(nn.Module):
    
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        
        self.sigmoid = torch.sigmoid
        self.tanh = torch.tanh
        
        self.cts = [torch.zeros((BATCH_SIZE, latent_dim))]


        # Network parameters
        self.linearF = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        self.linearI = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        self.linearC = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        self.linearO = nn.Linear(input_dim+latent_dim, latent_dim, bias=True)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        concatHX = torch.cat((x, h), 1)
        ft = self.sigmoid(self.linearF(concatHX))
        it = self.sigmoid(self.linearI(concatHX))
        ct = ft*self.cts[-1] + it*self.tanh(self.linearC(concatHX))
        #self.ct = ft*self.ct.clone() + it*self.tanh(self.linearC(concatHX))
        ot = self.sigmoid(self.linearO(concatHX))
        ht = ot*self.tanh(ct)
        
        self.cts.append(ct)
        
        return ht

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        #delete all cts
        #self.cts = [self.cts[-1]]
        
        #forward
        length, batch, dim = x.shape
        res = []
        res.append(self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)))

        for i in range(1,length):
            res.append(self.one_step(x[i], res[i-1]))

        return torch.stack(res)

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.tanh(self.linearD(h))
    

# TME classification

In [5]:
class Dataset_temp(Dataset):
    def __init__(self, data, target, lenght=50):
        self.data = data
        self.lenght = lenght
        self.size = self.data.shape[0]-self.lenght+1

    def __getitem__(self, index):
        col = index//self.size
        lin = index%self.size
        return (self.data[lin:lin+self.lenght, col], col)

    def __len__(self):
        return self.size*self.data.shape[1]

In [414]:
temp_train = read_temps("data/tempAMAL_train.csv").unsqueeze(2)
temp_test = read_temps("data/tempAMAL_test.csv").unsqueeze(2)

In [415]:
nbClasse = 5
longueurData = 30
BATCH_SIZE = 6
longueurSeq = 4

In [416]:
temp_train = temp_train[:longueurData, :nbClasse]
temp_test = temp_test[:longueurData, :nbClasse]

In [417]:
train_loader = DataLoader(Dataset_temp(temp_train, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_temp(temp_test, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)

In [418]:
num_epochs = 10
latent_size = 10
input_dim = 1
output_dim = nbClasse
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [419]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        hidden_states = model(sequences.permute(1,0,2))
        outputs = model.decode(hidden_states[-1])

        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()
        #writer.add_scalar('Loss/train', train_loss, epoch)
    

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():

            hidden_states = model(sequences.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            test_loss = criterion(outputs, labels)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 1.9284852743148804, test loss 1.5937219858169556
Itérations 1: train loss 1.7714735269546509, test loss 1.7347369194030762
Itérations 2: train loss 1.7616195678710938, test loss 1.414864420890808
Itérations 3: train loss 1.8163965940475464, test loss 1.6045713424682617
Itérations 4: train loss 1.6932960748672485, test loss 1.6150871515274048
Itérations 5: train loss 1.6077475547790527, test loss 1.4935122728347778
Itérations 6: train loss 1.5200608968734741, test loss 1.5941352844238281
Itérations 7: train loss 1.7681688070297241, test loss 1.613354206085205
Itérations 8: train loss 1.571211338043213, test loss 1.8035459518432617
Itérations 9: train loss 1.6167024374008179, test loss 1.371878743171692


# TME Generation

### Code donnée

In [6]:
import string
import unicodedata

In [7]:
#LETTRES = string.ascii_letters + string.punctuation+string.digits+' '
#LETTRES = string.ascii_letters+' '
LETTRES = string.ascii_letters[:26]+"."+' '
id2lettre = dict(zip(range(1, len(LETTRES)+1), LETTRES))
id2lettre[0] = ''
lettre2id = dict(zip(id2lettre.values(), id2lettre.keys()))

def normalize(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if c in LETTRES)

def string2code(s):
    return torch.tensor([lettre2id[c] for c in normalize(s)])

def code2string(t):
    if(type(t)!=list):
        t = t.tolist()
    return ''.join(id2lettre[i] for i in t)

def str2code(s):
    return [lettre2id[c] for c in s]

def strs2code(ss):
    return torch.LongTensor([str2code(s) for s in ss])

### Pré-traitement données trump

In [8]:
import re

In [9]:
def cleanTrumpData(s):
    tmp = re.sub("\[[^]]+\]", "", s) #delete non vocan words as [applause]
    tmp = re.sub("[.?!]", ".", tmp)#replace end of phrase by .
    tmp = re.sub(":\s*pmurT\s*\.", ":%.", tmp[::-1]) #reverse string and replace trump by %
    tmp = re.sub(":[^.%]+?\.", ":@.", tmp) # place all no trump speaker by @
    tmp = re.sub("^\s*Trump", "%", tmp[::-1]) #reverse string and replace first Trump by %
    tmp = re.sub("@\s*:[^%]+?%", "%", tmp)  #delete words not say by trump
    return re.sub("%:", "", tmp)# delete %: wich is just to show wo speaks (but now it is trump every time)

In [10]:
with open("data/trump_full_speech.txt", 'r') as f:
    data = f.read()

In [11]:
#cleanedData = cleanTrumpData(data)
cleanedData = cleanTrumpData(data).lower()

In [12]:
cleanedData[:1000]

" wow. whoa. that is some group of people. thousands. so nice, thank you very much. that's really nice. thank you. it's great to be at trump tower. it's great to be in a wonderful city, new york. and it's an honor to have everybody here. this is beyond anybody's expectations. there's been no crowd like this. and, i can tell, some of the candidates, they went in. they didn't know the air-conditioner didn't work. they sweated like dogs.  they didn't know the room was too big, because they didn't have anybody there. how are they going to beat isis. i don't think it's gonna happen.  our country is in serious trouble. we don't have victories anymore. we used to have victories, but we don't have them. when was the last time anybody saw us beating, let's say, china in a trade deal. they kill us. i beat china all the time. all the time. when did we beat japan at anything. they send their cars over by the millions, and what do we do. when was the last time you saw a chevrolet in tokyo. it doesn

In [13]:
cleanedNormalizedData = normalize(cleanedData)
cleanedNormalizedData = cleanedNormalizedData[:1000]#To have a little sample

In [14]:
cleanedNormalizedData

' wow. whoa. that is some group of people. thousands. so nice thank you very much. thats really nice. thank you. its great to be at trump tower. its great to be in a wonderful city new york. and its an honor to have everybody here. this is beyond anybodys expectations. theres been no crowd like this. and i can tell some of the candidates they went in. they didnt know the airconditioner didnt work. they sweated like dogs.  they didnt know the room was too big because they didnt have anybody there. how are they going to beat isis. i dont think its gonna happen.  our country is in serious trouble. we dont have victories anymore. we used to have victories but we dont have them. when was the last time anybody saw us beating lets say china in a trade deal. they kill us. i beat china all the time. all the time. when did we beat japan at anything. they send their cars over by the millions and what do we do. when was the last time you saw a chevrolet in tokyo. it doesnt exist folks. they beat u

In [15]:
coefTrain = 0.8
nbTrain = int(len(cleanedNormalizedData)*coefTrain)
trainData, testData = cleanedNormalizedData[:nbTrain], cleanedNormalizedData[nbTrain:]
BATCH_SIZE = 2

In [16]:
class Dataset_trump(Dataset):
    def __init__(self, data, target, length=10):
        self.data = data
        self.length = length
        self.size = len(data)-self.length

    def __getitem__(self, index):
        return self.data[index:index+self.length], self.data[index+self.length]

    def __len__(self):
        return self.size

In [17]:
train_loader = DataLoader(Dataset_trump(trainData, None), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_trump(testData, None), shuffle=True, batch_size=BATCH_SIZE)

In [18]:
embedding = nn.Embedding(len(id2lettre), len(id2lettre))

In [61]:
num_epochs = 100
latent_size = 10
input_dim = len(id2lettre)
output_dim = len(id2lettre)
lr=1e-3

model = LSTM(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [62]:
# Training loop
print("Training ...")

with torch.autograd.set_detect_anomaly(True):
    for epoch in range(num_epochs):
        model.train()
        for i, (sequences, labels) in enumerate(train_loader):

            optimizer.zero_grad()

            X = embedding(strs2code(sequences))
            y = strs2code(labels).squeeze(1)

            hidden_states = model(X.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])

            train_loss = criterion(outputs, y)
            train_loss.backward(retain_graph=True)
            optimizer.step()

            #writer.add_scalar('Loss/train', train_loss, epoch)

        model.eval()
        for i, (sequences, labels) in enumerate(test_loader):
            with torch.no_grad():
                X = embedding(strs2code(sequences))
                y = strs2code(labels).squeeze(1)

                hidden_states = model(X.permute(1,0,2))
                outputs = model.decode(hidden_states[-1])
                test_loss = criterion(outputs, y)

            #writer.add_scalar('Loss/test', test_loss, epoch)
      #if(epoch%10==0):
        print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [39, 10]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

In [33]:
#Génération
debut = "thank y"
nbGenere = 20
sm = nn.Softmax(dim=1)

xgens = []

for i in range(nbGenere):
    
    if(i==0):#Première fois on forward la sequence
        X = embedding(strs2code([debut]))
        hidden_states = model(X.permute(1,0,2))
        hgen = hidden_states[-1]
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    else:#Ensuite on génère en one step
        x = embedding(strs2code([xgen])).squeeze(0)
        hgen = model.one_step(x,hgen)
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    xgens.append(xgen)

In [34]:
"".join(xgens)

'ont thant thany tont'