# File utils

In [146]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import logging
import csv
from torch.utils.data import Dataset, DataLoader
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)


def fill_na(mat):
    ix,iy = np.where(np.isnan(mat))
    for i,j in zip(ix,iy):
        if np.isnan(mat[i+1,j]):
            mat[i,j]=mat[i-1,j]
        else:
            mat[i,j]=(mat[i-1,j]+mat[i+1,j])/2.
    return mat


def read_temps(path):
    """Lit le fichier de températures"""
    data = []
    with open(path, "rt") as fp:
        reader = csv.reader(fp, delimiter=',')
        next(reader)
        for row in reader:
            if not row[1].replace(".","").isdigit():
                continue
            data.append([float(x) if x != "" else float('nan') for x in row[1:]])
    return torch.tensor(fill_na(np.array(data)), dtype=torch.float)

In [163]:
class RNN(nn.Module):
    #  TODO:  Implémenter comme décrit dans la question 1
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act_encode = torch.tanh
        self.act_decode = torch.tanh

        # Network parameters
        self.linearX = nn.Linear(input_dim, latent_dim, bias=True)
        self.linearH = nn.Linear(latent_dim, latent_dim, bias=False)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)
        

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        return self.act_encode(self.linearX(x) + self.linearH(h))

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = torch.zeros((length, batch, self.latent_size), dtype=torch.float)
        res[0] = self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)) 

        for i in range(1,length):
            res[i] = self.one_step(x[i], res[i-1].clone())

        return res

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.act_decode(self.linearD(h))
    

In [311]:
class RNN(nn.Module):
    #  TODO:  Implémenter comme décrit dans la question 1
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act_encode = torch.tanh
        self.act_decode = torch.tanh

        # Network parameters
        self.linearX = nn.Linear(input_dim, latent_dim, bias=True)
        self.linearH = nn.Linear(latent_dim, latent_dim, bias=False)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)
        

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        return self.act_encode(self.linearX(x) + self.linearH(h))

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = []
        res.append(self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)))

        for i in range(1,length):
            res.append(self.one_step(x[i], res[i-1]))

        return torch.stack(res)

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.act_decode(self.linearD(h))
    

# File exo2

In [None]:
"""
from utils import read_temps, device, RNN, Dataset_temp
import torch
from torch.utils.data import Dataset, DataLoader

#  TODO:  Question 2 : prédiction de la ville correspondant à une séquence

temp_test, temp_test_labels = read_temps("data/tempAMAL_test.csv").unsqueeze(1), torch.arange(30)
temp_train, temp_train_labels = read_temps("data/tempAMAL_train.csv").unsqueeze(1), torch.arange(30)
print(f"train shape {temp_train.shape}")
print(f"test shape {temp_test.shape}")

import ipdb; ipdb.set_trace()

BATCH_SIZE = 30

train_loader = DataLoader(Dataset_temp(temp_train, temp_train_labels), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_temp(temp_test, temp_test_labels), shuffle=True, batch_size=BATCH_SIZE)



num_epochs = 50
latent_size = 20
input_dim = 1
output_dim = temp_train.shape[1]

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=[model.Wx,model.Wh,model.Wd,model.bh,model.bd],lr=1e-3)
optimizer.zero_grad()

error = torch.nn.CrossEntropyLoss()

# Training loop
print("Training ...")

train_loss_list = []
test_loss_list = []

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()
        hidden_states = model(sequences)
        outputs = model.decode(hidden_states[-1])
        train_loss = error(outputs, sequences)
        train_loss.backward()
        optimizer.step()
        
        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            hidden_states = model(sequences)
            outputs = model.decode(hidden_states[-1])
        test_loss = error(outputs, sequences)
        
        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")
"""

# TME classification

In [148]:
class Dataset_temp(Dataset):
    def __init__(self, data, target, lenght=50):
        self.data = data
        self.lenght = lenght
        self.size = self.data.shape[0]-self.lenght+1

    def __getitem__(self, index):
        col = index//self.size
        lin = index%self.size
        return (self.data[lin:lin+self.lenght, col], col)

    def __len__(self):
        return self.size*self.data.shape[1]

In [149]:
temp_train = read_temps("data/tempAMAL_train.csv").unsqueeze(2)
temp_test = read_temps("data/tempAMAL_test.csv").unsqueeze(2)

In [150]:
nbClasse = 5
longueurData = 30
BATCH_SIZE = 6
longueurSeq = 4

In [151]:
temp_train = temp_train[:longueurData, :nbClasse]
temp_test = temp_test[:longueurData, :nbClasse]

In [152]:
train_loader = DataLoader(Dataset_temp(temp_train, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_temp(temp_test, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)

In [307]:
num_epochs = 10
latent_size = 10
input_dim = 1
output_dim = nbClasse
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [308]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        hidden_states = model(sequences.permute(1,0,2))
        outputs = model.decode(hidden_states[-1])

        train_loss = criterion(outputs, labels)
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():

            hidden_states = model(sequences.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            test_loss = criterion(outputs, labels)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...


AttributeError: 'tuple' object has no attribute 'permute'

# TME Generation

### Code donnée

In [4]:
import string
import unicodedata

In [261]:
#LETTRES = string.ascii_letters + string.punctuation+string.digits+' '
#LETTRES = string.ascii_letters+' '
LETTRES = string.ascii_letters[:26]+' '
id2lettre = dict(zip(range(1, len(LETTRES)+1), LETTRES))
id2lettre[0] = ''
lettre2id = dict(zip(id2lettre.values(), id2lettre.keys()))

def normalize(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if c in LETTRES)

def string2code(s):
    return torch.tensor([lettre2id[c] for c in normalize(s)])

def code2string(t):
    if(type(t)!=list):
        t = t.tolist()
    return ''.join(id2lettre[i] for i in t)

def str2code(s):
    return [lettre2id[c] for c in s]

def strs2code(ss):
    return torch.LongTensor([str2code(s) for s in ss])

In [197]:
string2code("test avec accent école")

tensor([20,  5, 19, 20, 53,  1, 22,  5,  3, 53,  1,  3,  3,  5, 14, 20, 53,  5,
         3, 15, 12,  5])

### Pré-traitement données trump

In [326]:
import re

In [327]:
def cleanTrumpData(s):
    tmp = re.sub("\[[^]]+\]", "", s) #delete non vocan words as [applause]
    tmp = re.sub(":\s*pmurT\s*\.", ":%.", tmp[::-1]) #reverse string and replace trump by %
    tmp = re.sub(":[^.%]+?\.", ":@.", tmp) # place all no trump speaker by @
    tmp = re.sub("^\s*Trump", "%", tmp[::-1]) #reverse string and replace first Trump by %
    tmp = re.sub("@\s*:[^%]+?%", "%", tmp)  #delete words not say by trump
    return re.sub("%:", "", tmp)# delete %: wich is just to show wo speaks (but now it is trump every time)

In [329]:
with open("data/trump_full_speech.txt", 'r') as f:
    data = f.read()

In [330]:
#cleanedData = cleanTrumpData(data)
cleanedData = cleanTrumpData(data).lower()

In [346]:
cleanedNormalizedData = normalize(cleanedData)
cleanedNormalizedData = cleanedNormalizedData[:1000]#To have a little sample

In [332]:
#Pour bien séparer mais pas utile pour l'instant
allData = cleanedNormalizedData.replace("!",".").replace("?",".")
phrases = [phrase.strip() for phrase in allData.split(".")]
phrases = [phrase for phrase in phrases if len(phrase)>0]
histo = sorted(list(Counter([len(phrase) for phrase in phrases]).items()), reverse=False, key=lambda e:e[0])

In [347]:
coefTrain = 0.8
nbTrain = int(len(cleanedNormalizedData)*0.8)
trainData, testData = cleanedNormalizedData[:nbTrain], cleanedNormalizedData[nbTrain:]
BATCH_SIZE = 2

In [348]:
class Dataset_trump(Dataset):
    def __init__(self, data, target, length=10):
        self.data = data
        self.length = length
        self.size = len(data)-self.length

    def __getitem__(self, index):
        return self.data[index:index+self.length], self.data[index+self.length]

    def __len__(self):
        return self.size

In [350]:
train_loader = DataLoader(Dataset_trump(trainData, None), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_trump(testData, None), shuffle=True, batch_size=BATCH_SIZE)

In [351]:
embedding = nn.Embedding(len(id2lettre), len(id2lettre))

In [352]:
num_epochs = 100
latent_size = 10
input_dim = len(id2lettre)
output_dim = len(id2lettre)
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [353]:
# Training loop
print("Training ...")

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        X = embedding(strs2code(sequences))
        y = strs2code(labels).squeeze(1)
        
        hidden_states = model(X.permute(1,0,2))
        outputs = model.decode(hidden_states[-1])

        train_loss = criterion(outputs, y)
        train_loss.backward()
        optimizer.step()

        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            X = embedding(strs2code(sequences))
            y = strs2code(labels).squeeze(1)
            
            hidden_states = model(X.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            test_loss = criterion(outputs, y)

        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 2.8953452110290527, test loss 2.8922247886657715
Itérations 1: train loss 3.145390272140503, test loss 3.3052873611450195
Itérations 2: train loss 2.5619359016418457, test loss 2.4238903522491455
Itérations 3: train loss 2.6971216201782227, test loss 3.0695183277130127
Itérations 4: train loss 2.9537553787231445, test loss 2.2594547271728516
Itérations 5: train loss 2.7399237155914307, test loss 2.204073905944824
Itérations 6: train loss 2.2690963745117188, test loss 3.13906192779541
Itérations 7: train loss 2.6233603954315186, test loss 3.176398754119873
Itérations 8: train loss 3.2904183864593506, test loss 2.896482467651367
Itérations 9: train loss 2.5344767570495605, test loss 3.260714054107666
Itérations 10: train loss 2.139059543609619, test loss 2.4624903202056885
Itérations 11: train loss 2.8210980892181396, test loss 2.468247652053833
Itérations 12: train loss 2.9178080558776855, test loss 2.6756556034088135
Itérations 13: train loss 2.341

In [410]:
#Génération
debut = "thank y"
nbGenere = 20
sm = nn.Softmax(dim=1)

xgens = []

for i in range(nbGenere):
    
    if(i==0):#Première fois on forward la sequence
        X = embedding(strs2code([debut]))
        hidden_states = model(X.permute(1,0,2))
        hgen = hidden_states[-1]
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    else:#Ensuite on génère en one step
        x = embedding(strs2code([xgen])).squeeze(0)
        hgen = model.one_step(x,hgen)
        outputs = model.decode(hgen)
        xgen = id2lettre[int(sm(outputs)[0].argmax())]
    xgens.append(xgen)

In [409]:
xgens

['o', 'n', 't', ' ', 't', 'h', 'e', ' ', 't', 'o', ' ', 't', 'h', 'e', ' ']