# File utils

In [146]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import logging
import csv
from torch.utils.data import Dataset, DataLoader
from collections import Counter

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.basicConfig(level=logging.INFO)


def fill_na(mat):
    ix,iy = np.where(np.isnan(mat))
    for i,j in zip(ix,iy):
        if np.isnan(mat[i+1,j]):
            mat[i,j]=mat[i-1,j]
        else:
            mat[i,j]=(mat[i-1,j]+mat[i+1,j])/2.
    return mat


def read_temps(path):
    """Lit le fichier de températures"""
    data = []
    with open(path, "rt") as fp:
        reader = csv.reader(fp, delimiter=',')
        next(reader)
        for row in reader:
            if not row[1].replace(".","").isdigit():
                continue
            data.append([float(x) if x != "" else float('nan') for x in row[1:]])
    return torch.tensor(fill_na(np.array(data)), dtype=torch.float)

In [163]:
class RNN(nn.Module):
    #  TODO:  Implémenter comme décrit dans la question 1
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act_encode = torch.tanh
        self.act_decode = torch.tanh

        # Network parameters
        self.linearX = nn.Linear(input_dim, latent_dim, bias=True)
        self.linearH = nn.Linear(latent_dim, latent_dim, bias=False)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)
        

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        return self.act_encode(self.linearX(x) + self.linearH(h))

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = torch.zeros((length, batch, self.latent_size), dtype=torch.float)
        res[0] = self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)) 

        for i in range(1,length):
            res[i] = self.one_step(x[i], res[i-1].clone())

        return res

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.act_decode(self.linearD(h))
    

In [166]:
class RNN(nn.Module):
    #  TODO:  Implémenter comme décrit dans la question 1
    def __init__(self, latent_dim, input_dim, output_dim):
        super().__init__()
        self.latent_size = latent_dim
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.act_encode = torch.tanh
        self.act_decode = torch.tanh

        # Network parameters
        self.linearX = nn.Linear(input_dim, latent_dim, bias=True)
        self.linearH = nn.Linear(latent_dim, latent_dim, bias=False)
        
        self.linearD = nn.Linear(latent_dim, output_dim, bias=True)
        

    def one_step(self, x, h):
        """ 
        compute the hidden state for one step of time
        dim(x) = batch x dimX
        dim(h) = batch x latent_size
        """
        return self.act_encode(self.linearX(x) + self.linearH(h))

    def forward(self, x):
        """
        Treat a batch of sequences,
        x -> batch of sequences, dim(X) = lenght_sequence x batch x dimX
        h -> init hidden state, dim(h) = batch x latent_size

        return a batch of hidden state sequences -> dim = lenght_sequence x batch x latent_size
        """
        length, batch, dim = x.shape
        res = []
        res.append(self.one_step(x[0], torch.zeros((batch, self.latent_size), dtype=torch.float)))

        for i in range(1,length):
            res.append(self.one_step(x[i], res[i-1]))

        return torch.stack(res)

        
    def decode(self, h):
        """
        decode a batch of hidden state
        """
        return self.act_decode(self.linearD(h))
    

# File exo2

In [None]:
"""
from utils import read_temps, device, RNN, Dataset_temp
import torch
from torch.utils.data import Dataset, DataLoader

#  TODO:  Question 2 : prédiction de la ville correspondant à une séquence

temp_test, temp_test_labels = read_temps("data/tempAMAL_test.csv").unsqueeze(1), torch.arange(30)
temp_train, temp_train_labels = read_temps("data/tempAMAL_train.csv").unsqueeze(1), torch.arange(30)
print(f"train shape {temp_train.shape}")
print(f"test shape {temp_test.shape}")

import ipdb; ipdb.set_trace()

BATCH_SIZE = 30

train_loader = DataLoader(Dataset_temp(temp_train, temp_train_labels), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_temp(temp_test, temp_test_labels), shuffle=True, batch_size=BATCH_SIZE)



num_epochs = 50
latent_size = 20
input_dim = 1
output_dim = temp_train.shape[1]

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=[model.Wx,model.Wh,model.Wd,model.bh,model.bd],lr=1e-3)
optimizer.zero_grad()

error = torch.nn.CrossEntropyLoss()

# Training loop
print("Training ...")

train_loss_list = []
test_loss_list = []

for epoch in range(num_epochs):
    model.train()
    for i, (sequences, labels) in enumerate(train_loader):
        
        optimizer.zero_grad()
        hidden_states = model(sequences)
        outputs = model.decode(hidden_states[-1])
        train_loss = error(outputs, sequences)
        train_loss.backward()
        optimizer.step()
        
        #writer.add_scalar('Loss/train', train_loss, epoch)

    model.eval()
    for i, (sequences, labels) in enumerate(test_loader):
        with torch.no_grad():
            hidden_states = model(sequences)
            outputs = model.decode(hidden_states[-1])
        test_loss = error(outputs, sequences)
        
        #writer.add_scalar('Loss/test', test_loss, epoch)
  #if(epoch%10==0):
    print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")
"""

# TME classification

In [148]:
class Dataset_temp(Dataset):
    def __init__(self, data, target, lenght=50):
        self.data = data
        self.lenght = lenght
        self.size = self.data.shape[0]-self.lenght+1

    def __getitem__(self, index):
        col = index//self.size
        lin = index%self.size
        return (self.data[lin:lin+self.lenght, col], col)

    def __len__(self):
        return self.size*self.data.shape[1]

In [149]:
temp_train = read_temps("data/tempAMAL_train.csv").unsqueeze(2)
temp_test = read_temps("data/tempAMAL_test.csv").unsqueeze(2)

In [150]:
nbClasse = 5
longueurData = 30
BATCH_SIZE = 6
longueurSeq = 4

In [151]:
temp_train = temp_train[:longueurData, :nbClasse]
temp_test = temp_test[:longueurData, :nbClasse]

In [152]:
train_loader = DataLoader(Dataset_temp(temp_train, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(Dataset_temp(temp_test, None, longueurSeq), shuffle=True, batch_size=BATCH_SIZE)

In [164]:
num_epochs = 10
latent_size = 10
input_dim = 1
output_dim = nbClasse
lr=1e-3

model = RNN(latent_size, input_dim, output_dim)

optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
optimizer.zero_grad()

criterion = torch.nn.CrossEntropyLoss()

In [165]:
# Training loop
print("Training ...")

with torch.autograd.set_detect_anomaly(True):
    for epoch in range(num_epochs):
        model.train()
        for i, (sequences, labels) in enumerate(train_loader):

            optimizer.zero_grad()

            hidden_states = model(sequences.permute(1,0,2))
            outputs = model.decode(hidden_states[-1])
            
            train_loss = criterion(outputs, labels)
            train_loss.backward()
            optimizer.step()

            #writer.add_scalar('Loss/train', train_loss, epoch)

        model.eval()
        for i, (sequences, labels) in enumerate(test_loader):
            with torch.no_grad():

                hidden_states = model(sequences.permute(1,0,2))
                outputs = model.decode(hidden_states[-1])
                test_loss = criterion(outputs, labels)

            #writer.add_scalar('Loss/test', test_loss, epoch)
      #if(epoch%10==0):
        print(f"Itérations {epoch}: train loss {train_loss}, test loss {test_loss}")

Training ...
Itérations 0: train loss 1.408150315284729, test loss 1.4691978693008423
Itérations 1: train loss 1.2020816802978516, test loss 1.2513796091079712
Itérations 2: train loss 2.1629505157470703, test loss 1.7868728637695312
Itérations 3: train loss 1.5914438962936401, test loss 1.7306660413742065
Itérations 4: train loss 1.6886820793151855, test loss 1.5907374620437622
Itérations 5: train loss 1.6217952966690063, test loss 1.6637362241744995
Itérations 6: train loss 1.5996265411376953, test loss 1.5807021856307983
Itérations 7: train loss 1.6258958578109741, test loss 1.622285008430481
Itérations 8: train loss 1.6335803270339966, test loss 1.6276897192001343
Itérations 9: train loss 1.597016453742981, test loss 1.5985349416732788


# TME Generation

### Code donnée

In [4]:
import string
import unicodedata

In [5]:
LETTRES = string.ascii_letters + string.punctuation+string.digits+' '
id2lettre = dict(zip(range(1, len(LETTRES)+1), LETTRES))
id2lettre[0] = ''
lettre2id = dict(zip(id2lettre.values(), id2lettre.keys()))

def normalize(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if c in LETTRES)

def string2code(s):
    return torch.tensor([lettre2id[c] for c in normalize(s)])

def code2string(t):
    if(type(t)!=list):
        t = t.tolist()
    return ''.join(id2lettre[i] for i in t)

In [6]:
string2code("test avec accent école")

tensor([20,  5, 19, 20, 95,  1, 22,  5,  3, 95,  1,  3,  3,  5, 14, 20, 95,  5,
         3, 15, 12,  5])

### Pré-traitement données trump

In [7]:
import re

In [87]:
def cleanTrumpData(s):
    tmp = re.sub("\[[^]]+\]", "", s) #delete non vocan words as [applause]
    tmp = re.sub(":\s*pmurT\s*\.", ":%.", tmp[::-1]) #reverse string and replace trump by %
    tmp = re.sub(":[^.%]+?\.", ":@.", tmp) # place all no trump speaker by @
    tmp = re.sub("^\s*Trump", "%", tmp[::-1]) #reverse string and replace first Trump by %
    tmp = re.sub("@\s*:[^%]+?%", "%", tmp)  #delete words not say by trump
    return re.sub("%:", "", tmp)# delete %: wich is just to show wo speaks (but now it is trump every time)

In [84]:
tmpData = re.sub("\[[^]]+\]", "", data[:1000])
tmpData = tmpData[::-1]
tmpData = re.sub(":\s*pmurT\s*\.", ":%.", tmpData)
tmpData = re.sub(":[^.%]+?\.", ":@.", tmpData) 
tmpData = re.sub("^\s*Trump", "%", tmpData[::-1])  
tmpData = re.sub("@\s*:[^%]+?%", "%", tmpData)
re.sub("%:", "", tmpData)  

" Wow. Whoa. That is some group of people. Thousands. So nice, thank you very much. That's really nice. Thank you. It's great to be at Trump Tower. It's great to be in a wonderful city, New York. And it's an honor to have everybody here. This is beyond anybody's expectations. There's been no crowd like this. And, I can tell, some of the candidates, they went in. They didn't know the air-conditioner didn't work. They sweated like dogs.  They didn't know the room was too big, because they didn't have anybody there. How are they going to beat ISIS? I don't think it's gonna happen.  Our country is in serious trouble. We don't have victories anymore. We used to have victories, but we don't have them. When was the last time anybody saw us beating, let's say, China in a trade deal? They kill us. I beat China all the time. All the time. When did we beat Japan at anything? They send their cars over by the "

In [None]:
class Dataset_trump(Dataset):
    def __init__(self, data, target, lenght=50):
        self.data = data
        self.lenght = lenght
        self.size = self.data.shape[0]-self.lenght+1

    def __getitem__(self, index):
        col = index//self.size
        lin = index%self.size
        return (self.data[lin:lin+self.lenght, col], col)

    def __len__(self):
        return self.size*self.data.shape[1]

In [85]:
with open("data/trump_full_speech.txt", 'r') as f:
    data = f.read()

In [93]:
cleanedData = cleanTrumpData(data)

In [94]:
cleanedNormalizedData = normalize(cleanedData)

In [95]:
allData = cleanedNormalizedData.replace("!",".").replace("?",".")

In [130]:
phrases = [phrase.strip() for phrase in allData.split(".")]
phrases = [phrase for phrase in phrases if len(phrase)>0]

In [139]:
for phrase in phrases:
    if(len(phrase)==7):
        print(phrase)

" "Yeah
I would
So true
history
Oh, Jon
history
650,000
In cash
Chicago
Go vote
Go vote
Go vote
Go vote


In [131]:
sorted(list(Counter([len(phrase) for phrase in phrases]).items()), reverse=False, key=lambda e:e[0])


[(1, 62),
 (2, 8),
 (3, 8),
 (4, 7),
 (5, 11),
 (6, 13),
 (7, 13),
 (8, 16),
 (9, 80),
 (10, 25),
 (11, 42),
 (12, 59),
 (13, 55),
 (14, 35),
 (15, 60),
 (16, 68),
 (17, 67),
 (18, 52),
 (19, 75),
 (20, 65),
 (21, 63),
 (22, 63),
 (23, 60),
 (24, 61),
 (25, 51),
 (26, 54),
 (27, 54),
 (28, 61),
 (29, 68),
 (30, 71),
 (31, 63),
 (32, 57),
 (33, 60),
 (34, 62),
 (35, 56),
 (36, 58),
 (37, 36),
 (38, 36),
 (39, 57),
 (40, 53),
 (41, 56),
 (42, 33),
 (43, 50),
 (44, 56),
 (45, 56),
 (46, 52),
 (47, 36),
 (48, 49),
 (49, 40),
 (50, 47),
 (51, 45),
 (52, 41),
 (53, 30),
 (54, 31),
 (55, 23),
 (56, 55),
 (57, 40),
 (58, 38),
 (59, 50),
 (60, 35),
 (61, 37),
 (62, 40),
 (63, 22),
 (64, 40),
 (65, 33),
 (66, 25),
 (67, 37),
 (68, 35),
 (69, 30),
 (70, 35),
 (71, 32),
 (72, 37),
 (73, 37),
 (74, 28),
 (75, 17),
 (76, 26),
 (77, 26),
 (78, 38),
 (79, 22),
 (80, 33),
 (81, 30),
 (82, 40),
 (83, 31),
 (84, 17),
 (85, 28),
 (86, 35),
 (87, 32),
 (88, 22),
 (89, 33),
 (90, 26),
 (91, 41),
 (92, 17),


[(3, 1), (2, 2), (1, 3)]