<a href="https://colab.research.google.com/github/jcauzi/jcauzi/blob/main/CNN_for_language_identification_on_surnames.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

M2 project : Convolutions and character embeddings
======================

The project aims to predict the language a character sequence comes from. The task is carried out on surnames and covers about a dozen languages.

Data download & description
---------------------

In [None]:
from urllib.request import urlretrieve

# Fetch the training and validation sets into the current working directory.
urlretrieve('http://www.linguist.univ-paris-diderot.fr/~bcrabbe/datasets/name2lang.train','name2lang.train')
urlretrieve('http://www.linguist.univ-paris-diderot.fr/~bcrabbe/datasets/name2lang.valid','name2lang.valid')

# Print the first 21 lines of the validation set. The `with` block closes the
# file even if printing is interrupted by an exception.
with open('name2lang.valid') as istream:
  for idx, line in enumerate(istream):
    print(line.strip())
    if idx >= 20:
      break


Barros, Portuguese
Campos, Portuguese
D'cruz, Portuguese
Henriques, Portuguese
Machado, Portuguese
Silva, Portuguese
Torres, Portuguese
Ahearn, Irish
Aonghus, Irish
Brady, Irish
Cearbhall, Irish
Flann, Irish
Kavanagh, Irish
Maguire, Irish
Mcmahon, Irish
Mcneil, Irish
Monahan, Irish
Muirchertach, Irish
Mullen, Irish
O'Connell, Irish
O'Grady, Irish


First exercise : data preprocessing (3pts)
---
The first exercise consists of creating encoding maps from integers to symbols and from symbols to integers.

In [None]:
def vocabulary(filename,char_vocab,pad_token='<pad>'):
    """Build the index <-> symbol encoding maps from a ``name, language`` file.

    Each non-blank line is split on commas. Symbols are numbered in order of
    first appearance.

    Args:
        filename: path of the data file to read.
        char_vocab: if true, the symbols are the individual characters of the
            text before the first comma (taken as-is, not stripped) and
            ``pad_token`` is reserved at index 0. Otherwise the symbols are the
            stripped second comma-separated fields (the language names) and no
            pad token is added.
        pad_token: padding symbol placed first in the character vocabulary.

    Returns:
        A couple ``(idx2sym, sym2idx)``: the list of symbols and the dict
        mapping each symbol to its position in that list.

    Raises:
        IndexError: if ``char_vocab`` is false and a non-blank line contains
            no comma.
    """
    idx2sym = [pad_token] if char_vocab else []
    # The dict doubles as the "already seen" set, so membership tests are O(1).
    sym2idx = {sym: idx for idx, sym in enumerate(idx2sym)}

    with open(filename) as istream:
        for line in istream:
            # Skip blank lines, as read_dataset does; otherwise '\n' would
            # become a vocabulary character (or crash the language branch).
            if not line or line.isspace():
                continue
            fields = line.split(",")
            symbols = fields[0] if char_vocab else (fields[1].strip(),)
            for sym in symbols:
                if sym not in sym2idx:
                    sym2idx[sym] = len(idx2sym)
                    idx2sym.append(sym)

    return idx2sym, sym2idx

# Display the (idx2sym, sym2idx) character maps built from the validation set.
print(vocabulary('name2lang.valid', char_vocab=True))

(['<pad>', 'B', 'a', 'r', 'o', 's', 'C', 'm', 'p', 'D', "'", 'c', 'u', 'z', 'H', 'e', 'n', 'i', 'q', 'M', 'h', 'd', 'S', 'l', 'v', 'T', 'A', 'g', 'y', 'b', 'F', 'K', 't', 'O', 'G', 'R', 'f', ' ', 'E', 'x', 'L', 'N', 'P', 'U', 'V', 'Z', 'w', 'Q', 'J', 'k', 'j', 'W', 'I', 'Y'], {'<pad>': 0, 'B': 1, 'a': 2, 'r': 3, 'o': 4, 's': 5, 'C': 6, 'm': 7, 'p': 8, 'D': 9, "'": 10, 'c': 11, 'u': 12, 'z': 13, 'H': 14, 'e': 15, 'n': 16, 'i': 17, 'q': 18, 'M': 19, 'h': 20, 'd': 21, 'S': 22, 'l': 23, 'v': 24, 'T': 25, 'A': 26, 'g': 27, 'y': 28, 'b': 29, 'F': 30, 'K': 31, 't': 32, 'O': 33, 'G': 34, 'R': 35, 'f': 36, ' ': 37, 'E': 38, 'x': 39, 'L': 40, 'N': 41, 'P': 42, 'U': 43, 'V': 44, 'Z': 45, 'w': 46, 'Q': 47, 'J': 48, 'k': 49, 'j': 50, 'W': 51, 'I': 52, 'Y': 53})


In [None]:
def pad_sequence(sequence,pad_size,pad_token):
    """Return a new list: ``sequence`` followed by enough ``pad_token`` items
    to reach ``pad_size`` elements.

    Nothing is appended when the sequence is already at least ``pad_size``
    long; the sequence is never truncated.
    """
    missing = pad_size - len(sequence)
    # range() of a non-positive count is empty, so no padding is produced then.
    padding = [pad_token for _ in range(missing)]
    return sequence + padding

##TEST
#print(pad_sequence(["y", "n"], 5, "pad"))

def code_sequence(charseq,encodingmap):
  """Translate a sequence of symbols into their integer codes.

  Symbols missing from ``encodingmap`` (e.g. characters never seen in the
  training set) are silently skipped, so the result may be shorter than
  ``charseq``.
  """
  codes = []
  for symbol in charseq:
    if symbol in encodingmap:
      codes.append(encodingmap[symbol])
  return codes

##TEST
# Encode a toy sequence with the character map of the validation set.
coded = code_sequence(list('abc'), vocabulary('name2lang.valid', char_vocab=True)[1])
print(coded)

def decode_sequence(idxseq,decodingmap):
  """Translate a list of integer codes back into symbols.

  ``decodingmap`` is indexed with every element of ``idxseq``; an index it
  does not contain raises the lookup error of ``decodingmap`` itself.
  """
  symbols = []
  for code in idxseq:
    symbols.append(decodingmap[code])
  return symbols

##TEST
# Decoding the indices should give back the original characters.
print(decode_sequence(idxseq=coded, decodingmap=vocabulary('name2lang.valid', char_vocab=True)[0]))

[2, 29, 11]
['a', 'b', 'c']


Second exercise : data generator (2pt)
------------

The data generator aims to efficiently deliver well-formed batches of data to the model.

In [None]:
def read_dataset(filename,input_symbols):
    """Read one column of a ``name, language`` data file.

    Blank (empty or whitespace-only) lines are skipped.

    Args:
        filename: path of the raw data file.
        input_symbols: if true, return the surnames, each as a list of
            characters; otherwise return the language names as strings.
            Both are stripped of surrounding whitespace.

    Returns:
        A list with one entry per non-blank line, in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly one comma.
    """
    symbols = []
    # `with` guarantees the file is closed even when a malformed line raises.
    with open(filename) as istream:
        for line in istream:
            if not line or line.isspace():
                continue
            word, lang = line.split(',')
            symbols.append(list(word.strip()) if input_symbols else lang.strip())
    return symbols

# Number of surnames in the validation set.
print(len(read_dataset('name2lang.valid', input_symbols=True)))


1941


In [None]:
from random import shuffle

class DataGenerator:
    """Serves a ``name, language`` data file as batches of integer codes.

    Attributes:
        pad_token: symbol used to pad the words of a batch to a common length.
        input_idx2sym, input_sym2idx: character encoding maps.
        output_idx2sym, output_sym2idx: language encoding maps.
        X: the surnames, each as a list of characters.
        Y: the language name of each surname.
    """

    def __init__(self,filename, parentgenerator = None,pad_token='<pad>'):
        """Load ``filename`` and build, or reuse, the encoding maps.

        Args:
            filename: path of the data file.
            parentgenerator: optional generator whose encodings are reused
                (typically the training generator, for a validation set).
            pad_token: padding symbol used when the vocabulary is built here.
                When a parent is given, the parent's pad token is used
                instead, because it is the one present in the reused maps.
        """
        if parentgenerator is not None: #Reuse the encodings of the parent if specified
            self.pad_token = parentgenerator.pad_token
            self.input_idx2sym,self.input_sym2idx     = parentgenerator.input_idx2sym,parentgenerator.input_sym2idx
            self.output_idx2sym,self.output_sym2idx   = parentgenerator.output_idx2sym,parentgenerator.output_sym2idx
        else:
            self.pad_token = pad_token
            self.input_idx2sym,self.input_sym2idx     = vocabulary(filename, True, pad_token)
            self.output_idx2sym,self.output_sym2idx   = vocabulary(filename, False, pad_token)
        self.X = read_dataset(filename,True)
        self.Y = read_dataset(filename,False)

    def generate_batches(self,batch_size):
        """Yield ``(seqX, seqY)`` batches of at most ``batch_size`` examples.

        ``seqX`` is a list of equally long lists of character codes (padded
        with the pad code) and ``seqY`` the list of language codes.

        Raises:
            KeyError: if a language of the dataset is missing from the output
                encoding, or the pad token is missing from the input encoding.
        """
        assert(len(self.X) == len(self.Y))

        N     = len(self.X)
        idxes = list(range(N))

        # Sorting by length groups words of similar size in the same batch,
        # which keeps padding small. The sort is stable, so shuffling first
        # randomises the order among words of equal length and the batches
        # differ from one call to the next.
        shuffle(idxes)
        idxes.sort(key=lambda idx: len(self.X[idx]))

        pad_idx = self.input_sym2idx[self.pad_token]

        for bstart in range(0, N, batch_size):
            batch_idxes = idxes[bstart:bstart+batch_size]
            batch_len   = max(len(self.X[idx]) for idx in batch_idxes)
            # Encode first, then pad: code_sequence drops unknown symbols, so
            # padding before encoding would leave rows of unequal length
            # whenever a word contains a character absent from the vocabulary.
            coded       = [code_sequence(self.X[idx],self.input_sym2idx) for idx in batch_idxes]
            seqX        = [pad_sequence(codes,batch_len,pad_idx) for codes in coded]
            seqY        = [self.output_sym2idx[self.Y[idx]] for idx in batch_idxes]

            assert(len(seqX) == len(seqY))
            yield (seqX,seqY)


Third exercise : Implement the word embedding submodule (5pts)
-----
This exercise consists of implementing a PyTorch submodule that takes as input a sequence of character indexes and outputs the corresponding word embedding for that sequence.

The module contains no training method and is meant to be used in a larger network. Its use is quite similar to `nn.Embedding`



In [None]:
import torch
import torch.nn as nn

class CharConvolution(nn.Module):
    """Word embedding built from character embeddings.

    Characters are embedded, passed through a 1-D convolution over the
    character axis, and max-pooled over the whole word.
    """

    def __init__(self,windowK,chars_vocab_size,input_embedding_size,output_embedding_size,padding_idx = None):
        """
        Args:
            windowK: number of neighbours on each side of a character; the
                kernel spans ``2*windowK + 1`` characters and the input is
                padded by ``windowK``, so the convolution keeps the word length.
            chars_vocab_size: number of rows of the character embedding table.
            input_embedding_size: size of a character embedding.
            output_embedding_size: size of the resulting word embedding.
            padding_idx: index of the pad symbol, forwarded to ``nn.Embedding``.
        """
        super(CharConvolution, self).__init__()
        self.embeddings = nn.Embedding(chars_vocab_size, input_embedding_size, padding_idx=padding_idx)
        self.convlayer = nn.Conv1d(input_embedding_size, output_embedding_size, (2*windowK)+1, padding=windowK)

    def forward(self,xinput):
        """Map a ``[batch, seq]`` tensor of character indexes to a
        ``[batch, output_embedding_size]`` tensor of word embeddings."""
        embedded = self.embeddings(xinput)           # [batch, seq, input_emb]
        embedded = torch.transpose(embedded, 1, 2)   # Conv1d wants [batch, channels, seq]
        features = self.convlayer(embedded)          # [batch, output_emb, seq]
        # Max over the character axis only. Reducing that single axis keeps
        # the batch axis for a one-word batch (and the feature axis when
        # output_embedding_size is 1), which a bare .squeeze() would drop.
        return torch.max(features, dim=2)[0]

Fourth Exercise : predict the target language (10pts)
-------
In this exercise, we aim to predict the target language from a word char embedding. You will implement for the `LanguageIdentifier` class:
* A forward function: the function takes as input a char index tensor and returns a vector of predictions for each word
* A train function: the function trains the model on the full dataset (with early stopping)
* A predict function: the function takes a test corpus (a list of words)
and predicts the language. The function outputs its results in textual form. Each word is printed on the same line as its predicted class.

Once implemented you are expected to search for hyperparameters in the main program.






In [None]:
import torch.optim as optim

class LanguageIdentifier(nn.Module):
    """Predicts the language of a word from its characters.

    A CharConvolution module turns the character indexes of each word into a
    word embedding, and a linear layer scores every language.

    NOTE: the ``train`` method below shadows ``nn.Module.train(mode)``, which
    ``nn.Module.eval()`` calls internally; do not call ``eval()`` on this class.
    """

    def __init__(self,datagenerator,window_size,char_embedding_size,word_embedding_size):
        """
        Args:
            datagenerator: generator providing the encodings; the sizes of its
                input and output vocabularies and its pad token are read here.
            window_size: convolution half-window passed to CharConvolution.
            char_embedding_size: size of the character embeddings.
            word_embedding_size: size of the word embedding fed to the
                output layer.
        """
        super(LanguageIdentifier, self).__init__()
        invocab_size   = len(datagenerator.input_idx2sym)
        outvocab_size  = len(datagenerator.output_idx2sym)
        pad_idx        = datagenerator.input_sym2idx[datagenerator.pad_token]
        self.charE     = CharConvolution(window_size,invocab_size,char_embedding_size,word_embedding_size,padding_idx = pad_idx)
        self.output    = nn.Linear(word_embedding_size,outvocab_size)

    def load(self,filename):
        """Load the parameters stored in ``filename`` into this model."""
        self.load_state_dict(torch.load(filename))

    def forward(self,xinput):
        """Map a ``[batch, seq]`` tensor of character indexes to a
        ``[batch, n_languages]`` tensor of language scores."""
        conv = self.charE(xinput)
        # Guarantee a [batch, features] layout even if the encoder returned a
        # tensor whose batch axis was squeezed out (single-word batch).
        conv = conv.reshape(xinput.shape[0], -1)
        return self.output(conv)

    def train(self,traingenerator,validgenerator,epochs,batch_size,device='cpu',learning_rate=0.001):
        """Train with SGD, validating after every epoch, with early stopping.

        ``validate`` saves the parameters to 'names_params.pt' whenever the
        validation loss reaches a new minimum. Training stops early once the
        validation loss has failed to improve for 3 consecutive epochs.

        Args:
            traingenerator: generator yielding the training batches.
            validgenerator: generator yielding the validation batches.
            epochs: maximal number of passes over the training set.
            batch_size: number of examples per batch.
            device: device on which the model and the batches are placed.
            learning_rate: SGD learning rate.
        """
        self.minloss = 10000000 #the minimal validation loss found so far for an epoch

        torch_device = torch.device(device)
        # The model has to live on the same device as the batches.
        self.to(torch_device)
        optimizer = torch.optim.SGD(self.parameters(), lr=learning_rate)
        loss_fnc  = nn.CrossEntropyLoss()

        ##Early stopping :
        patience      = 3
        trigger_times = 0

        for epoch in range(epochs) :
            print(epoch)

            for (trainX, trainY) in traingenerator.generate_batches(batch_size):
                X = torch.LongTensor(trainX).to(torch_device)
                Y = torch.LongTensor(trainY).to(torch_device)

                # Reset the gradients for every batch; otherwise they keep
                # accumulating from one batch to the next.
                optimizer.zero_grad()
                loss = loss_fnc(self.forward(X),Y)
                loss.backward()
                optimizer.step()

            # validate() lowers self.minloss (and saves the model) only when
            # the validation loss improves, so comparing the value before and
            # after the call tells whether this epoch was an improvement.
            # self.minloss is left to validate() alone: overwriting it with
            # the training loss would corrupt its checkpointing criterion.
            best_before = self.minloss
            self.validate(validgenerator, batch_size, device)

            if self.minloss < best_before :
                trigger_times = 0
            else :
                trigger_times += 1

            if trigger_times >= patience :
                print(f"loss went up {trigger_times} times ! \nTraining early stopped.")
                break

    def predict(self,datagenerator,batch_size,device='cpu'):
        """Predict the language of every word served by ``datagenerator``.

        Each word is paired with the language of highest score. The
        ``(word, language)`` pairs are then shuffled and the slice
        ``[100:150]`` is printed, i.e. up to 50 randomly chosen pairs
        (an empty list when there are 100 pairs or fewer). Returns nothing.
        """
        torch_device = torch.device(device)
        self.to(torch_device)
        pad_token   = datagenerator.pad_token
        predictions = []

        with torch.no_grad():   # inference only: no gradient bookkeeping
            for (seqX, seqY) in datagenerator.generate_batches(batch_size):
                X       = torch.LongTensor(seqX).to(torch_device)
                outputs = self.forward(X)
                # One prediction per row; argmax without `dim` would return a
                # single index into the flattened batch.
                Ypred   = torch.argmax(outputs,dim=1).tolist()

                for idxseq, lang_idx in zip(seqX, Ypred):
                    letters = decode_sequence(idxseq, datagenerator.input_idx2sym)
                    # Padding is not part of the word.
                    word = "".join(c for c in letters if c != pad_token)
                    predictions.append((word, datagenerator.output_idx2sym[lang_idx]))

        shuffle(predictions)
        print(predictions[100:150])

    def validate(self,datagenerator,batch_size,device='cpu',save_min_model=False):
        #This function cannot be modified 
        # Prints the mean loss and accuracy over the batches of `datagenerator`;
        # when the mean loss is below self.minloss, updates self.minloss and
        # saves the parameters to 'names_params.pt'.

        batch_accurracies = []
        batch_losses      = []
        batch_sizes       = []

        device    = torch.device(device)
        loss_fnc  = nn.CrossEntropyLoss()

        for (seqX,seqY) in datagenerator.generate_batches(batch_size):

              with torch.no_grad():   
                  X    = torch.LongTensor(seqX).to(device)
                  Y    = torch.LongTensor(seqY).to(device)

                  Yhat = self.forward(X)
                  loss = loss_fnc(Yhat,Y)
                  Ypred = torch.argmax(Yhat,dim=1)
                  acc   = float(torch.sum(Ypred == Y))
                
                  batch_losses.append(loss.item())
                  batch_accurracies.append(acc)
                  batch_sizes.append(len(Y))
        
        valid_loss = sum(batch_losses) / len(batch_losses) 
        print('[valid]  mean loss = %f, mean acc = %f'%( valid_loss , sum(batch_accurracies)/sum(batch_sizes)))
        if valid_loss < self.minloss:
            self.minloss = valid_loss
            torch.save(self.state_dict(), 'names_params.pt')


Main program. You are expected to search for hyperparameters:

In [None]:
# The training generator builds the encodings; the validation generator reuses them.
traing = DataGenerator(filename='name2lang.train')
validg = DataGenerator(filename='name2lang.valid', parentgenerator=traing)

# Training run (disabled here; previously saved parameters are loaded below):
# model = LanguageIdentifier(traing,2,32,512)
# model.train(traing,validg,25,128)

model = LanguageIdentifier(datagenerator=traing, window_size=2, char_embedding_size=32, word_embedding_size=512)
model.load('names_params.pt')

model.predict(datagenerator=traing, batch_size=1)

#model.validate(validg, 128)

[('Linsby', 'Russian'), ('Purnell', 'Russian'), ('Grierson', 'Russian'), ('Makhnenko', 'Russian'), ('Tyrrell', 'English'), ('Baz', 'Arabic'), ('Jachikov', 'Russian'), ('Bokhoven', 'Russian'), ('Podolinsky', 'Russian'), ('Mojin', 'Russian'), ('Yakubonis', 'Russian'), ('Vysokov', 'Russian'), ('Braune', 'German'), ('Lichman', 'English'), ('Deeb', 'Arabic'), ('Ichikawa', 'Japanese'), ('Vaskovsky', 'Russian'), ('Zhilinsky', 'Russian'), ('Norwood', 'English'), ('Allison', 'English'), ('Mikhail', 'Arabic'), ('Sai', 'Chinese'), ('Uzlov', 'Russian'), ('Shamon', 'Arabic'), ('Rooiakkers', 'English'), ('Whitlock', 'English'), ('Tulin', 'Russian'), ('Eneev', 'Russian'), ('Vinaver', 'Italian'), ('Potenza', 'Russian'), ('Tsaregradsky', 'Russian'), ('Filipek', 'Czech'), ('Munyabin', 'Russian'), ('Mukovozov', 'Russian'), ('Bakradze', 'Russian'), ('Dudnakov', 'Russian'), ('Balashov', 'Russian'), ('Hubiev', 'Russian'), ('Shalyugin', 'Russian'), ('Kawatake', 'Japanese'), ('Harlanov', 'Russian'), ('Teufel'

In [None]:
#INTERACTIVE PREDICTIONS 

#not implemented yet



TypeError: ignored