# Modèle de Markov Caché du second ordre

### Application à la correction de typos dans des textes

In [1]:
import nltk
from numpy import array, ones, zeros, multiply
import numpy as np
import sys

UNK = "<unk>"  # token to map all out-of-vocabulary words (OOVs)
UNKid = 0  # index for UNK
epsilon = 1e-100

# le smoothing_obs permet de ne pas avoir des probabilités d'émission/ transitions nulles ( matrice souvent sparse) , cela augmente la précision 
# https://core.ac.uk/download/pdf/62868660.pdf
class HMM:
    def __init__(self,
                 state_list,
                 observation_list,
                 transition_proba=None,
                 transition_proba_2=None,
                 observation_proba=None,
                 initial_state_proba=None,
                 smoothing_obs=0.001):
        """
            Builds a Hidden Markov Model
            * state_list is the list of state symbols [q_0...q_(N-1)]
            * observation_list is the list of observation symbols [v_0...v_(M-1)]
            * transition_proba is the transition probability matrix
                [a_ij] a_ij = Pr(Y_(t+1)=q_i|Y_t=q_j)
            * observation_proba is the observation probablility matrix
                [b_ki] b_ki = Pr(X_t=v_k|Y_t=q_i)
            * initial_state_proba is the initial state distribution
                [pi_i] pi_i = Pr(Y_0=q_i)"""

        print ("HMM creating with: ")

        self.N = len(state_list)  # number of states
        self.M = len(observation_list)  # number of possible emissions

        print (str(self.N) + " states")
        print( str(self.M) + " observations")

        self.omega_Y = state_list
        self.omega_X = observation_list

        if transition_proba is None:
            self.transition_proba = zeros((self.N, self.N))
        else:
            self.transition_proba = transition_proba

        if transition_proba is None:
            self.transition_proba_2 = zeros((self.N, self.N**2))
        else:
            self.transition_proba_2 = transition_proba_2

        if observation_proba is None:
            self.observation_proba = zeros((self.M, self.N))
        else:
            self.observation_proba = observation_proba

        if initial_state_proba is None:
            self.initial_state_proba = zeros((self.N, ))
        else:
            self.initial_state_proba = initial_state_proba

        self.make_indexes()  # build indexes, i.e the mapping between token and int
        self.smoothing_obs = smoothing_obs

    def make_indexes(self):  # OK
        """Creates the reverse table that maps states/observations names
            to their index in the probabilities array"""
        self.Y_index = {}
        for i in range(self.N):
            self.Y_index[self.omega_Y[i]] = i  # list d etat  // N = len(etat distinct)
        self.X_index = {}
        for i in range(self.M):
            self.X_index[self.omega_X[i]] = i # omega_X = list de mot // M =len(mot distinct )

    

    def encode(self, sentence):  
        
        """ Input: sentence of tuple (word, tag)
            return: index of word into the dictionnary
                    of word
                    index of tag into the dictionnary of
                    tag
            """
        word_index = []
        tag_index = []
        for couple in sentence:
            mot, tag  = couple
            if mot in self.X_index:   # self.X_index = dictionnaire { mot : indice, ...}
                word_index.append(self.X_index[mot])
            else:
                word_index.append(UNKid)
            tag_index.append(self.Y_index[tag])   # self.Y_index = dictionnaire { tag : indice, ...}
        return word_index, tag_index   # return array indice 

    def observation_matrix(self, compt_pair):   
        """ We build here the observation matrix (M, N)
            M the number of word (into the vocabulary) , 
            N the number of states
            Input : dictionary of count (word, tag )
            return : Observation matrix (M,N)
        """
       
        for pair in compt_pair:
            mot, tag = pair
            num = compt_pair[pair]  
            k = 0  # unkid par défault
            if mot in self.X_index:
                k = self.X_index[mot]    # recupère indice du mot dans dictionnaire 
            i = self.Y_index[tag]  # recupere indice du tag
            self.observation_proba[k, i] = num
        self.observation_proba = self.observation_proba + self.smoothing_obs
        self.observation_proba = self.observation_proba / self.observation_proba.sum(
            axis=0).reshape(1, self.N)
        #return observation_proba

    def transition_matrix(self, trans_counts):  # transition tag simple count (tag1, tag2)   #transition_matrix
        """ We build here the transition matrix (N, N) 
            N the number of states
            Input : dictionary of count (tag, tag ) :(states, states)
            return: transition matrix (N,N) (states_t+1,states_t)
        """
        for pair in trans_counts:
            i = self.Y_index[pair[0]]
            j = self.Y_index[pair[1]]  
            self.transition_proba[j, i] = trans_counts[pair]     
        self.transition_proba = self.transition_proba + self.smoothing_obs
        self.transition_proba = self.transition_proba / self.transition_proba.sum(
            axis=0).reshape(1, self.N)

    def transition_matrix_2(self, trans_counts_2): # TRANSITION 2 
        
        """ We build here the transition matrix 2  (N, N**2) 
            N the number of states
            N**2 number of tuple of states : ((a,a), (a,b)...(z,z))
            Input : dictionary of count ((tag_2, tag_1), tag ) :((states_2, states_1),states)
            return: transition matrix 2 (N,N**2) (states_t,(tuples (states_2,states_1)))
        """
        
        # trans_counts_2 : {(('t', 'h'), 'e'): 2479, (('h', 'e'), 'i'): 163, (('e', 'i'), 'r'): 162,
        
        for pair in trans_counts_2:
            i = self.Y_index[pair[0][0]]
            j = self.Y_index[pair[0][1]] # Y_index dictionnaire { etat: indice , ....}
            
            # k la ligne pour le dernier                                 
            k = self.Y_index[pair[1]]    # en dessous : modulo 26 pour le premier i et +- 26 pour j :(a,a), (a, b )...
            self.transition_proba_2[k, (i * self.N + j)] = trans_counts_2[pair]
        self.transition_proba_2 = self.transition_proba_2 + self.smoothing_obs
        self.transition_proba_2 = self.transition_proba_2 / self.transition_proba_2.sum(
            axis=0).reshape(1, self.N**2)

    def init(self, init_counts):
        """ We build here the inititation state matrix (N,) 
            N the number of states
            Input : dictionary of count (tag) : (states) for the first word (observation) of each sentence
            return: initation state matrix (N,) 
        """
        
        for tag in init_counts:
            i = self.Y_index[tag]  # recupère indice tag .. 
            self.initial_state_proba[i] = init_counts[tag]
        self.initial_state_proba = self.initial_state_proba / sum(
            self.initial_state_proba)

    def train(self, pair_counts, head_trans_counts, trans_counts, init_counts):
        self.observation_matrix(pair_counts)
        self.transition_matrix(head_trans_counts)
        self.transition_matrix_2(trans_counts)
        self.init(init_counts)

    def viterbi(self, obsids):
        """ Viterbi Algorithm : 
            Finding the most likely sequence of hidden states. 
            """

        T = len(obsids)

        delta = zeros(self.N, float) 
        tmp = zeros(self.N, float)
        psi = zeros((T, self.N), int)

        delta_t = zeros(self.N, float)

        delta = self.observation_proba[obsids[0]] * self.initial_state_proba
        
        for t in range(1, T):
            if t == 1:
                for i in range(self.N):
                    for j in range(self.N):
                        # début transition 1 
                        tmp[j] = delta[j] * self.transition_proba[i, j]
                    psi[t, i] = tmp.argmax()
                    delta_t[i] = tmp.max() * self.observation_proba[obsids[t],
                                                                    i]
            else:
                for i in range(self.N):
                    for j in range(self.N):
                        # suite => transition 2
                        tmp[j] = delta[j] * self.transition_proba_2[
                            i, psi[t - 1, j] * self.N + j]
                    psi[t, i] = tmp.argmax()
                    delta_t[i] = tmp.max() * self.observation_proba[obsids[t],
                                                                    i]

            delta, delta_t = delta_t, delta

        # Chemin inverse
        i_star = [delta.argmax()]
        for psi_t in psi[-1:0:-1]:
            i_star.append(psi_t[i_star[-1]])
        i_star.reverse()

        return i_star

   

# Compter les mots et les tags

In [2]:
# Fonction qui compte mots, tags, pairS (mot, tag), les transitions(tag1, tag2), les premiers mots 
# compter : lettre (x ) , lettre (y), (pair: (x, y) (lettre, lettre ), (etat, etat): (lettre, lettre ), (lettre premiere:  etat)

def compteur_dict(data_sent):
    
    compt_mot = {}
    compt_tag = {}
    compt_pair= {}
    compt_transition1 = {}
    compt_init = {}
    compt_transition2 = {}
    
    
    
    for sentence in data_sent:
        for i in range(len(sentence)):
            couple = sentence[i]
            mot, tag = couple
           
            if mot in compt_mot:
                compt_mot[mot] = compt_mot[mot] + 1
            else:
                compt_mot[mot] = 1
                
            if tag in compt_tag:
                compt_tag[tag] = compt_tag[tag] + 1
            else:
                compt_tag[tag] = 1
                
            if couple in compt_pair:
                compt_pair[couple] = compt_pair[couple] + 1
            else:
                compt_pair[couple] = 1
            
            if i > 1:
                trans = ((sentence[i-2][1], sentence[i-1][1]), tag) 
                if trans in compt_transition2:
                    compt_transition2[trans] = compt_transition2[trans] + 1
                else:
                    compt_transition2[trans] = 1
                    
            elif i == 1:
                trans = (sentence[i-1][1], tag)
                if trans in compt_transition1:
                    compt_transition1[trans] = compt_transition1[trans] + 1
                else:
                    compt_transition1[trans] = 1
                    
            else:
                if tag in compt_init:
                    compt_init[tag] = compt_init[tag] + 1
                else:
                    compt_init[tag] = 1
                    
    return compt_mot, compt_tag, compt_pair, compt_transition1, compt_transition2, compt_init

In [3]:
def make_vocab(c, threshold):
    voc = []
    voc.append(UNK)
    for w in c:
        if c[w] >= threshold:
            voc.append(w)
    return voc

def do_nothing(data_test):
    acc= 0
    total= 0
    for mot in data_test:
        for l1, l2 in mot:
            if l1==l2:
                acc= acc+1
            total= total+1
    return 1- acc/total
# un peu moin de 3% d'amelioration avec un viterbi du 1er ordre 

def precision(hmm, data_test) : 
    total = 0
    acc = 0

    for mot in data_test:

        obs_index, stat_index = hmm.encode(mot)
        sequence_pred = hmm.viterbi(obs_index)
        
        for i in range(len(sequence_pred)) : 
            if stat_index[i]== sequence_pred[i]:
                acc = acc +1 
            total = total+1
    return 1- acc/total



# Les données


In [4]:
import pickle


with open('train10.pkl', 'rb') as f:
    data_train_10 = pickle.load(f)
    
with open('test10.pkl', 'rb') as f:
    data_test_10 = pickle.load(f)
    
with open('train20.pkl', 'rb') as f:
    data_train_20 = pickle.load(f)
    
with open('test20.pkl', 'rb') as f:
    data_test_20 = pickle.load(f)
    

In [5]:
compt_mot,compt_tag,compt_pair,compt_trans1,compt_trans2,compt_init = compteur_dict(data_train_10)
print ("Nombre de lettres dans le train : ", len(compt_mot))
print ("Nombre de tags dans le train   : ", len(compt_tag))
print ("Nombre de paires dans le train : ",len(compt_pair))
print ("Nombre de transitions dans le train : " , len(compt_trans2) )
print ("Nombre de inititiation dans le train  : ", len(compt_init))
print (compt_tag)
vocab = make_vocab(compt_mot,10)
print ("Vocabulaire :", len(vocab))

Nombre de lettres dans le train :  26
Nombre de tags dans le train   :  26
Nombre de paires dans le train :  127
Nombre de transitions dans le train :  2489
Nombre de inititiation dans le train  :  25
{'b': 2070, 'y': 2985, 't': 13877, 'h': 6683, 'e': 18091, 'i': 10976, 'r': 8247, 'o': 11935, 'w': 2229, 'n': 9778, 'a': 10560, 'c': 4808, 'u': 3931, 'v': 1927, 'l': 6417, 's': 9762, 'f': 3379, 'm': 3773, 'd': 4541, 'g': 2736, 'k': 590, 'p': 3217, 'z': 124, 'j': 108, 'x': 274, 'q': 150}
Vocabulaire : 27


# Création du HMM et apprentissage

In [6]:
hmm = HMM(state_list=vocab, 
          observation_list=vocab,
          smoothing_obs = 0.01)

hmm.train(compt_pair,compt_trans1,compt_trans2,compt_init)

HMM creating with: 
27 states
27 observations


## Resultat test10

In [7]:
print("taux d'erreur en touchant à rien " , do_nothing(data_test_10))
print("taux d'erreur avec viterbi d'ordre 2 " , precision(hmm, data_test_10))

taux d'erreur en touchant à rien  0.10177595628415304
taux d'erreur avec viterbi d'ordre 2  0.046038251366120164


## Resultat test20

In [8]:
compt_mot,compt_tag,compt_pair,compt_trans1,compt_trans2,compt_init = compteur_dict(data_train_20)

hmm = HMM(state_list=vocab, 
          observation_list=vocab, 
          smoothing_obs = 0.001)

hmm.train(compt_pair,compt_trans1,compt_trans2,compt_init)
print("taux d'erreur en touchant à rien " , do_nothing(data_test_20))
print("taux d'erreur avec viterbi d'ordre 2 " , precision(hmm, data_test_20))


HMM creating with: 
27 states
27 observations
taux d'erreur en touchant à rien  0.19405667725121323
taux d'erreur avec viterbi d'ordre 2  0.09262476783895512


## Insertion lettres 

In [9]:
# On cree un corpus sans erreur 
def clean_corpus(data):
    clean_corp =[]
    
    for mot in data:
        clean_mot=[]
        for pair in mot :
            mot, tag = pair
            clean_mot.append((tag,tag))
        clean_corp.append(clean_mot)
    return clean_corp 


data_clean_train = clean_corpus(data_train_10)  
data_clean_test = clean_corpus(data_test_10)
do_nothing(data_clean_train)
            
            

0.0

In [10]:
# on insere des caractères au hasard dans le corpus 
# exemple : [('a', 'a'), ('c', 'c'), ('c', 'c'), ('o', 'o'), ('u', 'u'), ('n', 'n'), ('t', 't')]
# devient :  [('a', 'a'), ('c', 'c'), ('c', 'c'), ('w', '<ins>'), ('o', 'o'), ('u', 'u'), ('n', 'n'), ('t', 't')]

def insert_carac(data, taux_erreur = 0.1):
    import string
    import random
    alphabet =list(string.ascii_lowercase)
    err = 0
    total = 0
    for mot in data:
        for couple in mot:
            total = total+1
    taux_erreur = err / total 
    
    while  taux_erreur <0.1:
        i = random.randint(0,len(data)-1)
        mot = data[i]
        j= random.randint(0,len(mot)-1)
        lettre_insert = alphabet[random.randint(0,len(alphabet)-1)] 
        data[i].insert(j,(lettre_insert, '<ins>'))
        err = err + 1 
        taux_erreur = err/total 
    return data 

def make_vocab_insert(c, threshold):
   
    voc = list()
    voc.append(UNK)
    voc.append('<ins>')
    for w in c:
        if c[w] >= threshold:
            voc.append(w)
    return voc



data_insert_train = insert_carac(data_clean_train, taux_erreur = 0.1)
data_insert_test = insert_carac(data_clean_test, taux_erreur = 0.1)

In [11]:
compt_mot,compt_tag,compt_pair,compt_trans1,compt_trans2,compt_init = compteur_dict(data_insert_train)

vocab= make_vocab_insert(compt_mot, 10)
hmm = HMM(state_list=vocab, 
          observation_list=vocab, 
          smoothing_obs = 0.001)

hmm.train(compt_pair,compt_trans1,compt_trans2,compt_init)
print("taux d'erreur en touchant à rien " , do_nothing(data_insert_test))
print("taux d'erreur avec viterbi d'ordre 2 " , precision(hmm, data_insert_test))

HMM creating with: 
28 states
28 observations
taux d'erreur en touchant à rien  0.09090909090909094
taux d'erreur avec viterbi d'ordre 2  0.05278191753601591


### Supression 

In [13]:
data_clean_train = clean_corpus(data_train_10)  
data_clean_test = clean_corpus(data_test_10)

# creer un corpus avec des paires de lettres 
# exemple : mot pair  [(f,f), ('o','o'), ('r','r'), ('m','m')] devient  [('fo', 'fo'), ('rm', 'rm')] 
# mot impair [('t','t'), ('h','h'), ('e', 'e')]   devient   [('th', 'th'), ('e', 'e')]
def double_corpus(data):
    double_corp =[]
    
    for mot in data:
        double_mot=[]
        #print(mot)
        for i in range(1, len(mot), 2) :
            
            mot0, tag0 =mot[i-1]
            mot1, tag1 = mot[i]
            double_mot.append(((tag0+tag1), (tag0+ tag1)))
        if len(mot)%2:
            double_mot.append((mot[-1][1],mot[-1][1] ))
        double_corp.append(double_mot)
    return double_corp  

double_corpus_train = double_corpus(data_clean_train)
double_corpus_test = double_corpus(data_clean_test)

# détruit des caractères aléatoirement dans les observation 
# ex [('vi', 'vi'), ('ol', 'ol'), ('en', 'en'), ('ce', 'ce')] devient [('i', 'vi'), ('ol', 'ol'), ('en', 'en'), ('ce', 'ce')]
def del_carac(data, taux_erreur = 0.1):
    import string
    import random
    alphabet =list(string.ascii_lowercase)
    err = 0
    total = 0
    for mot in data:
        for couple in mot:
            total = total+1
    taux_erreur = err / total 
    
    while  taux_erreur <0.1:
        i = random.randint(0,len(data)-1)
        mot = data[i]
        j= random.randint(0,len(mot)-1)
        k = random.randint(0,1)
        
        lettre_insert = alphabet[random.randint(0,len(alphabet)-1)] 
        couple, couple_tag = data[i][j]
        if len( couple) >1:
            couple  = couple[k]
        new_couple = couple , couple_tag
        data[i][j] = new_couple 
        #print(data[i][j])
        #break 
        err = err + 1 
        taux_erreur = err/total 
    return data 

# on ajoute les observations dans le vocabulaire
# soit l alphabet ('a', 'b'...) et tous les couples alphabet ('aa'), ('ab')... ('zz')
def make_vocab_del(c, threshold):
    
    voc = list()
    voc.append(UNK)
    import string
    alphabet =list(string.ascii_lowercase)
    for l1 in alphabet:
        voc.append(l1)
        for l2 in alphabet:
            voc.append(l1+l2)
    return voc

from random import *



del_carac(double_corpus_train, taux_erreur = 0.1)

[[('by', 'by')],
 [('th', 'th'), ('ei', 'ei'), ('r', 'r')],
 [('ow', 'ow'), ('n', 'n')],
 [('ac', 'ac'), ('co', 'co'), ('n', 'un'), ('t', 't')],
 [('i', 'vi'), ('ol', 'ol'), ('en', 'en'), ('ce', 'ce')],
 [('s', 'is')],
 [('fo', 'fo'), ('r', 'r')],
 [('th', 'th'), ('em', 'em')],
 [('a', 'a')],
 [('fo', 'fo'), ('rm', 'rm')],
 [('of', 'of')],
 [('li', 'li'), ('be', 'be'), ('ra', 'ra'), ('ti', 'ti'), ('on', 'on')],
 [('in', 'in')],
 [('ot', 'ot'), ('he', 'he'), ('r', 'r')],
 [('wo', 'wo'), ('rd', 'rd'), ('s', 's')],
 [('by', 'by')],
 [('co', 'co'), ('mm', 'mm'), ('it', 'it'), ('ti', 'ti'), ('ng', 'ng')],
 [('vi', 'vi'), ('ol', 'ol'), ('en', 'en'), ('e', 'ce')],
 [('t', 'th'), ('ey', 'ey')],
 [('br', 'br'), ('ea', 'ea'), ('k', 'k')],
 [('th', 'th'), ('ro', 'ro'), ('ug', 'ug'), ('h', 'h')],
 [('th', 'th'), ('e', 'e')],
 [('ps', 'ps'),
  ('yc', 'yc'),
  ('ho', 'ho'),
  ('lo', 'lo'),
  ('gi', 'gi'),
  ('ca', 'ca'),
  ('l', 'l')],
 [('re', 're'), ('st', 'st'), ('ra', 'ra'), ('in', 'in'), ('ts',

In [14]:
# Seulement des suppressions
train10 = del_carac(double_corpus_train, 10)
test10 = del_carac(double_corpus_test, 10)
compt_mot,compt_tag,compt_pair,compt_trans1,compt_trans2,compt_init = compteur_dict(train10)



In [15]:
vocab = make_vocab_del(compt_mot,10)
hmm = HMM(state_list=vocab, 
          observation_list=vocab,
          smoothing_obs = 0.001)

hmm.train(compt_pair,compt_trans1,compt_trans2,compt_init)


HMM creating with: 
703 states
703 observations


In [16]:
print("taux d'erreur en touchant à rien " , do_nothing(test10))
print("taux d'erreur avec viterbi d'ordre 2 " , precision(hmm, test10))

taux d'erreur en touchant à rien  0.0742871435717859
taux d'erreur avec viterbi d'ordre 2  0.03101550775387696


# Unsupervised Learning 

In [23]:
#http://www.cip.informatik.uni-muenchen.de/~hauser/papers/unsupervisedHistIR.pdf

def distance_levenshtein(string1, string2):
    d= np.zeros((len(string1), len(string2)))
    cout_sub= 0
    for i in range(len(string1)):
        d[i,0]= i
    for j in range(len(string2)):
        d[0,j]= j 
    for i in range(1,len(string1)):
        for j in range(1, len(string2)):
            if string1[i-1]== string2[j-1]:
                cout_sub = 0
            else:
                cout_sub=1
            d[i,j ] = min(d[i-1,j]+1, d[i,j-1]+1, d[i-1,j-1]+cout_sub)
                
    return d[len(string1)-1,len(string2)-1] 
distance_levenshtein('chien', 'chion')




def unsup_levenshtein(lexique, corpus, maxCost= 1, n= 1 ):
    # lexique liste de mot 
    #corpus liste de mot 
    corpus_corr=[]
    for word1 in corpus:
        temp=[]
        word_corr= word1 
        for word2 in lexique:
            dis = distance_levenshtein(word1, word2)
            if dis <= maxCost:
                temp.append((word1,word2))
        if len(temp)<= n :
            for word1, word2 in temp:
                conversions = []
                conversions_mot=[]
                conversions_mot.append(word2)
                conversions.append(distance_levenshtein(word2, word1))
                word_corr = conversions_mot[np.argmin(np.array(conversions)) ]
        corpus_corr.append(word_corr)
         #record_conv(conversions)
    #compute_weight()
    return corpus_corr

def create_lexique(data):
    lexique =[]
    for mot in data:
        mot_entier =""
        for _, tag in mot:
            mot_entier = mot_entier+tag
        lexique.append(mot_entier)
    return set(lexique), lexique  # pour verif erreur lexique

        
def create_corpus(data ):
    corpus =[]
    for mot in data:
        mot_entier =""
        for lettre,_  in mot:
            mot_entier = mot_entier+lettre
        corpus.append(mot_entier)
    return corpus


            
    
corp = create_corpus(data_test_10) 
lex, true_corp = create_lexique(data_train_10)

_ ,true_corp_test = create_lexique(data_test_10)
  

In [24]:
pred_corpus = unsup_levenshtein(lex, corp, maxCost= 1, n= 1 )

In [35]:
corp

['the',
 'leftist',
 'is',
 'too',
 'far',
 'gone',
 'for',
 'that',
 'his',
 'reeljhgs',
 'of',
 'inferikrigy',
 'are',
 'sl',
 'ingrzined',
 'that',
 'he',
 'cannot',
 'conceive',
 'of',
 'hkmsekf',
 'ad',
 'individually',
 'strkng',
 'and',
 'vakhavle',
 'hence',
 'tnr',
 'cillectivism',
 'of',
 'yhe',
 'leftist',
 'he',
 'can',
 'feel',
 'strong',
 'ojly',
 'ss',
 's',
 'membee',
 'of',
 'a',
 'karge',
 'prganization',
 'or',
 'q',
 'mass',
 'movejrnt',
 'with',
 'which',
 'he',
 'idenyifies',
 'himself',
 'notice',
 'the',
 'maaochistic',
 'tendency',
 'of',
 'leftist',
 'tsctics',
 'lertusts',
 'pfoyesr',
 'ny',
 'lying',
 'cosn',
 'in',
 'rront',
 'ot',
 'vehicles',
 'they',
 'intenrionally',
 'provokr',
 'police',
 'or',
 'racists',
 'tl',
 'abhse',
 'them',
 'etc',
 'theze',
 'tactixs',
 'may',
 'often',
 'be',
 'effectuve',
 'bur',
 'mamy',
 'leftists',
 'usr',
 'them',
 'not',
 'as',
 'a',
 'meqns',
 'to',
 'aj',
 'end',
 'but',
 'because',
 'tney',
 'prefer',
 'masochixtic'

In [36]:
pred_corpus

['the',
 'leftist',
 'is',
 'too',
 'far',
 'gone',
 'for',
 'that',
 'his',
 'reeljhgs',
 'of',
 'inferikrigy',
 'are',
 'sl',
 'ingrzined',
 'that',
 'he',
 'cannot',
 'conceive',
 'of',
 'hkmsekf',
 'ad',
 'individuals',
 'strong',
 'and',
 'vakhavle',
 'hence',
 'tnr',
 'cillectivism',
 'of',
 'yhe',
 'leftist',
 'he',
 'can',
 'feel',
 'strong',
 'ojly',
 'ss',
 's',
 'member',
 'of',
 'a',
 'large',
 'organization',
 'or',
 'q',
 'mass',
 'movejrnt',
 'with',
 'which',
 'he',
 'idenyifies',
 'himself',
 'notion',
 'the',
 'maaochistic',
 'tendency',
 'of',
 'leftist',
 'tsctics',
 'lertusts',
 'pfoyesr',
 'ny',
 'line',
 'cosn',
 'in',
 'rront',
 'ot',
 'vehicles',
 'they',
 'intentionally',
 'provokr',
 'police',
 'or',
 'racism',
 'tl',
 'abuse',
 'them',
 'etc',
 'theze',
 'tactixs',
 'may',
 'often',
 'be',
 'effective',
 'bur',
 'mamy',
 'leftists',
 'usr',
 'them',
 'not',
 'as',
 'a',
 'meqns',
 'to',
 'aj',
 'end',
 'but',
 'because',
 'tney',
 'prefer',
 'masochixtic',
 

In [31]:
total= 0
compt =0

for i in range(len(true_corp_test)):
    if true_corp_test[i] == pred_corpus[i]:
        compt = compt + 1
    total = total +1 
print("precision mot bien écrit après correction via distance " , compt/ total )

total= 0
compt =0

for i in range(len(true_corp_test)):
    if true_corp_test[i] == corp[i]:
        compt = compt + 1
    total = total +1 
print("precision mot bien écrit sans correction " , compt/ total )




precision mot bien écrit après correction via distance  0.7128580946035976
precision mot bien écrit sans correction  0.6289140572951366
