# Importación de librerías

In [None]:
import pandas as pd
import numpy as np
import os

# Definición de clases auxiliares

En esta sección se definirán dos clases que servirán para agrupar información a la entrada y a la salida de la clase *HMMBigram*, que será donde se calculen las tablas probabilísticas de transición y observación.

In [None]:
class Word:
    '''
    Groups the three properties kept for every corpus word: its token,
    its tag and its lemma.
    '''

    def __init__(self, token: str, tag: str, lemma: str):
        '''
        Class constructor.
        Parameters:
        -token: Word token
        -tag: Word tag
        -lemma: Word lemma
        '''
        self._token, self._tag, self._lemma = token, tag, lemma

    def Token(self):
        '''
        Gives back the token received by the constructor
        '''
        return self._token

    def Tag(self):
        '''
        Gives back the tag received by the constructor
        '''
        return self._tag

    def Lemma(self):
        '''
        Gives back the lemma received by the constructor
        '''
        return self._lemma



In [None]:
class HMM_Probabilities:
    '''
    It stores the transition and emission probabilities of each document of the corpus
    '''

    # The annotations name the DataFrame type; the previous `pd.DataFrame()`
    # built a throwaway DataFrame instance every time the class was defined.
    def __init__(self, _prob_trans: pd.DataFrame, _prob_obs: pd.DataFrame):
        '''
        Class constructor.
        Parameters:
        -_prob_trans: The probability table of transition
        -_prob_obs: The probability table of emission
        '''
        self._prob_trans = _prob_trans
        self._prob_obs = _prob_obs

    def PTrans(self):
        '''
        Returns the stored probability table of transition (the same object, not a copy)
        '''
        return self._prob_trans

    def PObs(self):
        '''
        Returns the stored probability table of emission (the same object, not a copy)
        '''
        return self._prob_obs

Se declara también la función *non_zero_green*, que se utilizará como medio para mostrar con mayor claridad algunos valores de las tablas

In [None]:
def non_zero_green(val):
    '''
    Cell-style callback: gives the Aquamarine background rule for values
    strictly greater than 0 and an empty style for everything else
    '''
    if val > 0:
        return 'background-color: Aquamarine'
    return ''

# Definición de la clase HMMBigram

Esta clase permitirá hacer el cálculo de las tablas de probabilidades de transición y de emisión a partir de la información agrupada del corpus (por la clase *Word*). Como no se calcularán las tablas finales de forma instantánea, sino que se calcularán por separado para cada archivo de wikicorpus descargado, la clase *HMM_Probabilities* almacenará cada set de probabilidades calculado en cada caso.

In [2]:
class HMMBigram:
    '''
    Class to obtain the bigram HMM probability tables (transition and emission)
    from a corpus of tagged words.

    The corpus is a list of sentences and every sentence is a list of objects
    exposing Token(), Tag() and Lemma(). The first letter of each tag is mapped
    to a state name through state_dictionary; an unknown first letter raises
    KeyError.
    '''

    def __init__(self, corpus: "list[list[Word]]", LemmaDictionary: dict, RemovedLemmaDict: dict):
        '''
        Class constructor
        Parameters:
        -corpus: The corpus to process (list of sentences, each a list of words)
        -LemmaDictionary: The dictionary of lemmas to fill; it is updated in place
        -RemovedLemmaDict: The dictionary of filtered tokens and their lemmas; it is updated in place
        '''
        self._corpus = corpus
        self._states = dict()
        self._tokens = dict()
        self._reducedtokens = dict()
        self._lemmas_dict = LemmaDictionary

        self._removedDict = RemovedLemmaDict
        self._q0 = 'q0'
        self._qF = 'qF'
        self.state_dictionary = {"A":"Adjective", "D":"Determiner", "N":"Noun", "V":"Verb", "P":"Pronoun", "R":"Adverb", "C":"Conjunction", "S":"Adposition", "W":"Date", "Z":"Number", "I":"Interjection", "F":"Punctuation"}

        self._prob_trans = pd.DataFrame()
        self._prob_obs = pd.DataFrame()

        # Set once the counts have been computed. An explicit flag is needed
        # because an empty count dictionary is a legitimate result (e.g. a
        # corpus made only of filtered words) and must not trigger a recount.
        self._processed = False

    def LemmaDictionaries(self):
        '''
        Returns the dictionary of lemmas created (the same object given to the constructor)
        '''
        return self._lemmas_dict

    def RemovedLemmaDict(self):
        '''
        Returns the dictionary of filtered lemmas created (the same object given to the constructor)
        '''
        return self._removedDict

    def _ensure_processed(self):
        '''
        Runs CorpusProcessing once if it has not been run yet
        '''
        if not self._processed:
            self.CorpusProcessing()

    def CorpusProcessing(self):
        '''
        Counts the number of occurrences of states and tokens and identifies the "tokenseliminados".

        Words tagged as date (W), number (Z) or proper noun (NP) only feed the
        dictionary of removed lemmas. Among the remaining words, tokens containing
        '_' or '.' are counted together under the "tokenseliminados" key and every
        other token is counted on its own and registered in the dictionary of lemmas.

        The counts are rebuilt from scratch on every call, so calling this method
        more than once does not inflate them.
        '''
        self._states = dict()
        self._tokens = dict()
        self._reducedtokens = dict()

        for sentence in self._corpus:
            for word in sentence:
                tag = word.Tag()
                state = self.state_dictionary[tag[0]]
                raw_token = word.Token()
                token = raw_token.lower()

                #Frequency of appearance of each state and token (without filtering)
                self._states[state] = self._states.get(state, 0) + 1
                self._tokens[token] = self._tokens.get(token, 0) + 1

                #Filtered words only feed the dictionary of removed lemmas
                if tag[0] in ("W", "Z") or tag[:2] == "NP":
                    self._removedDict.setdefault(token, dict())[state] = word.Lemma().lower()
                    continue

                if '_' in raw_token or '.' in raw_token:
                    #Compound or malformed tokens share a single bucket
                    key = "tokenseliminados"
                else:
                    self._lemmas_dict.setdefault(token, dict())[state] = word.Lemma().lower()
                    key = token

                self._reducedtokens[key] = self._reducedtokens.get(key, 0) + 1

        self._processed = True

    def States(self, include_initial: bool = False, include_last: bool = False):
        '''
        Returns a new dictionary with the states of the corpus and their counts.
        Parameters:
        -include_initial: includes q0 (counted once per sentence of the corpus)
        -include_last : includes qF (counted once per sentence of the corpus)
        '''
        self._ensure_processed()

        states_copy = dict()
        #Adding q0
        if include_initial:
            states_copy[self._q0] = len(self._corpus)

        states_copy.update(self._states)
        #Adding qF
        if include_last:
            states_copy[self._qF] = len(self._corpus)

        return states_copy

    def Tokens(self):
        '''
        Returns a copy of the token counts of the corpus (lower-cased, unfiltered)
        '''
        self._ensure_processed()
        return self._tokens.copy()

    def ReducedTokens(self):
        '''
        Returns a copy of the filtered token counts
        '''
        self._ensure_processed()
        return self._reducedtokens.copy()

    def TransitionProbabilities(self):
        '''
        Calculates the probabilities of transition.

        Returns a copy of a DataFrame whose rows are origin states and whose
        columns are destination states (q0 and qF included). Each cell is the
        number of origin->destination transitions divided by the count of the
        origin state. An empty sentence counts as a direct q0->qF transition.
        '''
        if len(self._prob_trans) != 0:
            return self._prob_trans.copy()

        total_states = self.States(include_initial=True, include_last=True)
        state_names = list(total_states)

        #Transition counts, indexed as [origin][destination]
        transition_count = {origin: dict.fromkeys(state_names, 0) for origin in state_names}

        for sentence in self._corpus:
            previous_state = self._q0
            for word in sentence:
                current_state = self.state_dictionary[word.Tag()[0]]
                transition_count[previous_state][current_state] += 1
                previous_state = current_state
            #Closing the sentence from its last state (q0 itself if the sentence is empty)
            transition_count[previous_state][self._qF] += 1

        #Outer keys become the columns (destination), inner keys the rows (origin).
        #A zero total only happens for q0/qF on an empty corpus; the row is then all zeros.
        prob_trans_dict = {
            destination: {
                origin: (transition_count[origin][destination] / total_states[origin]
                         if total_states[origin] else 0.0)
                for origin in state_names
            }
            for destination in state_names
        }

        self._prob_trans = pd.DataFrame.from_dict(prob_trans_dict)

        return self._prob_trans.copy()

    def EmissionProbabilities(self):
        '''
        Calculates the probabilities of emission.

        Returns a copy of a DataFrame whose rows are states and whose columns are
        the filtered tokens. Each cell is the number of times the state emitted
        the token divided by the count of the state. Every word whose token is not
        part of the filtered tokens is counted under the "tokenseliminados" column,
        which is appended when the filtered tokens do not already contain it.
        '''
        if len(self._prob_obs) != 0:
            return self._prob_obs.copy()

        total_states = self.States()
        vocabulary = list(self.ReducedTokens())
        known_tokens = set(vocabulary)
        removed_key = "tokenseliminados"

        #Sparse emission counts, indexed as [state][token]
        observation_count = {state: dict() for state in total_states}

        for sentence in self._corpus:
            for word in sentence:
                state = self.state_dictionary[word.Tag()[0]]
                token = word.Token().lower()
                if token not in known_tokens:
                    token = removed_key
                    if removed_key not in known_tokens:
                        #Filtered words exist but no compound token created the bucket yet
                        known_tokens.add(removed_key)
                        vocabulary.append(removed_key)
                state_counts = observation_count[state]
                state_counts[token] = state_counts.get(token, 0) + 1

        #Outer keys become the columns (tokens), inner keys the rows (states)
        prob_obs_dict = {
            token: {
                state: observation_count[state].get(token, 0) / total_states[state]
                for state in total_states
            }
            for token in vocabulary
        }

        self._prob_obs = pd.DataFrame.from_dict(prob_obs_dict)

        return self._prob_obs.copy()

    def __Corpus(self):
        '''
        Returns a shallow copy of the corpus that has been processed
        '''
        return self._corpus.copy()

    def __InitialState(self):
        '''
        Returns the initial state
        '''
        return self._q0

    def __FinalState(self):
        '''
        Returns the final state
        '''
        return self._qF


# Obtención de las tablas de probabilidad

En esta sección, se iterará por el número de ficheros disponibles de Wikicorpus, se calcularán sus probabilidades y se almacenarán en la lista denominada HMM. A partir de esta lista, se obtendrán las tablas finales de emisión y transición en función de la aparición de las palabras en cada fichero (mediante una media relativa al número de éstas)

In [4]:
HMM = list()
DictLemmas = dict()
RemovedLemmaDict = dict()
#Reading the files. The names are sorted so that the processing order
#(and therefore the content of HMM[0]) does not depend on the file system.
for document in sorted(os.listdir("Wikicorpus")):
    if document == ".ipynb_checkpoints":
        continue

    print(document)
    actual_sentence = list()
    corpus = list()

    #The context manager closes the file even if parsing fails half-way
    with open("Wikicorpus/"+document, "r", encoding="ISO-8859-1") as wikicorpus_file:
        for line in wikicorpus_file:
            line = line.split()

            #A blank line ends a sentence; so does a document boundary,
            #otherwise a sentence could run across two documents.
            if len(line) == 0 or line[0] == '<doc' or line[0] == '</doc>':
                if len(actual_sentence) > 0:
                    corpus.append(actual_sentence)
                    actual_sentence = list()
                continue

            if len(line) < 3:
                #Malformed line (token, lemma and tag are required): reported and skipped
                print(line)
                continue

            actual_sentence.append(Word(token=line[0], tag=line[2], lemma=line[1]))
            #Diagnostic output for lines whose columns look misaligned
            if line[2] == 'mito' or len(line) > 4:
                print(line)

    #A sentence still open at the end of the file is kept as well
    if len(actual_sentence) > 0:
        corpus.append(actual_sentence)

    #Calculating the HMM of the file
    hmmbigram = HMMBigram(corpus, DictLemmas, RemovedLemmaDict)
    hmmbigram.CorpusProcessing()
    #Getting the dictionary of lemmas and the dictionary of removed lemmas
    DictLemmas = hmmbigram.LemmaDictionaries()
    RemovedLemmaDict = hmmbigram.RemovedLemmaDict()
    #Getting the probability tables and saving them per file
    prob_transition = hmmbigram.TransitionProbabilities()
    prob_transition.to_excel("Desktop/docs/"+document+'resultados_trans.xlsx', sheet_name='prob_trans')
    #The emission table is transposed (tokens as rows) only for the spreadsheet
    prob_emission = hmmbigram.EmissionProbabilities().T
    prob_emission.to_excel("Desktop/docs/"+document+'resultados_emision.xlsx', sheet_name='prob_emission')
    #Storing the probability tables in the HMM list (emission back with states as rows)
    HMM.append(HMM_Probabilities(prob_transition, prob_emission.T))

    #Dropping the per-file objects before the next file is loaded
    del hmmbigram
    del prob_transition
    del prob_emission

SE_110k_115k.txt
SE_180k_185k.txt
SE_185k_190k.txt
SE_200k_205k.txt
SE_225k_230k.txt
SE_230k_235k.txt
SE_25k_30k.txt
SE_260k_265k.txt
SE_285k_290k.txt
SE_305k_310k.txt
SE_310k_315k.txt
SE_315k_320k.txt
SE_320k_325k.txt
SE_330k_335k.txt
SE_335k_340k.txt
SE_340k_345k.txt
SE_345k_350k.txt
SE_355k_360k.txt
SE_360k_365k.txt
SE_365k_370k.txt
SE_370k_375k.txt
SE_375k_380k.txt
SE_380k_385k.txt
SE_385k_390k.txt
SE_390k_395k.txt
SE_405k_410k.txt
SE_425k_430k.txt
SE_430k_435k.txt
SE_435k_440k.txt
SE_440_445k.txt
SE_470k_475k.txt
SE_90k_95k.txt


A continuación, se muestra un ejemplo de la lista HMM, mostrando las tablas de transición y emisión guardadas

In [5]:
# Display the transition table stored for the first file, styling each cell with non_zero_green.
# NOTE(review): recent pandas releases appear to rename Styler.applymap to Styler.map - confirm against the installed version.
HMM[0].PTrans().style.applymap(non_zero_green)

Unnamed: 0,q0,Determiner,Noun,Verb,Adposition,Adjective,Punctuation,Conjunction,Pronoun,Adverb,Number,Date,Interjection,qF
q0,0.0,0.24916,0.318593,0.109062,0.171052,0.004547,0.010995,0.028374,0.035704,0.055023,0.015718,0.001559,0.000213,0.0
Determiner,0.0,0.011758,0.808015,0.013182,0.009249,0.082384,0.004906,0.0011,0.025491,0.006354,0.017866,0.019659,3.6e-05,0.0
Noun,0.0,0.010561,0.048491,0.088396,0.245157,0.111023,0.363798,0.067022,0.033353,0.018431,0.012697,0.000488,0.000582,0.0
Verb,0.0,0.227699,0.086812,0.140377,0.295037,0.030314,0.064043,0.052566,0.040454,0.04996,0.012497,0.000121,0.000121,0.0
Adposition,0.0,0.51215,0.313212,0.050803,0.002159,0.019188,0.005896,0.006043,0.020845,0.007539,0.056865,0.005251,4.8e-05,0.0
Adjective,0.0,0.011215,0.25758,0.065085,0.221025,0.0219,0.294795,0.084907,0.029432,0.010011,0.003562,0.000459,2.9e-05,0.0
Punctuation,0.0,0.057034,0.251712,0.069913,0.071242,0.012565,0.102883,0.054243,0.031118,0.019076,0.055198,0.003539,0.000751,0.270725
Conjunction,0.0,0.216529,0.291573,0.151233,0.105575,0.052964,0.025285,0.011827,0.062673,0.057161,0.02354,0.001536,0.000105,0.0
Pronoun,0.0,0.036758,0.025993,0.627845,0.090202,0.009408,0.047187,0.013894,0.098533,0.048465,0.001615,4.5e-05,5.6e-05,0.0
Adverb,0.0,0.073572,0.053709,0.277721,0.149489,0.144527,0.137833,0.032538,0.064719,0.054338,0.011197,5.1e-05,0.000306,0.0


In [6]:
# Display the emission table stored for the first file.
HMM[0].PObs()

Unnamed: 0,la,es,una,institución,de,educación,superior,inspiración,cristiana,",",...,espiritu,desalineación,neandertales,cro-magnon,cromañón,prognatismo,disimular,masticación,pasapurés,criba
Determiner,0.250913,0.0,0.054342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Noun,0.001173,7.9e-05,0.0,0.000247,9.9e-05,0.000426,6e-06,6.5e-05,2e-06,0.0,...,2e-06,2e-06,2e-06,2e-06,2e-06,5e-06,0.0,2e-06,5e-06,3e-06
Verb,0.0,0.059271,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4e-06,0.0,0.0,4e-06,0.0,0.0,0.0
Adposition,0.0,0.0,0.0,0.0,0.42612,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adjective,0.0,0.0,0.0,0.0,0.0,0.0,0.002114,0.0,0.000537,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Punctuation,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.343961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Conjunction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Pronoun,0.015037,0.0,0.00111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Adverb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Number,0.0,0.0,0.000567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Se extrae la información de transición de cada fichero. Como en todos los ficheros los estados de inicio de transición y fin de transición son los mismos, se puede realizar una media de los valores para obtener la tabla de probabilidades de transición definitiva. Una vez creada, se guarda como xlsx.

In [7]:
# Average of the per-file transition tables.
# DataFrame.add with fill_value=0 treats a state that is missing from one of
# the two tables as contributing 0, whereas a plain `+` would turn every cell
# of that state into NaN for the rest of the accumulation.
PTrans_Total = HMM[0].PTrans()
for hmm_file in HMM[1:]:
    PTrans_Total = PTrans_Total.add(hmm_file.PTrans(), fill_value=0)

cnt_row = len(HMM)
# fillna only touches cells that no table defined at all after alignment
PTrans_Total = PTrans_Total.fillna(0) / cnt_row
PTrans_Total.to_excel('Desktop/docs/Total_Transmision.xlsx', sheet_name='prob_trans')
del PTrans_Total
del cnt_row
  

Se repite el mismo procedimiento para las tablas de emisión. En este caso, los estados de cada fichero serán los mismos, pero no los tokens. Por tanto, se deberá almacenar la frecuencia de aparición de éstos en una matriz auxiliar denominada *divider*, a partir de la que se podrá efectuar la media relativa a cada token en cada caso.

In [8]:
# Sum of the per-file emission tables, column by column (one column per token).
# divider[token] holds the number of files in which the token appears; tokens
# that never get an entry were seen only in the first file.
PObs_Total = HMM[0].PObs().copy()
divider = dict()
firstone = 0
# Plain set of the columns already present, so membership is tested without
# relying on exceptions raised by the DataFrame lookup
known_cols = set(PObs_Total.columns.values)

for hmm in HMM:
    # The first table is already inside PObs_Total
    if firstone:
        file_table = hmm.PObs()
        for cols in file_table.columns.values:
            if cols in known_cols:
                PObs_Total[cols] = PObs_Total[cols] + file_table[cols]
                # A missing entry means a single previous appearance (first file)
                divider[cols] = divider.get(cols, 1) + 1
            else:
                PObs_Total[cols] = file_table[cols]
                known_cols.add(cols)
                # First appearance of the token: exactly one file so far
                # (this used to start at 2, halving the resulting average)
                divider[cols] = 1
    firstone = 1

  

Se almacena la tabla de probabilidad de emisión como xlsx

In [9]:
# NOTE(review): this bare expression is not the last statement of the cell, so it presumably shows nothing - confirm whether a display was intended.
PObs_Total
# The per-file tables and the loop flag are no longer needed once PObs_Total is built
del HMM
del firstone

In [10]:
# Save the summed (not yet averaged) emission table, transposed before writing.
PObs_Total.T.to_excel('Desktop/docs/Total_emision.xlsx', sheet_name='prob_emission')

Se calcula la mencionada media relativa a cada token

In [11]:
# Relative mean: every token column is divided by its own entry in divider.
# Tokens without an entry are divided by 1, i.e. left unchanged.
# The division is done for all columns at once instead of inserting the
# columns one by one into an empty DataFrame.
token_counts = pd.Series(divider, dtype="float64").reindex(PObs_Total.columns, fill_value=1.0)
PObs_TotalDIV = PObs_Total.div(token_counts, axis="columns")

Se obtiene y se guarda la tabla de emisión definitiva creada

In [12]:
# Save the averaged emission table, transposed before writing.
PObs_TotalDIV.T.to_excel('Desktop/docs/Total_Emission.xlsx', sheet_name='prob_obs') 

In [13]:
# Persist the per-token divider with NumPy.
# NOTE(review): divider is a dict, so NumPy presumably stores it as a pickled object array and loading it would need np.load(..., allow_pickle=True) - confirm.
np.save('Desktop/docs/total_divider', divider)

Se guarda el diccionario de lemas y el de lemas eliminados

In [14]:
# Persist the dictionary of lemmas with NumPy.
# NOTE(review): DictLemmas is a dict, so loading presumably needs np.load(..., allow_pickle=True) - confirm.
np.save('Desktop/docs/LemmaDictionaries', DictLemmas)

In [15]:
# Persist the dictionary of removed lemmas with NumPy.
# NOTE(review): RemovedLemmaDict is a dict, so loading presumably needs np.load(..., allow_pickle=True) - confirm.
np.save('Desktop/docs/DiccionarioEliminar', RemovedLemmaDict)
# The emission tables and the divider have all been written to disk above
del PObs_TotalDIV
del PObs_Total
del divider

Por último, si se deseara, se podría filtrar el contenido del diccionario de lemas para asegurar que no aparezcan ciertas palabras del diccionario de lemas eliminados. Sin embargo, esta aproximación no se contemplará para el proyecto; por este motivo, la celda en la que se guarda el diccionario filtrado aparece comentada.

In [16]:
# Number of distinct tokens in the dictionary of removed lemmas.
len(RemovedLemmaDict)

1718284

In [17]:
# Number of distinct tokens in the dictionary of lemmas before filtering.
len(DictLemmas)

385599

In [18]:
# Drop from the dictionary of lemmas every token that is also a key of the
# dictionary of removed lemmas; tokens absent from DictLemmas are ignored.
for clave in RemovedLemmaDict:
    DictLemmas.pop(clave, None)

In [19]:
# Number of distinct tokens left in the dictionary of lemmas after filtering.
len(DictLemmas)

299878

In [20]:
#np.save('Desktop/docs/LemmaDictionariesDef', DictLemmas)