# Text segmentation using Hidden Markov Models

In [1]:
import os
import glob
import numpy as np

In [2]:
# Absolute path of the current working directory; every other path is built from it.
ROOT = os.path.abspath('.')

# Initial sub-directory locations.
# NOTE: both names are re-assigned further down in this file (PERL_DIR in the
# "Distribution files" cell, RES_DIR just before the result directory is created),
# so these values only hold until those cells run.
PERL_DIR = os.path.join(ROOT,'PerlScriptAndModel')
RES_DIR = os.path.join(ROOT,'res')

### Coding/Decoding Mails

In [3]:
# Directory holding the encoded mails (*.dat). The path components are passed
# separately so os.path.join picks the right separator on every platform
# (the result is unchanged on Windows).
DATA_DIR = os.path.join(ROOT, 'dat', 'dat')

# Iterate through files and load the text
def files_iter(data_dir, with_name=False):
    """
        Lazily load every '*.dat' file found directly inside a directory.

        Keyword arguments:
            - data_dir: directory containing the '.dat' files
            - with_name: if True, yield (name, data) pairs, where name is the
              file name without its directory and without the '.dat' extension;
              if False, yield only the data
        Yields:
            - the integer array read by np.loadtxt, optionally preceded by the
              file's base name
    """
    # glob.escape keeps special characters in the directory name ('[', '*', '?')
    # from being interpreted as wildcards; os.path.join keeps the pattern portable.
    pattern = os.path.join(glob.escape(data_dir), '*.dat')
    # Sort so that the iteration order is deterministic across runs and platforms.
    for file_path in sorted(glob.glob(pattern)):
        data = np.loadtxt(file_path, dtype=int)
        if with_name:
            # splitext only strips the final extension, so dotted names such as
            # 'a.b.dat' keep their full base name ('a.b').
            name = os.path.splitext(os.path.basename(file_path))[0]
            yield name, data
        else:
            yield data

In [4]:
# Build a generator that yields (mail name, encoded mail) pairs for every
# '.dat' file in DATA_DIR. It is lazy: nothing is read until it is iterated.
mail_iter = files_iter(DATA_DIR, with_name=True)

### Distribution files

In [5]:
# Directory holding the Perl script and the model files. The path components are
# passed separately so os.path.join picks the right separator on every platform
# (the result is unchanged on Windows).
PERL_DIR = os.path.join(ROOT, 'PerlScriptAndModel', 'PerlScriptAndModel')

# Writing a function to get the probability data
def get_emission_prob(perl_dir):
    """
        Load the emission probabilities stored in the file 'P.text'.

        Keyword arguments:
            - perl_dir: directory containing 'P.text'
        Returns:
            - the array parsed from the file by np.loadtxt
    """
    # os.path.join instead of a hard-coded '\\' so this also works outside Windows.
    return np.loadtxt(os.path.join(perl_dir, 'P.text'))

In [6]:
# Inputs to the Viterbi function
# Transition matrix: row i holds the probabilities of moving from state i to
# each state. The second row [0, 1] means that once the chain is in state 2 it
# never returns to state 1.
# NOTE(review): presumably state 1 = mail header and state 2 = mail body - confirm.
trans = np.array([[0.999218078035812, 0.000781921964187974],[0,1]])
# Emission probabilities, loaded from the file 'P.text' in PERL_DIR
emission_prob = get_emission_prob(PERL_DIR)
# State labels
states = [1, 2]
# Initial distribution: the chain starts in state 1 with probability 1
start_prob = np.array([1, 0])

### To implement:

In [46]:
# Viterbi function
def viterbi(obs, states, start_prob, trans, emission_prob):
    """
        Viterbi Algorithm Implementation: most likely hidden-state sequence
        of an HMM for a given observation sequence.

        Keyword arguments:
            - obs: sequence of observations, given as integer row indices
              into emission_prob (a single scalar observation is accepted too)
            - states: list of state labels; states[i] labels model state i
            - start_prob: vector of the initial probabilities (length N)
            - trans: transition matrix, trans[i, j] is the probability of
              moving from state i to state j (shape N x N)
            - emission_prob: emission probability matrix, emission_prob[o, j]
              is the probability of observing o in state j
        Returns:
            - seq: list with one state label per observation
              (an empty list if there is no observation)
    """
    # np.loadtxt returns a 0-d array for a one-value file; normalise so that
    # scalars, lists and arrays are all handled the same way.
    obs = np.atleast_1d(obs)

    T = obs.shape[0]  # Number of observations
    N = len(states)   # Number of model states

    # Nothing to decode: avoid indexing into empty tables below.
    if T == 0:
        return []

    # Avoid underflow: use the logarithm !
    # Avoid 0 in logarithm: use a small constant !
    small = np.finfo(np.float64).tiny

    log_start = np.log(np.asarray(start_prob, dtype=float) + small)
    log_trans = np.log(np.asarray(trans, dtype=float) + small)
    log_emit = np.log(np.asarray(emission_prob, dtype=float) + small)

    # Initialisation
    # log_l[j, t]: best log-likelihood of any path ending in state j at time t
    # bcktr[j, t]: previous state on that best path
    log_l = np.full((N, T), -np.inf)
    bcktr = np.zeros((N, T), dtype=int)

    # Forward loop:
    log_l[:, 0] = log_start + log_emit[obs[0], :]
    for t in range(1, T):
        # scores[i, j]: log-likelihood of being in state i at t-1, moving to
        # state j and emitting obs[t]. Computed once for all (i, j) instead of
        # twice per target state.
        scores = log_l[:, t - 1][:, np.newaxis] + log_trans + log_emit[obs[t], :]
        bcktr[:, t] = np.argmax(scores, axis=0)
        log_l[:, t] = np.max(scores, axis=0)

    # Backward loop: start from the best final state and follow the back-pointers
    best = np.empty(T, dtype=int)
    best[-1] = np.argmax(log_l[:, -1])
    for t in range(T - 1, 0, -1):
        best[t - 1] = bcktr[best[t], t]

    # Map state indices to the caller's labels (identical to "index + 1" for
    # the labels [1, 2, ..., N]).
    return [states[i] for i in best]

In [47]:
RES_DIR = os.path.join(ROOT,'Res')

# Creating a directory to put the result of the viterbi function.
# exist_ok=True makes this a no-op when the directory already exists and avoids
# the race between a separate existence check and the creation.
os.makedirs(RES_DIR, exist_ok=True)
    
# Function that will write a viterbi path for a mail in a dedicated result file
def create_viterbi_path_file(mail_name, viterbi_path, res_dir=None):
    """
        Write a Viterbi path to '<mail_name>_path.txt'.

        Keyword arguments:
            - mail_name: base name of the mail, used to build the file name
            - viterbi_path: iterable of states; each one is converted with
              str() and they are written back to back, with no separator
            - res_dir: output directory; defaults to the module-level RES_DIR
    """
    if res_dir is None:
        res_dir = RES_DIR
    # os.path.join instead of a hard-coded '\\' so this also works outside Windows.
    out_file = os.path.join(res_dir, '{}_path.txt'.format(mail_name))
    with open(out_file, 'w') as f:
        f.write(''.join(str(c) for c in viterbi_path))

In [48]:
# Using our generator, we get the mail names and data.
# The generator is re-created here because a generator can only be consumed once.
mail_iter = files_iter(DATA_DIR, with_name=True)
for name_file, data in mail_iter:
    # Find the most likely state sequence for this mail with viterbi
    viterbi_path = viterbi(data,states,start_prob,trans,emission_prob)
    # Write it to the result file '<name_file>_path.txt' in RES_DIR
    create_viterbi_path_file(name_file, viterbi_path)

### Visualizing segmentation

In [153]:
# Writing a function to go into the directory and execute the perl script "segment.pl" on the mail in the given path
def exec_perl_script(mail, path):
    """
        Run 'perl segment.pl <mail> <path>' from inside PERL_DIR.

        Keyword arguments:
            - mail: first argument handed to segment.pl
            - path: second argument handed to segment.pl
        Returns:
            - res: list of the lines printed by the script (stdout and stderr
              merged, as the IPython '!' capture did)

        NOTE(review): the original used the IPython-only form
        'res = !cd {PERL_DIR}; perl segment.pl {mail} {path}', which returns an
        IPython SList. This returns a plain list of lines instead; confirm that
        no caller relies on SList-specific attributes.
    """
    # Local import so the function is self-contained.
    import subprocess

    # An argument list (no shell) keeps spaces and special characters in the
    # file names from being interpreted by a shell, and cwd= replaces the
    # POSIX-only 'cd ...;' prefix.
    completed = subprocess.run(
        ['perl', 'segment.pl', str(mail), str(path)],
        cwd=PERL_DIR,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    res = completed.stdout.splitlines()
    return res

# Writing a function getting the original mail, the result of viterbi, and applying the segmentation script
# Then putting the result
# NOTE(review): this function is an unfinished skeleton - every `...` below is a
# placeholder and the function currently returns None.
def segment_mail(mail_name, data_dir, output_dir):
    """
        Segment a mail with the visualization script (not implemented yet).

        Keyword arguments:
            - mail_name: name of the mail, appended to ROOT/dat/dat/
            - data_dir: currently unused
            - output_dir: currently unused
    """
    # Get the full path of the mail
    mail = os.path.join(ROOT,'dat/dat/{}'.format(mail_name))
    # Get the full path of the result
    # NOTE(review): this is exactly the same path as `mail`; it presumably should
    # point at the Viterbi result file instead - confirm before implementing.
    path = os.path.join(ROOT,'dat/dat/{}'.format(mail_name))
    # Execute the visualization script
    formatted_mail = ...
    # Get the results
    formatted_mail_text = ...
    # Go through the resulting text until the cutting line
    ...
    # If this was not the last line, return the text cut into two parts: header and body
    ...
    # If not, it's just a header
    ...

In [154]:
# Getting mails names
...
# Call the function and look at the result of segmentation
...