# Part 1.1: Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation) (5 points)

In [1]:
import os

In [2]:
def estimate_emission_parameters_base(training_set):
    """Estimate the emission parameters e(x|y) by maximum likelihood.

    Every non-empty entry of ``training_set`` is read as
    ``"<observation> <state>"``: the text after the last space is the state
    and everything before it is the observation. Empty entries (the blank
    lines that separate sentences) are skipped.

    Returns a dict mapping ``(observation, state)`` to
    ``count(y->x) / count(y)``.
    """
    # count(y): number of times state y appears
    per_state_total = {}
    # count(y->x): number of times observation x is generated from state y
    pair_total = {}

    for line in training_set:
        # A blank line only separates sentences
        if len(line) == 0:
            continue

        # Split on the LAST space so an observation may itself contain spaces
        observation, _, state = line.rpartition(" ")
        pair = (observation, state)

        pair_total[pair] = pair_total.get(pair, 0) + 1
        per_state_total[state] = per_state_total.get(state, 0) + 1

    # e(x|y) = count(y->x) / count(y)
    return {
        pair: occurrences / per_state_total[pair[1]]
        for pair, occurrences in pair_total.items()
    }

# Part 1.2: Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation), accounting for words that appear in the test set but not in the training set (10 points)

In [3]:
def estimate_emission_parameters(training_set, k_value=1):
    """Estimate smoothed emission parameters e(x|y), including ``#UNK#``.

    Every non-empty entry of ``training_set`` is read as
    ``"<observation> <state>"``: the text after the last space is the state
    and everything before it is the observation. Empty entries (the blank
    lines that separate sentences) are skipped.

    We assume that every state y generates the special observation ``#UNK#``
    as a rare event, and that ``k_value`` occurrences of that event have been
    observed. Hence:

    * ``e(x|y)     = count(y->x) / (count(y) + k_value)`` for trained words
    * ``e(#UNK#|y) = k_value     / (count(y) + k_value)``

    Args:
        training_set: sequence of ``"<observation> <state>"`` strings, with
            ``""`` between sentences.
        k_value: assumed number of ``#UNK#`` emissions per state.

    Returns:
        A tuple ``(estimated_emission_parameters, trained_words)`` where the
        first item maps ``(observation, state)`` to its probability and the
        second is the list of distinct training observations, sorted so the
        result is the same on every run.
    """
    # count(y): number of times state y appears
    state_count = {}
    # count(y->x): number of times observation x is generated from state y
    state_observation_count = {}
    # Distinct observations seen during training
    trained_words = set()

    for line in training_set:
        # A blank line only separates sentences
        if not line:
            continue

        # Split on the LAST space so an observation may itself contain spaces
        observation, _, state = line.rpartition(" ")
        pair = (observation, state)

        state_observation_count[pair] = state_observation_count.get(pair, 0) + 1
        state_count[state] = state_count.get(state, 0) + 1
        trained_words.add(observation)

    estimated_emission_parameters = {}
    for (observation, state), pair_count in state_observation_count.items():
        denominator = state_count[state] + k_value
        estimated_emission_parameters[(observation, state)] = pair_count / denominator
        # Written on every pass (same value each time) so that the #UNK# entry
        # keeps its position right after the first pair seen for this state.
        estimated_emission_parameters[("#UNK#", state)] = k_value / denominator

    # A set has no stable iteration order for strings across runs; sort it so
    # the returned list is reproducible.
    return estimated_emission_parameters, sorted(trained_words)

# Part 1.3: Implementation of simple sentiment analysis system (10 points)

## Train and Evaluate with ES

#### Read ES Train Dataset

In [4]:
# Location of the ES training file, relative to the current working directory
filepath_ES_train = os.path.join(os.getcwd(), 'Data', 'ES', 'train')

# Load every line of the file (line endings still attached)
with open(filepath_ES_train, 'r', encoding='utf-8') as file:
    file_contents_ES_train = list(file)

# Build the training set: strip surrounding whitespace from each line,
# which turns the blank separator lines into ''
es_training_set = list(map(str.strip, file_contents_ES_train))

In [5]:
# Display the stripped training lines ("<word> <tag>", with '' between sentences)
es_training_set

['Estuvimos O',
 'hace O',
 'poco O',
 'mi O',
 'pareja O',
 'y O',
 'yo O',
 'comiendo O',
 'y O',
 'resultó O',
 'todo O',
 'muy O',
 'bien O',
 ', O',
 'tanto O',
 'la O',
 'comida B-positive',
 ', O',
 'el O',
 'vino B-positive',
 ', O',
 'el O',
 'trato B-positive',
 ', O',
 'la O',
 'decoración B-positive',
 '… O',
 'nos O',
 'gustó O',
 'todo O',
 'mucho O',
 '. O',
 '',
 'Por O',
 'poner O',
 'algún O',
 'pero O',
 ', O',
 'quizá O',
 'el O',
 'jamón B-negative',
 'no O',
 'era O',
 'todo O',
 'lo O',
 '" O',
 'ibérico O',
 '" O',
 'que O',
 'cabía O',
 'esperar O',
 '. O',
 '',
 'Bien O',
 'lo O',
 'sabe O',
 'el O',
 'autor O',
 'del O',
 'blog O',
 '. O',
 ') O',
 '',
 'Comida B-positive',
 'exquisita O',
 '. O',
 '',
 'Restaurante B-positive',
 'diferente O',
 ', O',
 'creativo O',
 'y O',
 'agradable O',
 '. O',
 '',
 'Si O',
 'no O',
 'has O',
 'probado O',
 'sus O',
 'carnes B-positive',
 'te O',
 'estas O',
 'perdiendo O',
 'algo O',
 'muy O',
 'grande O',
 '! O',
 '',


#### Learn ES parameters

In [6]:
#Calculate the smoothed emission parameters from the ES training set (default k_value=1);
#the call also returns the list of distinct training words
all_estimated_emission_parameters, trained_words = estimate_emission_parameters(es_training_set)

In [7]:
# Display the learned emission parameters, keyed by (word, tag)
all_estimated_emission_parameters

{('Estuvimos', 'O'): 0.00020664003306240529,
 ('#UNK#', 'O'): 3.444000551040088e-05,
 ('hace', 'O'): 0.0008954401432704229,
 ('poco', 'O'): 0.0018942003030720485,
 ('mi', 'O'): 0.0024796803967488635,
 ('pareja', 'O'): 0.00044772007163521146,
 ('y', 'O'): 0.0352665656426505,
 ('yo', 'O'): 0.0012398401983744318,
 ('comiendo', 'O'): 0.00034440005510400884,
 ('resultó', 'O'): 0.00013776002204160352,
 ('todo', 'O'): 0.0039606006336961016,
 ('muy', 'O'): 0.013638242182118749,
 ('bien', 'O'): 0.005682600909216145,
 (',', 'O'): 0.05730816916930707,
 ('tanto', 'O'): 0.0013431602149056344,
 ('la', 'O'): 0.026002204160352666,
 ('comida', 'B-positive'): 0.14556416881998277,
 ('#UNK#', 'B-positive'): 0.0008613264427217916,
 ('el', 'O'): 0.022110483537677365,
 ('vino', 'B-positive'): 0.00516795865633075,
 ('trato', 'B-positive'): 0.03789836347975883,
 ('decoración', 'B-positive'): 0.006029285099052541,
 ('…', 'O'): 0.0015498002479680396,
 ('nos', 'O'): 0.005028240804518529,
 ('gustó', 'O'): 0.000378

#### Learn ES parameters: Get argmax_y( e(x|y) )

In [8]:
# Decode each word on its own: y* = argmax_y e(x|y).
# Each word maps to [tag, probability] for the tag with the highest emission
# probability; on a tie the tag encountered first is kept (strict '>').
# For '#UNK#', e(#UNK#|y) = k/(count(y)+k), so (for k > 0) the winning tag is
# the one with the smallest training count.
estimated_emission_parameters = {}
for (word, tag), probability in all_estimated_emission_parameters.items():
    current_best = estimated_emission_parameters.get(word)
    # First time we see this word, or this tag beats the stored one
    if current_best is None or probability > current_best[1]:
        estimated_emission_parameters[word] = [tag, probability]

In [9]:
# Display, for each word, the best tag and its emission probability
estimated_emission_parameters

{'Estuvimos': ['O', 0.00020664003306240529],
 '#UNK#': ['I-neutral', 0.022727272727272728],
 'hace': ['O', 0.0008954401432704229],
 'poco': ['O', 0.0018942003030720485],
 'mi': ['O', 0.0024796803967488635],
 'pareja': ['O', 0.00044772007163521146],
 'y': ['O', 0.0352665656426505],
 'yo': ['I-negative', 0.005813953488372093],
 'comiendo': ['O', 0.00034440005510400884],
 'resultó': ['O', 0.00013776002204160352],
 'todo': ['I-negative', 0.005813953488372093],
 'muy': ['O', 0.013638242182118749],
 'bien': ['O', 0.005682600909216145],
 ',': ['O', 0.05730816916930707],
 'tanto': ['O', 0.0013431602149056344],
 'la': ['I-positive', 0.0380952380952381],
 'comida': ['B-positive', 0.14556416881998277],
 'el': ['I-neutral', 0.022727272727272728],
 'vino': ['B-negative', 0.015706806282722512],
 'trato': ['B-positive', 0.03789836347975883],
 'decoración': ['B-positive', 0.006029285099052541],
 '…': ['B-negative', 0.002617801047120419],
 'nos': ['O', 0.005028240804518529],
 'gustó': ['O', 0.000378840

#### Read ES dev.in Dataset

In [10]:
# Location of the ES dev.in file, relative to the current working directory
filepath_ES_devin = os.path.join(os.getcwd(), 'Data', 'ES', 'dev.in')

# Load every line of the file (line endings still attached)
with open(filepath_ES_devin, 'r', encoding='utf-8') as file:
    file_contents_ES_devin = list(file)

# One word per entry; blank separator lines become ''
es_devin = list(map(str.strip, file_contents_ES_devin))

In [11]:
# Display the untagged dev.in words ('' between sentences)
es_devin

['Plato',
 'degustación',
 ':',
 'un',
 'poco',
 'abundante',
 'de',
 'más',
 ',',
 'pero',
 'bien',
 'cocinado',
 '.',
 '',
 'restaurante',
 'excelente',
 'con',
 'carne',
 'de',
 'alta',
 'calidad',
 '.',
 '',
 'Las',
 'posibilidades',
 'en',
 'el',
 'restaurante',
 'son',
 'fundamentalmente',
 'tres',
 ';',
 'carta',
 'normal',
 ',',
 'menú',
 'degustacion',
 'y',
 'una',
 'opción',
 'intermedia',
 'que',
 'es',
 'una',
 'selección',
 'de',
 'primeros',
 'y',
 'postres',
 'y',
 'carta',
 'para',
 'el',
 'segundo',
 '.',
 '',
 'No',
 'perderse',
 'el',
 'sorbete',
 'de',
 'mojito',
 '.',
 '',
 'para',
 'mi',
 'perfecto',
 '!',
 '',
 'Devolucion',
 'a',
 'cocina',
 ',',
 'amabilidad',
 'de',
 'camarera',
 ',',
 'requerimiento',
 'de',
 'cuenta',
 'y',
 'adios',
 '.',
 '',
 'Así',
 'como',
 'el',
 'romesco',
 ',',
 'que',
 'era',
 'un',
 'poco',
 '"',
 'de',
 'bote',
 '"',
 '.',
 '',
 'Destacar',
 'los',
 'arroces',
 ',',
 'la',
 'caldereta',
 'de',
 'bogavante',
 ',',
 'las',
 'zambur

#### Evaluate on ES dev.in

In [12]:
# Tag every ES dev.in word with its most likely label, falling back to the
# label learned for '#UNK#' when the word never appeared in training.
# The tagged list is rebuilt from the raw file lines instead of being edited
# in place, so re-running this cell cannot append a second label to a line.
tagged_lines = []
for raw_line in file_contents_ES_devin:
    word = raw_line.strip()
    if not word:
        # Blank line: sentence separator, kept unchanged
        tagged_lines.append(word)
        continue
    best = estimated_emission_parameters.get(word)
    if best is None:
        # Word not seen in training: use the label for unknown
        best = estimated_emission_parameters["#UNK#"]
    tagged_lines.append(word + " " + best[0])
es_devin = tagged_lines

In [13]:
# Display the tagged dev.in lines ("<word> <predicted tag>")
es_devin

['Plato B-negative',
 'degustación I-positive',
 ': O',
 'un O',
 'poco O',
 'abundante O',
 'de I-positive',
 'más O',
 ', O',
 'pero I-neutral',
 'bien O',
 'cocinado O',
 '. O',
 '',
 'restaurante B-negative',
 'excelente O',
 'con I-neutral',
 'carne B-negative',
 'de I-positive',
 'alta O',
 'calidad I-positive',
 '. O',
 '',
 'Las O',
 'posibilidades I-neutral',
 'en I-neutral',
 'el I-neutral',
 'restaurante B-negative',
 'son O',
 'fundamentalmente I-neutral',
 'tres I-negative',
 '; O',
 'carta B-negative',
 'normal O',
 ', O',
 'menú B-positive',
 'degustacion I-positive',
 'y O',
 'una O',
 'opción O',
 'intermedia I-neutral',
 'que O',
 'es O',
 'una O',
 'selección O',
 'de I-positive',
 'primeros O',
 'y O',
 'postres B-negative',
 'y O',
 'carta B-negative',
 'para O',
 'el I-neutral',
 'segundo B-negative',
 '. O',
 '',
 'No I-positive',
 'perderse I-neutral',
 'el I-neutral',
 'sorbete B-positive',
 'de I-positive',
 'mojito I-neutral',
 '. O',
 '',
 'para O',
 'mi O',

#### Write to dev.p1.out

In [9]:
# Output path for the ES predictions, relative to the current working directory
filepath_dev_p1_out = os.path.join(os.getcwd(), 'Data', 'ES', 'dev.p1.out')

In [10]:
# Write each entry of es_devin on its own line; empty entries become the
# blank lines between sentences
with open(filepath_dev_p1_out, 'w', encoding='utf-8') as file:
    file.writelines(line + '\n' for line in es_devin)

#### Compare dev.p1.out with dev.out for ES

## Train and Evaluate with RU

#### Read RU Train Dataset

In [11]:
# Location of the RU training file, relative to the current working directory
filepath_RU_train = os.path.join(os.getcwd(), 'Data', 'RU', 'train')

# Load every line of the file (line endings still attached)
with open(filepath_RU_train, 'r', encoding='utf-8') as file:
    file_contents_RU_train = list(file)

# Build the training set: strip surrounding whitespace from each line,
# which turns the blank separator lines into ''
ru_training_set = list(map(str.strip, file_contents_RU_train))

#### Learn RU parameters

In [12]:
#Calculate the smoothed emission parameters from the RU training set (default k_value=1);
#this rebinds the names previously holding the ES results
all_estimated_emission_parameters, trained_words = estimate_emission_parameters(ru_training_set)

#### Learn RU parameters: Get argmax_y( e(x|y) )

In [13]:
# Decode each word on its own: y* = argmax_y e(x|y).
# Each word maps to [tag, probability] for the tag with the highest emission
# probability; on a tie the tag encountered first is kept (strict '>').
# For '#UNK#', e(#UNK#|y) = k/(count(y)+k), so (for k > 0) the winning tag is
# the one with the smallest training count.
estimated_emission_parameters = {}
for (word, tag), probability in all_estimated_emission_parameters.items():
    current_best = estimated_emission_parameters.get(word)
    # First time we see this word, or this tag beats the stored one
    if current_best is None or probability > current_best[1]:
        estimated_emission_parameters[word] = [tag, probability]

#### Read RU dev.in Dataset

In [14]:
# Location of the RU dev.in file, relative to the current working directory
filepath_RU_devin = os.path.join(os.getcwd(), 'Data', 'RU', 'dev.in')

# Load every line of the file (line endings still attached)
with open(filepath_RU_devin, 'r', encoding='utf-8') as file:
    file_contents_RU_devin = list(file)

# One word per entry; blank separator lines become ''
ru_devin = list(map(str.strip, file_contents_RU_devin))

#### Evaluate on RU dev.in

In [15]:
# Tag every RU dev.in word with its most likely label, falling back to the
# label learned for '#UNK#' when the word never appeared in training.
# The tagged list is rebuilt from the raw file lines instead of being edited
# in place, so re-running this cell cannot append a second label to a line.
tagged_lines = []
for raw_line in file_contents_RU_devin:
    word = raw_line.strip()
    if not word:
        # Blank line: sentence separator, kept unchanged
        tagged_lines.append(word)
        continue
    best = estimated_emission_parameters.get(word)
    if best is None:
        # Word not seen in training: use the label for unknown
        best = estimated_emission_parameters["#UNK#"]
    tagged_lines.append(word + " " + best[0])
ru_devin = tagged_lines

#### Write to dev.p1.out

In [16]:
# Output path for the RU predictions (rebinds the name used for the ES output path)
filepath_dev_p1_out = os.path.join(os.getcwd(), 'Data', 'RU', 'dev.p1.out')

In [17]:
# Write each entry of ru_devin on its own line; empty entries become the
# blank lines between sentences
with open(filepath_dev_p1_out, 'w', encoding='utf-8') as file:
    file.writelines(line + '\n' for line in ru_devin)

#### Compare dev.p1.out with dev.out for RU