In [2]:
import numpy as np
import pandas as pd

## Process File Functions

In [3]:
def get_data(filename):
    f = open(filename,'r')
    lines = f.readlines()
    datas = []
    
    start = 0
    for i in range(len(lines)):
        if lines[i] == '\n':
            datas.append(lines[start:i])
            start = i+1
        lines[i] = lines[i].replace('\n','')
        lines[i] = tuple(lines[i].split(' '))
        
    # check formatting
    for i in range(len(datas)):
        for j in range(len(datas[i])):
            #print datas[i][j]
            assert len(datas[i][j])==2
            
    
    for i in range(len(datas)):
        data = datas[i]
        x = [word[0] for word in data]
        y = [word[1] for word in data]
        datas[i] = [x,y]
    
    all_x = []
    for i in range(len(datas)):
        for j in range(len(datas[i][0])):
            all_x.append(datas[i][0][j])
    x_set = frozenset(all_x)
    
    all_y = []
    for i in range(len(datas)):
        for j in range(len(datas[i][0])):
            all_y.append(datas[i][1][j])
    y_set = frozenset(all_y)
    
    return dict(data=datas,x_set=x_set,y_set=y_set)
            

## Get data from file

In [4]:
data_dict = get_data('EN/train')

### - Write a function that estimates the emission parameters from the training set using MLE
(maximum likelihood estimation):

In [5]:
def get_emission_counts(data_dict):
    """
    returns (DataFrame,Series) 
    an emission count (y->x) DataFrame and y count Series
    """
    data = data_dict['data']
    x_set = data_dict['x_set']
    y_set = data_dict['y_set']
    count_em_df = pd.DataFrame(np.zeros((len(x_set),len(y_set))),index=x_set,columns=y_set)
    count_y = pd.Series(np.zeros(len(y_set)),index=y_set)

    for instance in data:
        x_vector,y_vector = instance
        for i in range(len(x_vector)):
            x,y = x_vector[i],y_vector[i]
            count_em_df.loc[x,y]+=1
            count_y[y]+=1
    return count_em_df,count_y

def get_emission_params(data_dict):
    """
    returns DataFrame representing conditional probabilities P(y|x)
    """
    count_em_df,count_y = get_emission_counts(data_dict)
    return count_em_df/count_y



In [6]:

em_df = get_emission_params(data_dict)
em_df.head()


Unnamed: 0,B-neutral,B-positive,O,B-negative,I-neutral,I-positive,I-negative
limited,0.0,0.0,8.3e-05,0.0,0.0,0.0,0.0
too--but,0.0,0.0,4.1e-05,0.0,0.0,0.0,0.0
unpretentious,0.0,0.0,8.3e-05,0.0,0.0,0.0,0.0
four,0.0,0.000828,0.000248,0.0,0.0,0.0,0.0
Dining,0.0,0.000828,0.0,0.0,0.0,0.0,0.0


In [7]:
em_df.sum(axis=0)


B-neutral     1.0
B-positive    1.0
O             1.0
B-negative    1.0
I-neutral     1.0
I-positive    1.0
I-negative    1.0
dtype: float64

### (10 pts) One problem with estimating the emission parameters is that some words that appear in the test set do not appear in the training set. One simple idea to handle this issue is as follows. First, replace those words that appear less than k times in the training set with a special token #UNK# before training. This leads to a “modified training set”. We then use such a modified training set to train our model.
### During the testing phase, if the word does not appear in the “modified training set”, we replace that word with #UNK# as well.
### Set k to 3, implement this fix into your function for computing the emission parameters.

In [8]:
def get_modified_counts(data_dict,k):
    count_em_df,count_y = get_emission_counts(data_dict)
    
    counts_x = count_em_df.sum(axis=1)
    fail = counts_x[counts_x<k]

    unk = count_em_df.loc[fail.index].sum(axis=0)
    unk.name = '#UNK#'
   
    modified_df = count_em_df.append(unk)
    modified_df = modified_df.drop(fail.index, axis=0) 
    
    return modified_df,count_y


def get_modified_emission_params(data_dict,k=3):
    """
    returns DataFrame representing conditional probabilities P(y|x)
    """
    count_em_df,count_y = get_modified_counts(data_dict,k)
    return count_em_df/count_y


In [9]:
modified_em_params = get_modified_emission_params(data_dict)
modified_em_params.sum(axis=0)

B-neutral     1.0
B-positive    1.0
O             1.0
B-negative    1.0
I-neutral     1.0
I-positive    1.0
I-negative    1.0
dtype: float64

In [10]:
modified_em_params.tail()


Unnamed: 0,B-neutral,B-positive,O,B-negative,I-neutral,I-positive,I-negative
–,0.0,0.0,0.002393,0.0,0.0,0.0,0.0
',0.0,0.0,0.00066,0.0,0.0,0.0,0.007519
’,0.0,0.0,0.000578,0.0,0.043478,0.003295,0.0
portion,0.0,0.000828,8.3e-05,0.0,0.0,0.0,0.0
#UNK#,0.169231,0.24255,0.116492,0.183246,0.217391,0.347611,0.255639


In [11]:
em_df.loc[['four','NYC']]

Unnamed: 0,B-neutral,B-positive,O,B-negative,I-neutral,I-positive,I-negative
four,0.0,0.000828,0.000248,0.0,0.0,0.0,0.0
NYC,0.0,0.0,0.000578,0.0,0.043478,0.0,0.0


### (10 pts) Implement a simple sentiment analysis system that produces the tag
```y = argmax_y e(x|y) ```
### for each word x in the sequence.

In [12]:
def train(filename,k=3):
    data_dict = get_data(filename)
    return get_modified_emission_params(data_dict,k=k)

def argmax_y(emission_params,x):
    # check if x in trained x's 
    if x not in emission_params.index:
        x = '#UNK#'
    p = emission_params.loc[x,:]
    
    max_p = None
    for col in p.index:
        if max_p is None:
            max_p = p.loc[col]
            y = col
        elif p.loc[col]>max_p:
            max_p = p.loc[col]
            y = col
    return y

def decode(filename,emission_params,outfile):
    f = open(filename,'r')
    lines = f.readlines()
    lines = [line.replace('\n','') for line in lines]
    #print lines
    
    for i in range(len(lines)):
        line = lines[i]
        if line != '':
            line = line +' '+argmax_y(emission_params,line)
        line += '\n'
        
        lines[i] = line
        
    fout = open(outfile,'w')
    for line in lines:
        fout.write(line)
    fout.close()
    print "decoding completed"
            

In [19]:
word = 'NYC'
try:
    print modified_em_params.loc[word]
except:
    word = '#UNK#'
    print modified_em_params.loc[word]
argmax_y(modified_em_params,word)

B-neutral     0.000000
B-positive    0.000000
O             0.000578
B-negative    0.000000
I-neutral     0.043478
I-positive    0.000000
I-negative    0.000000
Name: NYC, dtype: float64


'I-neutral'

## Training and Decoding on EN data Results

In [14]:
emission_params = train('EN/train')
decode('EN/dev.in',emission_params,'EN/dev.p2.out')

decoding completed


```
>python3 evalResult.py EN/dev.out EN/dev.p2.out

#Entity in gold data: 226
#Entity in prediction: 1201

#Correct Entity : 165
Entity  precision: 0.1374
Entity  recall: 0.7301
Entity  F: 0.2313

#Correct Sentiment : 71
Sentiment  precision: 0.0591
Sentiment  recall: 0.3142
Sentiment  F: 0.0995
```

## Training and Decoding on CN data Results

In [16]:
emission_params = train('CN/train')
decode('CN/dev.in',emission_params,'CN/dev.p2.out')

decoding completed


```
>python3 evalResult.py CN/dev.out CN/dev.p2.out

#Entity in gold data: 362
#Entity in prediction: 3318

#Correct Entity : 183
Entity  precision: 0.0552
Entity  recall: 0.5055
Entity  F: 0.0995

#Correct Sentiment : 57
Sentiment  precision: 0.0172
Sentiment  recall: 0.1575
Sentiment  F: 0.0310
```

## Training and Decoding on FR data Results

In [17]:
emission_params = train('FR/train')
decode('FR/dev.in',emission_params,'FR/dev.p2.out')

decoding completed


```
>python3 evalResult.py FR/dev.out FR/dev.p2.out

#Entity in gold data: 223
#Entity in prediction: 1149

#Correct Entity : 182
Entity  precision: 0.1584
Entity  recall: 0.8161
Entity  F: 0.2653

#Correct Sentiment : 68
Sentiment  precision: 0.0592
Sentiment  recall: 0.3049
Sentiment  F: 0.0991
```

## Training and Decoding on SG data Results

In [22]:
emission_params = train('SG/train')
decode('SG/dev.in',emission_params,'SG/dev.p2.out')

decoding completed


```
>python3 evalResult.py SG/dev.out SG/dev.p2.out

#Entity in gold data: 1382
#Entity in prediction: 6599

#Correct Entity : 794
Entity  precision: 0.1203
Entity  recall: 0.5745
Entity  F: 0.1990

#Correct Sentiment : 315
Sentiment  precision: 0.0477
Sentiment  recall: 0.2279
Sentiment  F: 0.0789
```

In [26]:
import subprocess
p = subprocess.Popen(['python3','evalResult.py','SG/dev.out','SG/dev.p2.out'], stdout=subprocess.PIPE)
print p.communicate()[0]


#Entity in gold data: 1382
#Entity in prediction: 6599

#Correct Entity : 794
Entity  precision: 0.1203
Entity  recall: 0.5745
Entity  F: 0.1990

#Correct Sentiment : 315
Sentiment  precision: 0.0477
Sentiment  recall: 0.2279
Sentiment  F: 0.0789

