In [177]:
#Part 2 
#(1) Write a function to evaluate the emission parameters based on data set X and Y

#Input: Directory of files (datafile_dir)
#Output: X data set and Y data set (X,Y)
#####################################################################
def get_XY(datafile_dir):
    f = open(datafile_dir)
    f_content = f.read()
    X = []
    Y = []
    xi = []
    yi = []
    
    for data_pair in f_content.split('\n'):
        
        if data_pair == '':
            if xi != []:
                X.append(xi)
                Y.append(yi)
                xi = []
                yi = []
            
        else:
            xij,yij = data_pair.split(" ")
            xi.append(xij)
            yi.append(yij)
            
    return (X,Y)
#####################################################################
#Helper function: Get X sequence from a file
#Input: Directory of a file (datafile_dir)
#Output: Array of X sequences (X)
def get_X(datafile_dir):
    f = open(datafile_dir)
    f_content = f.read()
    X = []
    xi = []
    for data in f_content.split('\n'):
        
        if data == '':
            if (xi != []):
                X.append(xi)
                xi = []
        else:
            xij = data
            xi.append(xij)
    return X


#####################################################################
#Input: Dataset X and Y (X,Y)
#Output: Emission parameters based on Dataset X and Y( em_dic, count_y_dic)
def train_emission_param(data_file_dir):
    X,Y =get_XY(data_file_dir)
    o_unique = []
    T = ['O', 'B-positive', 'I-positive','B-negative','I-negative','B-neutral','I-neutral']
    
    for xi in X:
        
        for o in xi:
            if o not in o_unique:
                o_unique.append(o)
   
    count_y_dic = {'O':0, 'B-positive':0, 'I-positive':0,'B-negative':0,'I-negative':0,'B-neutral':0,'I-neutral':0}
    count_x_y_dic = {}
    em_dic = {}
    for i in range(len(X)):
        xi = X[i]
        yi = Y[i]
        
        for j in range(len(xi)):
            key = (xi[j],yi[j])
            key_deno = yi[j]
            origin = count_y_dic[key_deno] 
            count_y_dic[key_deno] = origin + 1
            
            if key not in count_x_y_dic:
                count_x_y_dic[key] = 1
            else:
                value = count_x_y_dic[key]
                count_x_y_dic[key] = value + 1
   
    for o in o_unique:
        
        for state in T:
            key = (o,state)
            if key not in count_x_y_dic:
                em_dic[key] = 0
            else:
                em_dic[key] =float(count_x_y_dic[key])/float(count_y_dic[state]+1)
    em_dic[1] = count_y_dic
    return (em_dic)
#train_emission_param('CN/train')
        
#####################################################################    
#Part 2 2) Helper function to include non-appeared word in the test set
#Input: emission parameters, and count of y , a new word
#Output: updated emission parameters and count of y
def get_default_parameter(em_dic):
    T = ['O', 'B-positive', 'I-positive','B-negative','I-negative','B-neutral','I-neutral']
    default ={}
    for state in T:
        default[state] = 1/float(em_dic[1][state]+1)
    return (default)
       
    
#####################################################################        

#Part 2 3)

#Helper function: Get the optimal y labels for the x sequence
#Input: evaluation file directory, and trained emission parameters
#Output: predicted y sequence
def get_y_predict(em_dic,x_test):
    
    y_predict = []
    T = ['O', 'B-positive', 'I-positive','B-negative','I-negative','B-neutral','I-neutral']
    
    for xm in x_test:
        ym = []
        
        for xi in xm:
            temp = 0
            yi = 'O'
            
            for state in T:
                if (xi,state) not in em_dic:
                    default = get_default_parameter(em_dic,xi)
                    if default[state]>=temp:
                        temp=default[state]
                        yi = state
                else:
                    if em_dic[(xi,state)] >= temp:
                        temp = em_dic[(xi,state)]
                        yi = state
            #print (xi,yi)
            ym.append(yi)
        y_predict.append(ym) 
    return y_predict



############################################################################
#Function: To write the predictions into a file 
#Input: data_file_dir,devout_dir,devin_dir
def output_prediction(data_file_dir,devin_dir,devout_dir,algo):
    em_dic = train_emission_param(data_file_dir)
    x_test = get_X(devin_dir)
    if algo=='v':
        y_predict =viterbi(data_file_dir,devin_dir)
    else:
        y_predict = get_y_predict(em_dic,x_test)
    f_out = open(devout_dir,'w')
    for i in range(len(x_test)):
        xi = x_test[i] 
        yi = y_predict[i]
        for j in range(len(xi)):
            f_out.write(xi[j]+" "+yi[j]+"\n")
        f_out.write(' \n')
    f_out.close()


#########################Testing PART 2############################################
#output_prediction('SG/train','SG/dev.in','SG/dev.P2test.out','part2')        
     
######################### PART 3###########################################

     
def train_tran_param(data_file_dir):
    X,Y = get_XY(data_file_dir)
    tp_dic = {}
    T = ['START','O', 'B-positive', 'I-positive','B-negative','I-negative','B-neutral','I-neutral','STOP']
    count_y_dic = {'START':0,'O':0, 'B-positive':0, 'I-positive':0,'B-negative':0,'I-negative':0,'B-neutral':0,
                   'I-neutral':0,'STOP':0}
    count_yf_yt = {}
    tp_dic = {}
    for yi in Y:
        count_y_dic['START'] +=1
        yi1 = yi[0]
        key =('START', yi1)
        if key not in count_yf_yt:
            count_yf_yt[('START',yi1)] = 1
        else:
            count_yf_yt[('START',yi1)] += 1
        for f in range(len(yi)-1):
            t = f + 1 
            yf = yi[f]
            yt = yi[t]
            key = (yf,yt)
            if key not in count_yf_yt:
                count_yf_yt[key] = 1
            else:
                 count_yf_yt[key] = count_yf_yt[key] +1
            if yf not in count_y_dic:
                count_y_dic[yf] = 1
            else:
                count_y_dic[yf] +=1
            if t == len(yi):
                count_y_dic[yt] +=1
                key = (yi[t],'STOP')
                if key not in count_yf_yt:
                    count_yf_yt[key] = 1
                else:
                    count_yf_yt[key] +=1
        count_y_dic['STOP'] +=1
         
    for state_from in T[:8]:
        for state_to in T[1:9]:
            key = (state_from,state_to)
            if key not in count_yf_yt:
                tp_dic[key] = 0
            else:
                tp_dic[key] = float(count_yf_yt[key])/float(count_y_dic[state_from])
                print (key,tp_dic[key])
    return tp_dic
#train_tran_param('CN/train')

######################### viterbi ###########################################
def viterbi(data_file_dir,test_data_dir):
    em = train_emission_param(data_file_dir)
    tran = train_tran_param(data_file_dir)
    x_test =get_X(test_data_dir)
    T = ['O', 'B-positive', 'I-positive','B-negative','I-negative','B-neutral','I-neutral']
    y_predict =[]
    for xm in x_test:
        ym = []
        #base case
        temp = []
        for state_first in range(len(T)):
            key_em = (xm[0],T[state_first])
            key_tra = ('START',T[state_first])
            if key_em not in em:
                default = get_default_parameter(em)
                score = 1.0*default[T[state_first]]*tran[key_tra]
            else :
                score = 1.0 * em[key_em]*tran[key_tra]
            element_temp = ('START',state_first,score)
            temp.append(element_temp)
      
        #moving forward recursivly
        ym.append(temp)
        print (0)
        print (ym)
        temp = []
        for i in range(len(xm)-1):
            i=i+1
            for state_to in range(len(T)):
                max_score = float(0)
                max_state_from = 'O'
                for state_from in range(len(T)):
                    key_em = (xm[i],T[state_to])
                    key_tra = (T[state_from],T[state_to])
                   
                    if key_em not in em:
                        default = get_default_parameter(em,key_em[0])
                        score = float(ym[i-1][state_from][2])*float(default[T[state_to]])*float(tran[key_tra])
                    else :
                        score = float(ym[i-1][state_from][2])*float(em[key_em])*float(tran[key_tra])
                    if score >=max_score:
                        max_score = score
                        max_state_from = state_from
                element_temp = (max_state_from,state_to,max_score)
                temp.append(element_temp)
            ym.append(temp)
            print (i)
            print (ym)
            temp = []
        # final case 
        max_score =float(0)
        max_state = 'O'
        for state_from in range(len(T)):
            final_layer = len(xm)
            score = float(ym[final_layer][state_from][2])* float(tran[(T[state_from],'STOP')])
            if score >= max_score:
                max_score = score
                max_state = state_from
        key = (max_state,'STOP',max_score)
        temp.append(key)
        ym.append(temp)
        print ('last:')
        print (ym)
        #backtracking 
        y1 = max_state
        ym_predict_num =[]
        for i in range(len(xm),0,-1):
            y2 = y1
            ym_predict_num.append(y2)
            y1 = ym[i][y2][0]
        ym_predict_lable =[]
        t_dic ={0:'O', 1:'B-positive', 2:'I-positive',3:'B-negative',4:'I-negative',5:'B-neutral',6:'I-neutral'}
        for i in ym_predict_num:
            y = t_dic[i]
            ym_predict_lable.append(y)
        y_predict.append(ym_predict_lable)
    return y_predict

#########################Testing PART 3############################################
#train_tran_param('EN/train') 
#viterbi('EN/train','EN/dev.in') 
output_prediction('EN/train','EN/dev.in','EN/dev.P2.out','part2')
output_prediction('EN/train','EN/dev.in','EN/dev.P3.out','v') 

('START', 'O') 0.7251063829787234
('START', 'B-positive') 0.05191489361702128
('START', 'B-negative') 0.02127659574468085
('START', 'B-neutral') 0.20170212765957446
('O', 'O') 0.9206572196432934
('O', 'B-positive') 0.019056926459809432
('O', 'B-negative') 0.005619350109943806
('O', 'B-neutral') 0.054666503786953334
('B-positive', 'O') 0.5351351351351351
('B-positive', 'B-positive') 0.002702702702702703
('B-positive', 'I-positive') 0.4540540540540541
('B-positive', 'B-negative') 0.002702702702702703
('B-positive', 'B-neutral') 0.005405405405405406
('I-positive', 'O') 0.6995515695067265
('I-positive', 'B-positive') 0.004484304932735426
('I-positive', 'I-positive') 0.2914798206278027
('I-positive', 'B-neutral') 0.004484304932735426
('B-negative', 'O') 0.6206896551724138
('B-negative', 'I-negative') 0.35344827586206895
('B-negative', 'B-neutral') 0.02586206896551724
('I-negative', 'O') 0.7959183673469388
('I-negative', 'I-negative') 0.1836734693877551
('I-negative', 'B-neutral') 0.02040816

IndexError: list index out of range

In [173]:
for i in range(10):
    print (i)

0
1
2
3
4
5
6
7
8
9


In [160]:

T = ['START','O', 'B-positive', 'I-positive','B-negative','I-negative','B-neutral','I-neutral','STOP']
print (T[:8])
print (T[1:9])

['START', 'O', 'B-positive', 'I-positive', 'B-negative', 'I-negative', 'B-neutral']
['O', 'B-positive', 'I-positive', 'B-negative', 'I-negative', 'B-neutral', 'I-neutral', 'STOP']
