In [28]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from collections import Counter

pd.set_option('display.max_columns', None)
EN_train = 'EN/train'
SG_train = 'SG/train'
CN_train = 'CN/train'
EN_test = 'EN/dev.in'
SG_test = 'SG/dev.in'
CN_test = 'CN/dev.in'

In [29]:
def load_train(training_file):
    df = pd.read_csv(training_file, sep=' ', header = None, error_bad_lines=False)
    df.columns=['word','state']
    return df

def load_test(test_file):
    ls = []
    f = open(test_file,encoding="utf8")
    for line in f:
        ls.append(line.strip('\n'))
    df_test = pd.DataFrame(ls)
    df_test.columns=['word']
    return df_test
        
# df_test = load_test(EN_test)
# print(len(df_test))
df_train = load_train(EN_train)
print(len(df_train))
df_train.head(5)

df_test = load_test(EN_test)
print(len(df_test))
df_test.head(5)

181628
27225


Unnamed: 0,word
0,HBO
1,has
2,close
3,to
4,24


In [30]:
def createMatrix(df):
    start = time.time()
    columns = df.word.unique().tolist()
    index = df.state.unique().tolist()
    new_df = pd.DataFrame(columns=columns, index=index)
    print(f'time elapsed {time.time()-start} seconds')
    return new_df
empty_matrix = createMatrix(df_train)    
# emission_matrix.head(1)

time elapsed 0.926966667175293 seconds


In [33]:
print(len(empty_matrix))
print(len(empty_matrix.columns))

21
18212


In [35]:
def emissionMatrix_special(df, emission_matrix):
    k=0.5
    start = time.time()
    df_denominator = df.groupby('state').count()   # getting counts of states
    df_counts = df.groupby(['state','word']).size().reset_index()   # getting counts of every word in each state
    df_merged = df_counts.merge(df_denominator, left_on=['state'], right_on='state')  # merge 
    df_merged = df_merged.rename(columns={"word_x": "word",0:"word_count", "word_y": "state_count"})
    df_merged['Probability'] = df_merged.word_count/(df_merged.state_count+k)    # get emission probability (count of word in that state/ state count)
    for index, row in tqdm(df_merged.iterrows()):  # for every known probabilty
        emission_matrix.loc[row['state'],row['word']] = row['Probability']   # append into the emission matrix
    for i in df_train.state.unique().tolist():
        emission_matrix.loc[i,'#UNK#'] = float(k/df_denominator.loc[i]+k)
    emission_matrix = emission_matrix.fillna(0)   # fill those null cells with zero
    print(f'time elapsed {time.time()-start}')
    return emission_matrix

emission_matrix = emissionMatrix_special(df_train, empty_matrix)
# df_special.head(21)

25051it [00:10, 2489.42it/s]
time elapsed 11.810049057006836


In [16]:
def argmax(df):
    start = time.time()
    tags={}
    for col in df.columns:
        tags[col]=df.index[df[col].argmax()]
    return tags
        
tags = argmax(df_special)

def tag_system(tag_dict, test_df):
    start = time.time()
    test_ls = test_df['word'].tolist()
    tag_states=[]
    for i in test_ls:
        if i in tag_dict.keys():
            tag_states.append(tag_dict[i])
        elif i=="":   # for blank lines, set state to be blank
            tag_states.append("")
        elif i not in tag_dict.keys():
            tag_states.append(tag_dict['#UNK#'])

    test_df['states']=tag_states
    print(f'time elapsed {time.time()-start}')
    return test_df
output = tag_system(tags,df_test)

time elapsed 0.012000560760498047


In [17]:
output.head(50)

Unnamed: 0,word,states
0,Everything,O
1,sounds,O
2,better,O
3,with,O
4,the,O
5,Titanic,I-negative
6,song,I-negative
7,laa,O
8,.,O
9,Hahaha,O


In [20]:
def load_train_trans(training_file):
    f = open(training_file)
    ls_state = ['START']
    for line in f:
        item = line.strip('\n').split(' ')
        if len(item) == 2:
            ls_state.append(item[1])
        elif len(item) < 2:
            ls_state.append('STOP')
            ls_state.append('START')
    ls_state.pop(-1)
    return ls_state

def relation_matrix(temp):
    count = Counter(temp)
    list_key = list(count.keys())
    rls_matrix = pd.DataFrame(columns=list_key, index=list_key)
    for (x, y), c in Counter(zip(temp, temp[1:])).items():
        rls_matrix.loc[[x], [y]] = c/count[x]
    rls_matrix = rls_matrix.fillna(value=0)
    rls_matrix = rls_matrix.drop(columns='START')
    rls_matrix = rls_matrix.drop(index='STOP')
    return rls_matrix

In [21]:
sequence_ls = load_train_trans(EN_train)
transition_matrix = relation_matrix(sequence_ls)

In [26]:
print(transition_matrix.head())

            B-NP      I-NP      B-VP    B-ADVP    B-ADJP  I-ADJP      B-PP  \
START   0.648049  0.000000  0.018661  0.054287  0.003262     0.0  0.108704   
B-NP    0.028898  0.684706  0.130303  0.009809  0.003213     0.0  0.058007   
I-NP    0.047645  0.406679  0.134912  0.015332  0.004103     0.0  0.156509   
B-VP    0.345217  0.000000  0.007229  0.031214  0.039209     0.0  0.098735   
B-ADVP  0.210379  0.000000  0.215989  0.016269  0.016550     0.0  0.170547   

               O      STOP    B-SBAR      I-VP    I-ADVP     B-PRT  I-PP  \
START   0.141850  0.000000  0.022576  0.000000  0.000000  0.000000   0.0   
B-NP    0.080964  0.000233  0.003403  0.000000  0.000000  0.000359   0.0   
I-NP    0.227327  0.000788  0.006375  0.000000  0.000000  0.000128   0.0   
B-VP    0.067411  0.000055  0.025574  0.373912  0.000000  0.011171   0.0   
B-ADVP  0.265358  0.000842  0.016269  0.000000  0.086957  0.000281   0.0   

         B-CONJP  I-CONJP    B-INTJ  I-INTJ  I-SBAR     B-UCP  I-UCP     B

In [None]:
# emission_matrix; transition_matrix

In [36]:
m = open('EN/dev.in', encoding="utf8")
ls=[]
big_ls=[]
for line in m:
    item=line.strip('\n')
    if item=='':
        big_ls.append(ls)
        ls=[]
    elif item!='':
        ls.append(item)

['The',
 'percentage',
 'rates',
 'are',
 'calculated',
 'on',
 'a',
 '360-day',
 'year',
 ',',
 'while',
 'the',
 'coupon-equivalent',
 'yield',
 'is',
 'based',
 'on',
 'a',
 '365-day',
 'year',
 '.']