In [8]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import time

pd.set_option('display.max_columns', None)
EN_train = 'EN/train'
SG_train = 'SG/train'
CN_train = 'CN/train'
EN_test = 'EN/dev.in'
SG_test = 'SG/dev.in'
CN_test = 'CN/dev.in'

In [12]:
def load_train(training_file):
    df = pd.read_csv(training_file, sep=' ', header = None, error_bad_lines=False)
    df.columns=['word','state']
    return df

def load_test(test_file):
    ls = []
    f = open(test_file,encoding="utf8")
    for line in f:
        ls.append(line.strip('\n'))
    df_test = pd.DataFrame(ls)
    df_test.columns=['word']
    return df_test
        
# df_test = load_test(EN_test)
# print(len(df_test))
df_train = load_train(SG_train)
print(len(df_train))
df_train.head(5)

df_test = load_test(SG_test)
print(len(df_test))
df_test.head(5)

119595
36841


Unnamed: 0,word
0,Everything
1,sounds
2,better
3,with
4,the


In [13]:
def createMatrix(df):
    start = time.time()
    columns = df.word.unique().tolist()
    index = df.state.unique().tolist()
    new_df = pd.DataFrame(columns=columns, index=index)
    print(f'time elapsed {time.time()-start} seconds')
    return new_df
emission_matrix = createMatrix(df_train)    
# emission_matrix.head(1)

time elapsed 1.3129985332489014 seconds


In [14]:
print(len(emission_matrix))
print(len(emission_matrix.columns))

7
26578


In [15]:
def emissionMatrix_special(df, emission_matrix):
    k=0.5
    start = time.time()
    df_denominator = df.groupby('state').count()   # getting counts of states
    df_counts = df.groupby(['state','word']).size().reset_index()   # getting counts of every word in each state
    df_merged = df_counts.merge(df_denominator, left_on=['state'], right_on='state')  # merge 
    df_merged = df_merged.rename(columns={"word_x": "word",0:"word_count", "word_y": "state_count"})
    df_merged['Probability'] = df_merged.word_count/(df_merged.state_count+k)    # get emission probability (count of word in that state/ state count)
    for index, row in tqdm(df_merged.iterrows()):  # for every known probabilty
        emission_matrix.loc[row['state'],row['word']] = row['Probability']   # append into the emission matrix
    for i in df_train.state.unique().tolist():
        emission_matrix.loc[i,'#UNK#'] = float(k/df_denominator.loc[i]+k)
    emission_matrix = emission_matrix.fillna(0)   # fill those null cells with zero
    print(f'time elapsed {time.time()-start}')
    return emission_matrix

df_special = emissionMatrix_special(df_train, emission_matrix)
# df_special.head(21)

30508it [00:06, 4610.55it/s]
time elapsed 8.913762092590332


In [16]:
def argmax(df):
    start = time.time()
    tags={}
    for col in df.columns:
        tags[col]=df.index[df[col].argmax()]
    return tags
        
tags = argmax(df_special)

def tag_system(tag_dict, test_df):
    start = time.time()
    test_ls = test_df['word'].tolist()
    tag_states=[]
    for i in test_ls:
        if i in tag_dict.keys():
            tag_states.append(tag_dict[i])
        elif i=="":   # for blank lines, set state to be blank
            tag_states.append("")
        elif i not in tag_dict.keys():
            tag_states.append(tag_dict['#UNK#'])

    test_df['states']=tag_states
    print(f'time elapsed {time.time()-start}')
    return test_df
output = tag_system(tags,df_test)

time elapsed 0.012000560760498047


In [17]:
output.head(50)

Unnamed: 0,word,states
0,Everything,O
1,sounds,O
2,better,O
3,with,O
4,the,O
5,Titanic,I-negative
6,song,I-negative
7,laa,O
8,.,O
9,Hahaha,O
