In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from collections import Counter
import matplotlib.pyplot

pd.set_option('display.max_columns', None)
EN_train = 'EN/train'
SG_train = 'SG/train'
CN_train = 'CN/train'
EN_test = 'EN/dev.in'
SG_test = 'SG/dev.in'
CN_test = 'CN/dev.in'

In [2]:
7//3

2

In [3]:
def load_train(training_file):
    df = pd.read_csv(training_file, sep=' ', header = None, error_bad_lines=False)
    df.columns=['word','state']
    return df

def load_test(test_file):
    ls = []
    f = open(test_file,encoding="utf8")
    for line in f:
        ls.append(line.strip('\n'))
    df_test = pd.DataFrame(ls)
    df_test.columns=['word']
    return df_test
        
# df_test = load_test(EN_test)
# print(len(df_test))
df_train = load_train(EN_train)
print(len(df_train))
df_train.head(5)

df_test = load_test(EN_test)
print(len(df_test))
df_test.head(5)

181628
27225


Unnamed: 0,word
0,HBO
1,has
2,close
3,to
4,24


In [4]:
def createMatrix(df):
    start = time.time()
    columns = df.word.unique().tolist()
    index = df.state.unique().tolist()
    new_df = pd.DataFrame(columns=columns, index=index)
    print(f'time elapsed {time.time()-start} seconds')
    return new_df
empty_matrix = createMatrix(df_train)    
# emission_matrix.head(1)

time elapsed 0.6731152534484863 seconds


In [5]:
print(len(empty_matrix))
print(len(empty_matrix.columns))

21
18212


In [6]:
def emissionMatrix_special(df, emission_matrix):
    k=0.5
    start = time.time()
    df_denominator = df.groupby('state').count()   # getting counts of states
    df_counts = df.groupby(['state','word']).size().reset_index()   # getting counts of every word in each state
    df_merged = df_counts.merge(df_denominator, left_on=['state'], right_on='state')  # merge 
    df_merged = df_merged.rename(columns={"word_x": "word",0:"word_count", "word_y": "state_count"})
    df_merged['Probability'] = df_merged.word_count/(df_merged.state_count+k)    # get emission probability (count of word in that state/ state count)
    for index, row in tqdm(df_merged.iterrows()):  # for every known probabilty
        emission_matrix.loc[row['state'],row['word']] = row['Probability']   # append into the emission matrix
    for i in df_train.state.unique().tolist():
        emission_matrix.loc[i,'#UNK#'] = float(k/df_denominator.loc[i]+k)
    emission_matrix = emission_matrix.fillna(0)   # fill those null cells with zero
    print(f'time elapsed {time.time()-start}')
    return emission_matrix

emission_matrix = emissionMatrix_special(df_train, empty_matrix)

25051it [00:04, 6136.25it/s]
time elapsed 5.726562976837158


In [7]:
def argmax(df):
    start = time.time()
    tags={}
    for col in df.columns:
        tags[col]=[df[col].argmax()]
    return tags
        
tags = argmax(emission_matrix)

def tag_system(tag_dict, test_df):
    start = time.time()
    test_ls = test_df['word'].tolist()
    tag_states=[]
    for i in test_ls:
        if i in tag_dict.keys():
            tag_states.append(tag_dict[i])
        elif i=="":   # for blank lines, set state to be blank
            tag_states.append("")
        elif i not in tag_dict.keys():
            tag_states.append(tag_dict['#UNK#'])

    test_df['states']=tag_states
    print(f'time elapsed {time.time()-start}')
    return test_df
output = tag_system(tags,df_test)


time elapsed 0.01430201530456543


In [8]:
output.head(50)

Unnamed: 0,word,states
0,HBO,[0]
1,has,[2]
2,close,[4]
3,to,[6]
4,24,[1]
5,million,[1]
6,subscribers,[1]
7,to,[6]
8,its,[0]
9,HBO,[0]


In [9]:
def load_train_trans(training_file):
    f = open(training_file)
    ls_state = ['START']
    for line in f:
        item = line.strip('\n').split(' ')
        if len(item) == 2:
            ls_state.append(item[1])
        elif len(item) < 2:
            ls_state.append('STOP')
            ls_state.append('START')
    ls_state.pop(-1)
    return ls_state

def relation_matrix(temp):
    count = Counter(temp)
    list_key = list(count.keys())
    rls_matrix = pd.DataFrame(columns=list_key, index=list_key)
    for (x, y), c in Counter(zip(temp, temp[1:])).items():
        rls_matrix.loc[[x], [y]] = c/count[x]
    rls_matrix = rls_matrix.fillna(value=0)
    rls_matrix = rls_matrix.drop(columns='START')
    rls_matrix = rls_matrix.drop(index='STOP')
    return rls_matrix

In [10]:
sequence_ls = load_train_trans(EN_train)
transition_matrix = relation_matrix(sequence_ls)

In [11]:
print(transition_matrix.head())

            B-NP      I-NP      B-VP    B-ADVP    B-ADJP  I-ADJP      B-PP  \
START   0.648049  0.000000  0.018661  0.054287  0.003262     0.0  0.108704   
B-NP    0.028898  0.684706  0.130303  0.009809  0.003213     0.0  0.058007   
I-NP    0.047645  0.406679  0.134912  0.015332  0.004103     0.0  0.156509   
B-VP    0.345217  0.000000  0.007229  0.031214  0.039209     0.0  0.098735   
B-ADVP  0.210379  0.000000  0.215989  0.016269  0.016550     0.0  0.170547   

               O      STOP    B-SBAR      I-VP    I-ADVP     B-PRT  I-PP  \
START   0.141850  0.000000  0.022576  0.000000  0.000000  0.000000   0.0   
B-NP    0.080964  0.000233  0.003403  0.000000  0.000000  0.000359   0.0   
I-NP    0.227327  0.000788  0.006375  0.000000  0.000000  0.000128   0.0   
B-VP    0.067411  0.000055  0.025574  0.373912  0.000000  0.011171   0.0   
B-ADVP  0.265358  0.000842  0.016269  0.000000  0.086957  0.000281   0.0   

         B-CONJP  I-CONJP    B-INTJ  I-INTJ  I-SBAR     B-UCP  I-UCP     B

In [12]:
# emission_matrix; transition_matrix

In [13]:
m = open('EN/dev.in', encoding="utf8")
ls=[]
big_ls=[]
for line in m:
    item=line.strip('\n')
    if item=='':
        big_ls.append(ls)
        ls=[]
    elif item!='':
        ls.append(item)

In [15]:
def log(x, inf_replace=-1000):
    out = np.log(x)
    out[~np.isfinite(out)] = inf_replace
    return out
logged_emission = log(emission_matrix)
logged_transition = log(transition_matrix)
transition_np = logged_transition.drop(['START']).drop('STOP',axis=1).to_numpy()

# test for one document
tags = argmax(emission_matrix)   # vocab of words
Vertibri = []
document = big_ls[1]
# print(document)
forward_steps = len(document)+1
for i in range(forward_steps):
    if i == 0: # for from START to first layer
        if document[i] in tags.keys():
            layer = [t+e for t,e in zip(logged_transition.loc['START'].drop('STOP'), logged_emission[document[i]])]
        elif document[i] not in tags.keys():
            layer = [t+e for t,e in zip(logged_transition.loc['START'].drop('STOP'), logged_emission['#UNK#'])]
        Vertibri.append(layer)
        print(type(Vertibri[-1]))
    elif i!=0 and i!=forward_steps-1: #not first or last step
        prev_layer_prob = Vertibri[-1]*21
        prev_layer_prob = np.array(prev_layer_prob).reshape(21,21).T
        m = prev_layer_prob + transition_np
        if document[i] in tags.keys():
            emission_ls = logged_emission[document[i]].tolist()*21
            emission_np = np.array(emission_ls).reshape(21,21)
        elif document[i] not in tags.keys():
            emission_ls = logged_emission['#UNK#'].tolist()*21
            emission_np = np.array(emission_ls).reshape(21,21)
        matrix = (m + emission_np)
        layer = np.amax(matrix,0)
        Vertibri.append(layer.tolist())
    elif i == forward_steps-1:
        prev_layer_prob = np.array(Vertibri[-1])
        last = logged_transition.drop('START')['STOP'].tolist()
        layer = prev_layer_prob+last
        Vertibri.append(layer.tolist())

state_order = []
states = emission_matrix.index.tolist()
# Vertibri.pop(0)
for layer in Vertibri:
    position = layer.index(max(layer))
    state_order.append(states[position])
# state_order = []
# states = emission_matrix.index.tolist()
# for layer in Vertibri:
#     position = layer.index[max(layer)]
#     # states = emission_matrix.index.tolist()
#     state_order.append(states[position])
# print(state_order)

<class 'list'>


In [16]:
print(Vertibri.pop(0))
for layer in Vertibri:
    print(layer.index(max(layer)))

[-1.1269149062712336, -1000.69312886269, -4.674406621256191, -3.606340285694397, -6.417859247451664, -1000.6914065360821, -2.91221798459123, -2.6460872352060765, -4.483487962690325, -1000.693048750519, -1000.690396147188, -1000.6910127080313, -1000.688672900165, -8.923956123652886, -1000.6776429940239, -7.296980590553456, -1000.5596157879354, -1000.6725278933573, -1000.0, -1000.4700036292458, -7.470853092860883]
1
1
6
0
1
1
2
1
1
7
7


In [17]:
state_order = []
states = emission_matrix.index.tolist()
Vertibri.pop(0)
for layer in Vertibri:
    position = layer.index(max(layer))
    # states = emission_matrix.index.tolist()
    state_order.append(states[position])
print(state_order)

['I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'I-NP', 'I-NP', 'O', 'O']


In [19]:
prev_layer_prob = np.array(Vertibri[-2])
print(prev_layer_prob)
print(logged_transition.drop('START')['STOP'].tolist())

print(prev_layer_prob + logged_transition.drop('START')['STOP'].tolist() )

[-1055.39380749   -61.88077903 -1054.71560117 -1056.89027143
 -1057.56906696 -2049.28405913 -1054.56711094   -55.358485
 -1057.76789302 -1055.31396004 -2049.1346912  -1058.82462713
 -2048.85586406 -1061.22220023 -2051.08745187 -1063.44959994
 -2052.71247119 -2051.24784746 -2052.71247119 -2052.71247119
 -2052.71247119]
[-8.366476004340175, -7.14642419728779, -9.812522917162035, -7.0803070441802864, -7.467942332285852, -6.352629396319567, -8.433104811033951, -1.1448211692559165, -1000.0, -9.226115291091546, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0, -1000.0]
[-1063.7602835    -69.02720323 -1064.52812409 -1063.97057847
 -1065.03700929 -2055.63668853 -1063.00021575   -56.50330616
 -2057.76789302 -1064.54007533 -3049.1346912  -2058.82462713
 -3048.85586406 -2061.22220023 -3051.08745187 -2063.44959994
 -3052.71247119 -3051.24784746 -3052.71247119 -3052.71247119
 -3052.71247119]


In [20]:
# transition_np = logged_transition.drop(['START']).drop('STOP',axis=1).to_numpy()
# ls = Vertibri[-1]*21
# prev_np = np.array(ls).reshape(21,21).T
# print(prev_np)
# print(Vertibri[-1].tolist()*21)
# ls = logged_emission[document[3]].tolist()*21
# emission_np = np.array(ls).reshape(21,21)
# print(emission_np)

print(logged_transition.drop('START').index.tolist())
print(emission_matrix.index.tolist())

['B-NP', 'I-NP', 'B-VP', 'B-ADVP', 'B-ADJP', 'I-ADJP', 'B-PP', 'O', 'B-SBAR', 'I-VP', 'I-ADVP', 'B-PRT', 'I-PP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-LST']
['B-NP', 'I-NP', 'B-VP', 'B-ADVP', 'B-ADJP', 'I-ADJP', 'B-PP', 'O', 'B-SBAR', 'I-VP', 'I-ADVP', 'B-PRT', 'I-PP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-LST']


In [21]:
# tags = argmax(emission_matrix)   # vocab of words
# Vertibri = []
# document = big_ls[1]
# # print(document)
# forward_steps = len(document)+1
# for i in range(forward_steps):
#     if i == 0: # for from START to first layer
#         if document[i] in tags.keys():
#             layer = [t*e for t,e in zip(transition_matrix.loc['START'].drop('STOP'), emission_matrix[document[i]])]
#         elif document[i] not in tags.keys():
#             layer = [t*e for t,e in zip(transition_matrix.loc['START'].drop('STOP'), emission_matrix['#UNK#'])]
#         Vertibri.append(layer)
#         break
# print(Vertibri[0])
# print(Vertibri[0].index(max(Vertibri[0])))

In [22]:
document = big_ls[1]
len(document)

11

In [23]:
import numpy as np
ls = [1/6]*3 + [0]*3 + [1/16]*3
# print(ls)
prev = np.array(ls).reshape(3,3)

ls1 = [1/6,0,4/6,1/4,0,0,1/8,4/8,1/8]
trans = np.array(ls1).reshape(3,3)

ls2 = [1/6,1/4,1/8]*3
print(len(ls))
em = np.array(ls2).reshape(3,3)
print('prev prob matrix')
print(prev)
print('trans matrix')
print(trans)
print('em matrix')
print(em)

output = np.multiply(prev,trans)
output = np.multiply(output,em)
print('next prob matrix')
print(output)

print(np.amax(output,0))


9
prev prob matrix
[[0.16666667 0.16666667 0.16666667]
 [0.         0.         0.        ]
 [0.0625     0.0625     0.0625    ]]
trans matrix
[[0.16666667 0.         0.66666667]
 [0.25       0.         0.        ]
 [0.125      0.5        0.125     ]]
em matrix
[[0.16666667 0.25       0.125     ]
 [0.16666667 0.25       0.125     ]
 [0.16666667 0.25       0.125     ]]
next prob matrix
[[0.00462963 0.         0.01388889]
 [0.         0.         0.        ]
 [0.00130208 0.0078125  0.00097656]]
[0.00462963 0.0078125  0.01388889]


In [24]:
df = pd.DataFrame(
	[[21, 72, 67],
	[23, 78, 69],
	[32, 74, 56],
	[52, 54, 76]],
	columns=['a', 'b', 'c'])

print('DataFrame\n----------\n', df)

#convert dataframe to numpy array
arr = df.to_numpy()

print('\nNumpy Array\n----------\n', arr)

print(arr.shape)

DataFrame
----------
     a   b   c
0  21  72  67
1  23  78  69
2  32  74  56
3  52  54  76

Numpy Array
----------
 [[21 72 67]
 [23 78 69]
 [32 74 56]
 [52 54 76]]
(4, 3)


In [30]:
df.applymap(lambda x: [x,x,x])

Unnamed: 0,a,b,c
0,"[21, 21, 21]","[72, 72, 72]","[67, 67, 67]"
1,"[23, 23, 23]","[78, 78, 78]","[69, 69, 69]"
2,"[32, 32, 32]","[74, 74, 74]","[56, 56, 56]"
3,"[52, 52, 52]","[54, 54, 54]","[76, 76, 76]"


In [27]:
df

Unnamed: 0,a,b,c
0,21,72,67
1,23,78,69
2,32,74,56
3,52,54,76


In [33]:
x = np.array([[1,2,3], [4,5,6]])
print(x)

[[1 2 3]
 [4 5 6]]


In [38]:
duplicater = lambda t: [x,x,x]
func = np.vectorize(duplicater)
func(x)

SyntaxError: invalid syntax (<ipython-input-38-42b16fc2d200>, line 1)

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
       [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]])

In [51]:
f = lambda x: np.repeat(x,21) 
np.apply_along_axis(f,1,x)

In [55]:
62//3

20

In [58]:
np.argmax([[5,5]])

0