# Term 6 Machine Learning Project

In [1]:
import pandas as pd
import re
import numpy as np
import sys
import math

In [2]:
running = "FR"

filetrain = "{}/train".format(running)
filetest = "{}/dev.in".format(running)

In [3]:
# Used when running on gcolab
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)
# # use /content/gdrive/My Drive/

## Part 2 & 3 Training data Processing

Algorithm:  
for loop tweets, tags:  
add each tag into a dictionary. (key:tags, values:\[words\])

In [4]:
def file_to_df(inputfile):
    """
    Function to process input file to dataframe
    inputfile: File to be processed

    Returns:
    df: Output dataframe
    all_tags: unique tags
    all_words: dictionary of all words with tag as key
    k_of_dict = dictionary of unknown var k
    """

    fin = open(inputfile,encoding="UTF-8")
    rawframe=[]
    all_tags = [] # List holding All unique Tags
    all_words = {} # dictionary of all words with tag as key
    k_of_dict={}
    for line in fin:
        if len(line) == 0: continue
        cols = re.split('\s+(?=\S+$)',line) #Using the last whitespace as separator
        if len(cols) > 1:
            tag = cols[1].strip()
            word = cols[0].strip()
            if tag not in all_tags:
                all_tags.append(tag)
            if tag not in all_words:
                all_words[tag] = [word]
                k_of_dict[tag] = 1
            else:
                all_words[tag].append(word)
        rawframe.append(cols)

    df = pd.DataFrame(rawframe, columns = ["Word", "Tag"])
    df["transit"] = None # Create an extra column for transition start/end for states
    print("File Processing Completed")
    return df, all_tags, all_words, k_of_dict

## Part 2a For Emission w/o including UNK

In [5]:
def get_emission_probability(x,y,all_words):

    '''
    Retrieves emission probability
    x: String value which is the emitted word
    y: String value which is the given tag
    all_words: dictionary of all words with tag as key
    
    Returns float probability of emitting x from y
    If invalid parameters, return None
    '''

    try:
        total_y_words = len(all_words[y])
        total_tag_to_word = all_words[y].count(x)
        return total_tag_to_word/total_y_words
    except:
        return 0.0



## Part 2b For Emission including UNK
During the testing phase, if word does not appear in the training set, we replace the word with the special word token #UNK#

In [6]:
def test_get_emission_probability(x, y, k_of_dict, all_words):
    '''
    Returns float probability of emitting x from y, accounting in #UNKN#
    If invalid parameters, return None
    x: String value which is the emitted word
    y: String value which is the given tag
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key
    '''
    global_counter=0
    for i in all_words:
        if all_words[i].count(x)!=0:
            global_counter+=1

    try:
        total_y_words = len(all_words[y])
        total_tag_to_word = all_words[y].count(x)
        if global_counter == 0:
            calculatedprob = float(1 / (total_y_words + 1))
            return calculatedprob
        else:
            calculatedprob = float(total_tag_to_word / (total_y_words + 1))
            return calculatedprob
    except:
        return 0.0
    


## Part 2c Emission on test data
argmax word to tag

In [7]:
def preprocess_unk(filetest,filename):
    kw_dict = {}
    inputFile = open(filename, 'r', encoding="UTF-8")
    for line in inputFile:
        line = line.strip().split()
        if len(line) == 0:
            continue
        else:
            line = line[0].lower()
        if line in kw_dict:
            continue
        else:
            kw_dict[line] = 1
    inputList = []
    word_column = open(filetest, 'r', encoding="UTF-8")
    for each_word in word_column:
        each_word = each_word.lower().strip()
        if each_word=='':
            inputList.append(None) # input blank lines with blank word. replace it later
        elif kw_dict.get(each_word)==1:
            inputList.append(each_word)
        else:
            inputList.append("#UNK#")

    df = pd.DataFrame(inputList, columns=["Word"])
    print("File Processing Completed")
    return df

In [8]:
def tag_creator(word_column, k_of_dict, all_words):
    '''
    Returns the most probable word, calls test_get_emission_probability inside
    word_column: df column of words to predict
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key
    '''
    tag_column=[]
    data_length = word_column.shape[0]
    completed_length = 0
    for x in word_column:
        completed_length += 1
        print("Working on {}/{}".format(completed_length,data_length), end='\r', flush=True)
        if x == None:
            tag_column.append(None)
        else:
            highest_prob = 0.0
            most_probable = ""
            for key in all_words:
                curr_prob = test_get_emission_probability(x, key, k_of_dict, all_words)
                if curr_prob > highest_prob:
                    highest_prob = curr_prob
                    most_probable = key
            tag_column.append(most_probable)
        
    print("Tags Created")
    return tag_column

## Part 2 Testing

In [9]:
# Testing out Part 2a
print('----------Part 2a----------')
df, all_tags, all_words, k_of_dict = file_to_df(filetrain)
print("Testing emission of trump as B-Positive, probability: {}".format(get_emission_probability("trump","B-positive", all_words)))
print()

# Testing out part 2b
print('----------Part 2b----------')
print("Testing emission of kahwee as B-Positive, probability: {}".format(test_get_emission_probability("kahwee","B-positive", k_of_dict, all_words)))
print()

# Testing out part 2c and produce dev.out
print('----------Part 2c----------')
outframe2 = preprocess_unk(filetest,filetrain)
outframe2["Tag"]=tag_creator(outframe2["Word"], k_of_dict, all_words)
# outframe2.to_csv("SG/devSG.out", sep=" ", index=False, header=False)
outframe2.to_csv("{}/dev.p2.out".format(running), sep=" ", index=False, header=False)

----------Part 2a----------
File Processing Completed
Testing emission of trump as B-Positive, probability: 0.0

----------Part 2b----------
Testing emission of kahwee as B-Positive, probability: 0.0012330456226880395

----------Part 2c----------
File Processing Completed
Tags Created700/3700


In [10]:
outframe2

Unnamed: 0,Word,Tag
0,petite,O
1,salle,I-neutral
2,ambiance,B-positive
3,plage,I-positive
4,#UNK#,I-neutral
5,.,O
6,,
7,la,I-positive
8,salle,I-neutral
9,est,O


## Part 3 Processing For Transition

In [11]:
def startEndCol(df):
    """
    This function label the new column as either start or end based on the position of the "None" tag in the Tag Column.
    df: This is the dataframe generated from Part 2 without start and end. Dataframe needs to have columns Word, Tag, transit
    """
    dataframeSize = len(df.index)
    df.loc[0]['transit']= "Start"
    df.loc[dataframeSize-2]['transit']='End'
    counter = 0
    for rows in df.iterrows():
        if rows[1][1]==None and counter<dataframeSize-1:
            df.loc[counter-1]['transit'] = "End"
            df.loc[counter+1]['transit'] = "Start"
            counter+=1
        else: counter+=1
    print("Start End Columns Assigned")
    return df


## Part 3a

In [12]:
def transition_creation(transitionframe):
    '''
    Creates a dictionary of transitions
    transitionframe: dataframe to be that transitions will be based on
    
    Returns:
    transition_dict: dictionary with transitions from transition frame
    '''
    transition_dict = {"Start": [], "End":[]}
    transitionframe = transitionframe.replace('\n','', regex=True)
    previous_Tag = "Starter"
    for index, row in transitionframe.iterrows():
        current_Tag = row["Tag"]
        if previous_Tag == "Starter":
            previous_Tag == current_Tag
        if row['transit'] == "Start":
            transition_dict["Start"].append(current_Tag)
        elif row["transit"]=="End":
            if current_Tag not in transition_dict:
                transition_dict[current_Tag] = ["End"]
            else:
                transition_dict[current_Tag].append("End")
        else:
            if previous_Tag not in transition_dict:
                transition_dict[previous_Tag] = [current_Tag]
            else:
                transition_dict[previous_Tag].append(current_Tag)
        previous_Tag = current_Tag
    print("Transition Dictionary Created")
    return transition_dict

In [13]:
def test_get_transition_probability(y1,y2,transition_dict):
    '''
    Calculates the probability of y1 going into y2
    y1: previous tag
    y2: probable tag
    transition_dict: dictionary of transitions
    
    Returns:
    probability of y1 going into y2
    
    '''
    try:
        count_y1_y2 = transition_dict[y1].count(y2)
        count_y1 = len(transition_dict[y1])
        probability = count_y1_y2/count_y1
        return float(probability)
    except:
        return 0.0

## Part 3b
Predict Label  
Given the obsercation sequence, find the most optimum sequence

In [14]:
def preprocess_viterbi(in_file):
    '''
    1.Preprocess csv file with only words input,
    2.Replace space lines as "EndOfSentence"
    3.Inserting start/end indicators for sentences,
    4.Remove the "EOS" tag at the end
    5.Reset the index
    6.Add previous tag and current tag columns
    7.Set First column to Start
    
    '''
    inputList=[]
    inputFile = open(in_file, 'r', encoding="UTF-8")
    for line in inputFile:
        if(line.strip()==""):
            line = "EOS"
        else:line = line.strip()
        if len(line) == 0:continue
        inputList.append(line)
    df = pd.DataFrame(inputList, columns = ["Word"])
    df['transit'] = "Empty"
    counter=0
    dataframeSize = len(df.index)
    df.loc[0]['transit']= "Start"
    df.loc[dataframeSize-2]['transit']='End'
    for index,rows in df.iterrows():
        if rows["Word"]=="EOS" and counter<dataframeSize-1:
            df.loc[counter-1]['transit'] = "End"
            df.loc[counter+1]['transit'] = "Start"
            counter+=1
        else: counter+=1
    df = df.replace('EOS','', regex=True)
    df=df.reset_index(drop=True)
    df['previous_tag'] = 'not entered yet'
    df['previous_tag'][0] = 'Start'
    df['Tag'] = 'not entered yet'
    return df

In [15]:
def viterbi(sentence, sentence_index, outframe, transition_dict, k_of_dict, all_words):
    '''
    Returns most probable tag for a given transition and word
    sentence: array of words in sentence
    sentence_index: array of index for words in sentence
    outframe: dataframe to change
    transition_dict: dictionary of transitions
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key
    '''
    tag_track = []
    for tag in all_words:
        tag_track.append(tag)
    table_width = len(all_words) + 1
    prob_table = []
    for i in range(len(sentence)+1):
        row = []
        for j in range(len(tag_track)):
            if i==0:
                ij_transition = test_get_transition_probability("Start", tag_track[j], transition_dict)
                ij_emission = test_get_emission_probability(sentence[i], tag_track[j], k_of_dict, all_words)
                ij_value = ij_transition*ij_emission
                if ij_value != 0:
                    ij_value = -1*math.log(ij_value)
                row.append((ij_value, "Start"))
            elif i==len(sentence):
                ij_previous = prob_table[i-1][j][0]
                if ij_previous == 0:
                    row.append((0, j))
                    continue
                ij_transition = test_get_transition_probability(tag_track[j], "End", transition_dict)
                if ij_transition != 0:
                    ij_value = -1*math.log(ij_transition)*ij_previous
                else:
                    ij_value = 0
                row.append((ij_value, j))
            else:
                largest_value = sys.maxsize
                largest_index = 0
                for k in range(len(tag_track)):
                    kj_previous = prob_table[i-1][k][0]
                    if kj_previous == 0:
                        continue
                    kj_transition = test_get_transition_probability(tag_track[k], tag_track[j], transition_dict)
                    kj_emission = test_get_emission_probability(sentence[i], tag_track[j], k_of_dict, all_words)
                    kj_value = kj_transition*kj_emission
                    if kj_value != 0:
                        kj_value = -1*math.log(kj_value)*kj_previous
                    if kj_value < largest_value and kj_value != 0:
                        largest_value = kj_value
                        largest_index = k
                row.append((largest_value,largest_index))
        prob_table.append(row)
        
    sequence = []
    highest_prob = 0
    previous_tag = ''
    for i in range(len(prob_table[len(sentence)-1])):
        compare_prob = prob_table[len(sentence)-1][i][0]
        if compare_prob > highest_prob and compare_prob != 0:
            highest_prob = compare_prob
            previous_tag = prob_table[len(sentence)-1][i][1]
    for i in range(len(prob_table)-1):
        sequence.append(tag_track[previous_tag])
        previous_tag = prob_table[len(prob_table)-i-2][previous_tag][1]
        
    sequence.reverse()
    
    for i in range(len(sentence_index)):
        outframe.loc[sentence_index[i]]["Tag"] = sequence[i]
    return sequence
    

In [16]:
def viterbi_on_df(outframe, transition_dict, k_of_dict, all_words):
    """
    Process the viterbi algorithm via row by row of the dataframe
    1. Takes into account of transit(Start and ending of a sentence)
    2. 
    """
    df_size = outframe.shape[0]
    is_sentence = False
    sentence = []
    sentence_index = []
    for index, row in outframe.iterrows():
        if row["Word"] == "":
            continue
        sentence.append(row['Word'])
        sentence_index.append(index)
        if row["transit"] == "End":
            is_sentence = True
        if is_sentence:
            output = viterbi(sentence, sentence_index, outframe, transition_dict, k_of_dict, all_words)
            is_sentence = False
            sentence = []
            sentence_index = []
        if index >= outframe.shape[0]: break
        print("Working on {}/{}".format(index,df_size), end='\r', flush=True)
    return outframe

In [17]:
transitionframe = startEndCol(df)
# Removing columns with [Tag] Column=None
transitionframe = transitionframe[~transitionframe['Tag'].isin([None])]
# print(transitionframe)

Start End Columns Assigned


In [18]:
# Testing Part 3a
print('----------Part 3a----------')
transition_dict = transition_creation(transitionframe)
print("Testing transition of B-positive to I-positive, probability: {}".format(test_get_transition_probability("B-positive","I-positive", transition_dict)))
print()

# Testing Part 3b
print('----------Part 3b----------')
outframe = preprocess_viterbi(filetest)
outframe = viterbi_on_df(outframe, transition_dict, k_of_dict, all_words)

----------Part 3a----------
Transition Dictionary Created
Testing transition of B-positive to I-positive, probability: 0.12319790301441677

----------Part 3b----------
Working on 3698/3700

In [19]:
outframe

Unnamed: 0,Word,transit,previous_tag,Tag
0,Petite,Start,Start,O
1,salle,Empty,not entered yet,O
2,ambiance,Empty,not entered yet,B-positive
3,plage,Empty,not entered yet,I-positive
4,Oléronaise,Empty,not entered yet,I-positive
5,.,End,not entered yet,O
6,,Empty,not entered yet,not entered yet
7,La,Start,not entered yet,O
8,salle,Empty,not entered yet,O
9,est,Empty,not entered yet,O


In [20]:
p3out = outframe[["Word","Tag"]].copy()
p3out = p3out.replace('not entered yet','', regex=True)
p3out.to_csv("{}/dev.p3.out".format(running), sep=" ", index=False, header=False)