# Term 6 Machine Learning Project

In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Used when running on gcolab
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)
# # use /content/gdrive/My Drive/

## Part 2 & 3 Training data Processing

Algorithm:  
Loop over every (word, tag) line of the training file:  
append the word to a dictionary keyed by its tag (key: tag, value: \[words emitted by that tag\]).

In [4]:
def file_to_df(inputfile):
    """
    Parse a training file of "<word> <tag>" lines into a dataframe and lookup tables.

    Every line is split on its last run of whitespace, so a word that itself
    contains spaces stays in one piece. Lines that have no such separator
    (for example the blank lines between sentences) are kept as one-element
    rows, which pandas pads so that their Tag is missing.

    inputfile: path of the UTF-8 encoded file to be processed

    Returns:
    df: dataframe with columns Word, Tag and transit (transit is all None).
        The cells hold the raw split pieces, so Tag still ends with the
        line's newline character.
    all_tags: unique tags, in order of first appearance
    all_words: dictionary of all (stripped) words with tag as key
    k_of_dict: dictionary that maps every tag to 1
    """
    # Matches only the LAST whitespace run of a line (the one followed by the tag).
    separator = re.compile(r'\s+(?=\S+$)')

    rawframe = []
    all_words = {}  # tag -> list of every word emitted by that tag
    k_of_dict = {}
    # The context manager guarantees the file is closed, even on errors.
    with open(inputfile, encoding="UTF-8") as fin:
        for line in fin:
            cols = separator.split(line)
            if len(cols) > 1:
                tag = cols[1].strip()
                word = cols[0].strip()
                if tag not in all_words:
                    all_words[tag] = []
                    k_of_dict[tag] = 1
                all_words[tag].append(word)
            # Rows without a tag are kept on purpose: they mark sentence breaks.
            rawframe.append(cols)

    # Dicts preserve insertion order, so this is the first-seen order of the tags.
    all_tags = list(all_words)

    df = pd.DataFrame(rawframe, columns=["Word", "Tag"])
    df["transit"] = None  # Extra column for the Start/End markers of each sentence
    print("File Processing Completed")
    return df, all_tags, all_words, k_of_dict

## Part 2a For Emission w/o including UNK

In [5]:
def get_emission_probability(x,y,all_words):

    '''
    Retrieves the maximum-likelihood emission probability count(y -> x) / count(y).

    x: String value which is the emitted word
    y: String value which is the given tag
    all_words: dictionary of all words with tag as key

    Returns float probability of emitting x from y.
    Returns 0.0 when y is not a key of all_words or has no words recorded.
    '''

    try:
        words_of_tag = all_words[y]
    except (KeyError, TypeError):
        # Unknown tag (or unusable lookup): nothing can be emitted from it.
        return 0.0

    if not words_of_tag:
        # Avoid dividing by zero for a tag without any recorded word.
        return 0.0
    return words_of_tag.count(x) / len(words_of_tag)



## Part 2b For Emission including UNK
During the testing phase, if a word does not appear in the training set, we replace it with the special word token #UNK#.

In [50]:
def test_get_emission_probability(x, y, k_of_dict, all_words, tag_check):
    '''
    Returns float probability of emitting x from y, accounting in #UNKN#
    If invalid parameters, return None
    x: String value which is the emitted word
    y: String value which is the given tag
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key
    tag_check: list of tags to check
    '''
    global_counter=0
    for i in tag_check:
        if i == "End":
            continue
        if all_words[i].count(x)!=0:
            global_counter+=1

    try:
        total_y_words = len(all_words[y])
        total_tag_to_word = all_words[y].count(x)
        if global_counter == 0:
            calculatedprob = float(1 / (total_y_words + 1))
            return calculatedprob
        else:
            calculatedprob = float(total_tag_to_word / (total_y_words + 1))
            return calculatedprob
    except:
        return 0.0
    


## Part 2c Emission on test data
argmax word to tag

In [12]:
def preprocess_unk(filetest,filename):
    '''
    Lower-case the words of a test file and replace unseen words with #UNK#.

    filetest: path of the UTF-8 test file (one word per line, blank lines
              between sentences)
    filename: path of the UTF-8 training file; the word of every non-blank
              line (everything before the last whitespace run, lower-cased)
              forms the known vocabulary

    Returns a dataframe with a single "Word" column holding, per test line,
    the lower-cased word, "#UNK#" for a word outside the vocabulary, or None
    for a blank line.

    NOTE(review): words are lower-cased here, whereas file_to_df stores the
    training words with their original case - confirm this is intended.
    '''
    # Same "last whitespace run" separator that file_to_df uses, so a word
    # containing spaces is learnt as one vocabulary entry.
    separator = re.compile(r'\s+(?=\S+$)')

    known_words = set()
    with open(filename, 'r', encoding="UTF-8") as train_file:
        for line in train_file:
            line = line.strip()
            if line:
                known_words.add(separator.split(line)[0].lower())

    inputList = []
    with open(filetest, 'r', encoding="UTF-8") as test_file:
        for each_word in test_file:
            each_word = each_word.lower().strip()
            if each_word == '':
                inputList.append(None)  # blank line: sentence break
            elif each_word in known_words:
                inputList.append(each_word)
            else:
                inputList.append("#UNK#")

    df = pd.DataFrame(inputList, columns=["Word"])
    print("File Processing Completed")
    return df

In [13]:
def tag_creator(word_column, k_of_dict, all_words, tag_check):
    '''
    Returns, for every word, the tag with the highest emission probability.
    Calls test_get_emission_probability for each candidate tag.

    word_column: df column of words to predict (None marks a blank line)
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key
    tag_check: list of tags to check

    Returns a list as long as word_column: None for a None word, otherwise
    the best tag ("" when no tag has a probability above 0).
    '''
    tag_column = []
    data_length = word_column.shape[0]
    # The best tag depends only on the word, so it is computed once per
    # distinct word instead of once per row.
    best_tag_of_word = {}
    for completed_length, x in enumerate(word_column, start=1):
        print("Working on {}/{}".format(completed_length,data_length), end='\r', flush=True)
        if x is None:
            tag_column.append(None)
            continue
        if x not in best_tag_of_word:
            highest_prob = 0.0
            most_probable = ""
            for key in all_words:
                curr_prob = test_get_emission_probability(x, key, k_of_dict, all_words, tag_check)
                if curr_prob > highest_prob:
                    highest_prob = curr_prob
                    most_probable = key
            best_tag_of_word[x] = most_probable
        tag_column.append(best_tag_of_word[x])

    print("Tags Created")
    return tag_column

In [24]:
# Paths of the training file and of the unlabelled dev file used in Part 2.
filename = "SG/train"
filetest="SG/dev.in"

# Testing out Part 2a: parse the training file, then query one emission probability.
# Note: the printed label says "B-Positive" while the tag queried is "B-positive".
print('----------Part 2a----------')
df, all_tags, all_words, k_of_dict = file_to_df(filename)
print("Testing emission of trump as B-Positive, probability: {}".format(get_emission_probability("trump","B-positive", all_words)))
print()

# Testing out part 2b: same query through the #UNK#-aware estimator.
print('----------Part 2b----------')
# tag_check ends up holding every key (tag) of all_words.
tag_check = []
for tag in all_words:
    tag_check.append(tag)
print("Testing emission of kahwee as B-Positive, probability: {}".format(test_get_emission_probability("kahwee","B-positive", k_of_dict, all_words, tag_check)))
print()

# Testing out part 2c: tag every dev word with tag_creator.
print('----------Part 2c----------')
outframe2 = preprocess_unk(filetest,filename)
outframe2["Tag"]=tag_creator(outframe2["Word"], k_of_dict, all_words, tag_check)
# The export to dev.out is currently disabled, so nothing is written to disk.
# outframe2.to_csv("SG/devSG.out", sep=" ", index=False, header=False)


----------Part 2a----------
File Processing Completed
Testing emission of trump as B-Positive, probability: 0.00030609121518212427

----------Part 2b----------
Testing emission of kahwee as B-Positive, probability: 0.0001530221882172915

----------Part 2c----------
File Processing Completed
Working on 136/44106

KeyboardInterrupt: 

In [15]:
# Display the dataframe returned by preprocess_unk.
outframe2

Unnamed: 0,Word
0,best
1,friends
2,who
3,cry
4,on
5,facetime
6,together
7,","
8,stay
9,together


## Part 3 Processing For Transition

In [25]:
# Relative paths of the dev.in file of each dataset folder (SG, CN, EN, FR).
filesg = "SG/dev.in"
filechina="CN/dev.in"
fileen="EN/dev.in"
filefr="FR/dev.in"

In [26]:
def startEndCol(df):
    """
    Label the 'transit' column with "Start"/"End" around every row whose Tag is missing.

    The first row is marked "Start" and the second-to-last row "End" (the
    last row is expected to be the blank line that closes the file). For
    every other row with a missing Tag, the row before it becomes "End" and
    the row after it becomes "Start"; later markers overwrite earlier ones.

    df: This is the dataframe you want to input. Dataframe needs to have
        columns Word, Tag, transit and the default 0..n-1 index.

    Returns the same dataframe, modified in place.
    """
    dataframeSize = len(df.index)

    # Single .loc[row, column] assignments write to df itself; the chained
    # form df.loc[row][column] = ... may only modify a temporary copy.
    if dataframeSize > 0:
        df.loc[0, 'transit'] = "Start"
    if dataframeSize > 1:
        df.loc[dataframeSize - 2, 'transit'] = 'End'

    is_break = df['Tag'].isna().to_numpy()
    # The last row is skipped because it has no following row to mark.
    for position in range(dataframeSize - 1):
        if not is_break[position]:
            continue
        if position > 0:  # a leading blank row has no previous row
            df.loc[position - 1, 'transit'] = "End"
        df.loc[position + 1, 'transit'] = "Start"

    print("Start End Columns Assigned")
    return df


In [27]:
# Mark the sentence boundaries on the training dataframe.
transitionframe = startEndCol(df)

# Remove the rows (not columns) whose Tag is None
transitionframe = transitionframe[~transitionframe['Tag'].isin([None])]
# print(transitionframe)

Start End Columns Assigned


## Part 3a

In [28]:
def transition_creation(transitionframe):
    '''
    Creates a dictionary of transitions

    transitionframe: dataframe that the transitions will be based on; needs
                     the columns Tag and transit, one row per word, with the
                     sentence boundaries marked as "Start"/"End" in transit

    For every sentence the following transitions are recorded:
    "Start" -> tag of its first word, tag -> tag of the next word for every
    consecutive pair (including the pair that ends on the last word), and
    tag of its last word -> "End".

    Returns:
    transition_dict: dictionary mapping a source tag to the list of tags
                     that followed it (one entry per observed transition)
    '''
    transition_dict = {"Start": [], "End":[]}
    # Tags parsed from the file may still carry their trailing newline.
    transitionframe = transitionframe.replace('\n','', regex=True)

    previous_Tag = None  # None means "at a sentence boundary"
    for index, row in transitionframe.iterrows():
        current_Tag = row["Tag"]
        marker = row["transit"]

        if marker == "Start" or previous_Tag is None:
            # First word of a sentence (also covers one-word sentences that
            # are only marked "End").
            transition_dict["Start"].append(current_Tag)
        else:
            transition_dict.setdefault(previous_Tag, []).append(current_Tag)

        if marker == "End":
            # setdefault: the tag may never have been a source tag before.
            transition_dict.setdefault(current_Tag, []).append("End")
            previous_Tag = None
        else:
            previous_Tag = current_Tag

    print("Transition Dictionary Created")
    return transition_dict

In [29]:
# Build the tag -> following-tags dictionary from the filtered training frame.
transition_dict = transition_creation(transitionframe)

In [30]:
def preprocess_viterbi(in_file):
    '''
    Build the working dataframe for decoding from a words-only file.

    1. Read the UTF-8 file, one word per line; blank lines separate sentences
    2. Mark the first word of every sentence "Start" and the last one "End"
       in the transit column (all other words are "Empty")
    3. Drop the blank lines and use a fresh 0..n-1 index
    4. Add the previous_tag and Tag columns filled with 'not entered yet'
    5. Set previous_tag of the first row to 'Start'

    in_file: path of the file to read

    Returns a dataframe with the columns Word, transit, previous_tag, Tag.
    '''
    # Blank lines are stored as None instead of a magic word, so no real
    # token can be mistaken for a sentence break.
    words = []
    with open(in_file, 'r', encoding="UTF-8") as inputFile:
        for line in inputFile:
            stripped = line.strip()
            words.append(stripped if stripped else None)

    total = len(words)
    transit = ["Empty"] * total
    if total > 0:
        transit[0] = "Start"
    if total > 1:
        # The last line is expected to be the blank line closing the file.
        transit[total - 2] = 'End'
    # The last line is skipped because nothing follows it.
    for position in range(total - 1):
        if words[position] is not None:
            continue
        if position > 0:  # a leading blank line has no previous word
            transit[position - 1] = "End"
        transit[position + 1] = "Start"

    kept_rows = [(word, mark) for word, mark in zip(words, transit) if word is not None]
    df = pd.DataFrame(kept_rows, columns=["Word", "transit"])

    df['previous_tag'] = 'not entered yet'
    if len(df.index) > 0:
        df.loc[0, 'previous_tag'] = 'Start'
    df['Tag'] = 'not entered yet'
    return df
# TODO: Remove if not needed
# if(line.strip()==""):

In [65]:
# Build the decoding frame for the SG dev file and display it.
outframe = preprocess_viterbi(filesg)
outframe

Unnamed: 0,Word,transit,previous_tag,Tag
0,best,Start,Start,not entered yet
1,friends,Empty,not entered yet,not entered yet
2,who,Empty,not entered yet,not entered yet
3,cry,Empty,not entered yet,not entered yet
4,on,Empty,not entered yet,not entered yet
5,FaceTime,Empty,not entered yet,not entered yet
6,together,Empty,not entered yet,not entered yet
7,",",Empty,not entered yet,not entered yet
8,stay,Empty,not entered yet,not entered yet
9,together,End,not entered yet,not entered yet


## Part 3b
Predict Label  
Given the observation sequence, find the most probable tag sequence.

In [46]:
def viterbi(row, transition_dict, k_of_dict, all_words):
    '''
    Returns most probable tag for a given transition and word

    This is a greedy, single-step decision: only the tags that followed
    row['previous_tag'] in transition_dict are candidates, and each one is
    scored as transition probability * emission probability.

    row: Specific row in a df (needs 'Word' and 'previous_tag'); the chosen
         tag is also stored in row['Tag']
    transition_dict: dictionary of transitions
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key

    Returns the best tag, or "" when no candidate scores above 0.
    Raises KeyError when row['previous_tag'] is not a key of transition_dict.
    '''
    curr_word = row['Word']
    prev_tag = row['previous_tag']
    transitions = transition_dict[prev_tag]
    total_transitions = len(transitions)
    # De-duplicate in first-seen order: unlike a set, this keeps the result
    # of a tie independent of the interpreter's hash seed.
    unique_tags = list(dict.fromkeys(transitions))

    highest_probability = 0
    most_probable = ""
    for tag in unique_tags:
        emissionprob = test_get_emission_probability(curr_word, tag, k_of_dict, all_words, unique_tags)
        probability = transitions.count(tag)/total_transitions*emissionprob
        if probability > highest_probability:
            highest_probability = probability
            most_probable = tag

    row['Tag'] = most_probable
    return (most_probable)
    

In [68]:
def viterbi_on_df(outframe, transition_dict, k_of_dict, all_words):
    """
    Tag a dataframe row by row with viterbi().

    1. The previous tag of the first row is taken from the dataframe.
    2. Every later row uses the tag chosen for the row before it, or "Start"
       when the row before it is marked "End" in the transit column.

    outframe: dataframe with the columns Word, transit, previous_tag, Tag
    transition_dict: dictionary of transitions
    k_of_dict: dictionary of unknown var k
    all_words: dictionary of all words with tag as key

    Returns outframe with its previous_tag and Tag columns filled in
    (the columns are replaced on the dataframe that was passed in).
    """
    df_size = outframe.shape[0]
    if df_size == 0:
        return outframe

    previous_tags = []
    tags = []
    previous_tag = outframe['previous_tag'].iloc[0]
    for position, (index, row) in enumerate(outframe.iterrows()):
        # Work on a private copy: whether changes to an iterrows() row reach
        # the dataframe is not guaranteed, so the results are collected
        # explicitly instead.
        row = row.copy()
        row['previous_tag'] = previous_tag
        tag = viterbi(row, transition_dict, k_of_dict, all_words)
        previous_tags.append(previous_tag)
        tags.append(tag)
        # A row marked "End" closes its sentence, so the next one restarts.
        previous_tag = "Start" if row['transit'] == "End" else tag
        print("Working on {}/{}".format(position + 1,df_size), end='\r', flush=True)

    outframe['previous_tag'] = previous_tags
    outframe['Tag'] = tags
    return outframe

In [69]:
# Fill in the Tag and previous_tag columns of the decoding frame.
outframe = viterbi_on_df(outframe, transition_dict, k_of_dict, all_words)

Working on 88/41408

KeyboardInterrupt: 

In [70]:
# Display the decoding frame after the tagging step.
outframe

Unnamed: 0,Word,transit,previous_tag,Tag
0,best,Start,Start,O
1,friends,Empty,O,O
2,who,Empty,O,O
3,cry,Empty,O,O
4,on,Empty,O,O
5,FaceTime,Empty,O,B-neutral
6,together,Empty,B-neutral,O
7,",",Empty,O,O
8,stay,Empty,O,O
9,together,End,O,O
