# Cascading CRF for Gitksan

### Imports

In [1]:
import os
from itertools import islice
from collections import Counter
from nltk.tokenize import word_tokenize
from sklearn.metrics import f1_score, classification_report
from sklearn_crfsuite import CRF
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn_crfsuite.utils import flatten
# import torch
# import torch.autograd as autograd
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim 

### Creating a dictionary of orthography, morphemes, gloss (for both training and dev sets)

In [2]:
dtrain_path = "data/Gitksan/git-train-track2-uncovered"
dev_path = "data/Gitksan/git-dev-track2-uncovered"

def get_feature_dict(pathname):
    gits_dict = defaultdict(list)
#     for pathname in paths:
#         filetype = pathname.split("-")[-3]
    with open(pathname, "r") as f:
        for line in f:
            if line.startswith("\\t"):
                gits_dict["orth"].append(line.lower().lstrip("\\t ").rstrip("\n").split(" "))
            if line.startswith("\\g"):
                gits_dict["gloss"].append(line.lstrip("\g ").rstrip("\n").split(" "))
            if line.startswith("\\m"):
                gits_dict["morphs"].append(line.lstrip("\m ").rstrip("\n").split(" "))
    return gits_dict

In [3]:
gits_train_dict = get_feature_dict(dtrain_path)
gits_dev_dict = get_feature_dict(dev_path)

In [4]:
gits_train_dict.keys()

dict_keys(['orth', 'morphs', 'gloss'])

In [5]:
gits_dev_dict.keys()

dict_keys(['orth', 'morphs', 'gloss'])

### Getting separate lists for tokens, tags and morphemes

In [6]:
# train_list = list(zip(arp_dict['train_orth'], arp_dict['train_morphs']))
# X_train, X_val = train_test_split(train_list, test_size=0.3, random_state=52)
# val_input = []
# gold_val = []
# for orth, morph in X_val:
#     val_input.append(orth)
#     gold_val.append(morph)
# train_input = []
# train_morphemes = []
# for orth, morph in X_train:
#     train_input.append(orth)
#     train_morphemes.append([mor.split("-") for mor in morph])  
# print(train_input[10], train_morphemes[10])

In [7]:
# Train
gits_train_tokens, gits_gloss, gits_morphemes = [], [], []
gits_train_tokens = list(gits_train_dict["orth"])
gits_gloss = list(gits_train_dict["gloss"])
gits_morphemes = list(gits_train_dict["morphs"])

In [8]:
# Dev
gits_dev_tokens = list(gits_dev_dict["orth"])
gits_dev_gloss = list(gits_dev_dict["gloss"])
gits_dev_morphemes = list(gits_dev_dict["morphs"])

In [9]:
print(gits_train_tokens[1], gits_morphemes[1])
print(gits_train_tokens[21], gits_morphemes[21])

['ii', 'na', "'wahl", "anhahla'lst", 'g̲oohl', 'stockholm', 'sawatdiit.'] ['ii', 'n', "'wa-hl", "an-hahla'lst", 'g̲oo-hl', 'Stockholm', 'si-wa-t-diit']
['agwiyukwhl', "ha'niisgwaa'ytxwhl", 'g̲an', 'wihl', 'neediit', 'naa', 'ji', "hahla'ljit."] ['agwiyukw-hl', "ha-'nii-sgwaa'ytxw-hl", 'g̲an', 'wil-hl', 'nee-dii-t', 'naa', 'ji', "hahla'lst-it"]


In [10]:
assert len(gits_train_tokens) == len(gits_morphemes) == len(gits_gloss)
assert len(gits_dev_tokens) == len(gits_dev_morphemes) == len(gits_dev_gloss)
print("Success!")

Success!


### Feature Engineering for CRF

In [11]:
def get_word_shape(word):
    '''takes in a word and returns the corresponding shape as follows:
    for uppercase letters - X
    for lowercase letters - x
    for digits - d
    and keeps punctuations and symbols as is'''
    if not word:
        return ''
    word_shape = []
    for char in word:
        if char.isupper():
            word_shape.append('X')
        elif char.islower():
            word_shape.append('x')
        elif char.isdigit():
            word_shape.append('d')
        else:
            word_shape.append(char)
    shape = ''.join(word_shape)
    return shape

In [12]:
def get_short_word_shape(shape):
    '''takes in a word shape and returns the corresponding shorter shape as follows by truncating repeating letters'''
    if not shape:
        return ''
    short_shape = []
    for x in range(len(shape) - 1):
        if shape[x] != shape[x + 1]:
            short_shape.append(shape[x])
        else:
            pass
    short_shape.append(shape[-1])
    short = ''.join(short_shape)
    return short

In [13]:
def word2features(sentence, morphemes, idx):
    word_features = {}
    
    #Word level features for each word
    shape = get_word_shape(sentence[idx])
    short_shape = get_short_word_shape(shape)
    word_features['word'] = sentence[idx]
    word_features['word_isdigit'] = sentence[idx].isdigit()
    word_features['word_distance_from_start'] = idx
    word_features['word_shape'] = shape
    word_features['short_word_shape'] = short_shape
    word_features['accent_marker'] = True if "'" in sentence[idx] else False
    
    word_features['word_prefix1'] = sentence[idx][:1]
    word_features['word_prefix2'] = sentence[idx][:2]
    word_features['word_suffix1'] = sentence[idx][-1:]
    word_features['word_suffix2'] = sentence[idx][-2:]
    
    if len(sentence[idx]) > 4:
        word_features['word_prefix3'] = sentence[idx][:3]
        word_features['word_prefix4'] = sentence[idx][:4]
        word_features['word_suffix3'] = sentence[idx][-3:]
        word_features['word_suffix4'] = sentence[idx][-4:]
    
    # To include features of the previous word
    if idx > 0:
        word_features['__BOS'] = False
        word = sentence[idx-1]
        shape = get_word_shape(word)
        short_shape = get_short_word_shape(shape)
        word_features['previous_word'] = word
        word_features['previous_word_distance_from_start'] = idx - 1
        word_features['previous_word_shape'] = shape
        word_features['previous_short_word_shape'] = short_shape
        word_features['previous_accent_marker'] = True if "'" in word else False
        
        word_features['previous_word_prefix1'] = word[:1]
        word_features['previous_word_prefix2'] = word[:2]
        word_features['previous_word_suffix1'] = word[-1:]
        word_features['previous_word_suffix2'] = word[-2:]

        if len(word) > 4:
            word_features['previous_word_prefix3'] = word[:3]
            word_features['previous_word_prefix4'] = word[:4]
            word_features['previous_word_suffix3'] = word[-3:]
            word_features['previous_word_suffix4'] = word[-4:]

    else:
        word_features['__BOS'] = True
        
    # To include features of next word
    if idx < len(sentence) - 1:
        word_features['__EOS'] = False
        word = sentence[idx+1]
        shape = get_word_shape(word)
        short_shape = get_short_word_shape(shape)
        word_features['next_word'] = word
        word_features['next_word_distance_from_start'] = idx + 1
        word_features['next_word_shape'] = shape
        word_features['next_short_word_shape'] = short_shape
        word_features['next_accent_marker'] = True if "'" in word else False
        
        word_features['next_word_prefix1'] = word[:1]
        word_features['next_word_prefix2'] = word[:2]
        word_features['next_word_suffix1'] = word[-1:]
        word_features['next_word_suffix2'] = word[-2:]

        if len(word) > 4:
            word_features['next_word_prefix3'] = word[:3]
            word_features['next_word_prefix4'] = word[:4]
            word_features['next_word_suffix3'] = word[-3:]
            word_features['next_word_suffix4'] = word[-4:]
        
    else:
        word_features['__EOS'] = True
        
    #Morpheme level features for each word
    word_features["morpheme"] = morphemes[idx]
    word_features["morpheme-struct"] = morphemes[idx].split("-")
    
    if idx > 0:
        word_features["previous_morpheme"] = morphemes[idx-1]
        word_features["previous_morpheme-struct"] = morphemes[idx-1].split("-")
    
    if idx < len(sentence) - 1:
        word_features["next_morpheme"] = morphemes[idx-1]
        word_features["next_morpheme-struct"] = morphemes[idx-1].split("-")
    
    return word_features

def sentence2features(sentence, morphemes):
    return [word2features(sentence, morphemes, idx) for idx in range(len(sentence))]

In [14]:
def prepare_ner_feature_dicts(sents, morphs, glosses):
    all_dicts = []
    all_tags = []
    for sent, morph in zip(sents, morphs):
        all_dicts.append(sentence2features(sent, morph))
    for gloss in glosses:
        all_tags.append(gloss)
    
    return all_dicts, all_tags

In [15]:
train_dicts, train_tags = prepare_ner_feature_dicts(gits_train_tokens, gits_morphemes, gits_gloss)
dev_dicts, dev_tags = prepare_ner_feature_dicts(gits_dev_tokens, gits_dev_morphemes, gits_dev_gloss)

In [16]:
assert len(train_dicts) == len(train_tags)
assert len(dev_dicts) == len(dev_tags)

### Train the CRF

In [17]:
crf = CRF(algorithm='lbfgs', verbose=1,c1=0.1, c2=0.001, max_iterations=200, all_possible_transitions=True)

try:
    crf.fit(train_dicts, train_tags)
except:
    pass

loading training data to CRFsuite: 100%|██████| 31/31 [00:00<00:00, 1878.90it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 27332
Seconds required: 0.032

L-BFGS optimization
c1: 0.100000
c2: 0.001000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.01  loss=1196.21  active=7673  feature_norm=1.00
Iter 2   time=0.01  loss=1108.08  active=7858  feature_norm=1.04
Iter 3   time=0.01  loss=1029.24  active=7904  feature_norm=1.59
Iter 4   time=0.01  loss=822.47   active=7871  feature_norm=4.35
Iter 5   time=0.01  loss=624.79   active=8096  feature_norm=6.48
Iter 6   time=0.01  loss=446.86   active=8132  feature_norm=9.32
Iter 7   time=0.01  loss=232.22   active=8083  feature_norm=15.27
Iter 8   time=0.01  loss=167.15   active=7915  feature_norm=18.15
Iter 9   time=0.01  loss=156.31   active=7688  feature_norm=18.72
Iter 10  ti

### Test dev predictions

In [18]:
from sklearn.metrics import accuracy_score
y_pred = crf.predict(dev_dicts)
print(accuracy_score(flatten(dev_tags), flatten(y_pred)))

0.25773195876288657


### Building a cascading crf 

In [19]:
from copy import deepcopy

train_copy = deepcopy(train_dicts)

for idx in range(len(train_copy)):
    for jdx in range(len(train_copy[idx])):
        if train_copy[idx][jdx]["__BOS"]:
            continue
        else:
            train_copy[idx][jdx]["prev_prediction"] = crf.predict_single(train_copy[idx][jdx - 1])

In [36]:
crf2 = CRF(algorithm='lbfgs', verbose=1, c1=0.001, c2=0.001, num_memories=9, linesearch='MoreThuente', 
           max_iterations=500, all_possible_transitions=True)

try:
    crf2.fit(train_copy, train_tags)
except:
    pass

loading training data to CRFsuite: 100%|██████| 31/31 [00:00<00:00, 1448.13it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 28381
Seconds required: 0.027

L-BFGS optimization
c1: 0.001000
c2: 0.001000
num_memories: 9
max_iterations: 500
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.03  loss=1283.47  active=28241 feature_norm=0.50
Iter 2   time=0.01  loss=1195.31  active=28224 feature_norm=0.36
Iter 3   time=0.01  loss=1180.61  active=28232 feature_norm=0.43
Iter 4   time=0.01  loss=1152.81  active=28156 feature_norm=0.98
Iter 5   time=0.01  loss=1101.04  active=28220 feature_norm=1.00
Iter 6   time=0.01  loss=1050.54  active=28230 feature_norm=1.28
Iter 7   time=0.01  loss=749.39   active=28128 feature_norm=4.86
Iter 8   time=0.01  loss=492.73   active=27567 feature_norm=8.23
Iter 9   time=0.01  loss=267.12   active=28135 feature_norm=11.60
Iter 10  time

In [37]:
from sklearn.metrics import accuracy_score
y_pred = crf2.predict(dev_dicts)
print(accuracy_score(flatten(dev_tags), flatten(y_pred)))

0.2860824742268041


In [38]:
def write_predictions(path: str, lang: str, preds, pred_input_data):
    """Writes the predictions to a new file, which uses the file in `path` as input"""
    def create_gloss_line(glosses, transcription_tokens):
        """
        Write a gloss for each transcription token
        We should never write more glosses than there are tokens
        If tokens are segmented, write morphemes together
        """
        output_line = ''
        for (token, gloss) in zip(transcription_tokens, glosses):
            if token[0] == '-':
                output_line += f"-{gloss}"
            else:
                output_line += f" {gloss}"
        return output_line

    decoded_preds = preds
    next_line = 0
    with open(path, 'r') as input:
        with open(lang + '_output_preds', 'w') as output:
            for line in input:
                line_prefix = line[:2]
                if line_prefix == '\\g':
                    output_line = create_gloss_line(glosses=decoded_preds[next_line], transcription_tokens=pred_input_data[next_line])
                    output_line = line_prefix + output_line + '\n'
                    output.write(output_line)
                    next_line += 1
                else:
                    output.write(line)
    print(f"Predictions written to ./{lang}_output_preds")

In [39]:
write_predictions(dev_path, 'Gitksan', y_pred, gits_dev_tokens)

Predictions written to ./Gitksan_output_preds


### Adding BIES and IOB tagging

In [105]:
print(gits_train_tokens[0])
print(gits_morphemes[0])

["'nakwhl", 'hlidaa', "'wihl", "wili'y", 'g̲oohl', 'wag̲ayt', 'andoosda', 'wil', 'jok̲hl', 'amxsiwaa.']
["'nakw-hl", 'hli-daa', "'wihl", "wil-'y", 'g̲oo-hl', 'wag̲ayt', 'an-doosda', 'wil', 'jok̲-hl', 'amxsiwaa']


In [110]:
gits_morph_MO = defaultdict(list)

for i, morph_list in enumerate(gits_morphemes):
    inner_list = []
    for morph in morph_list:
        if '-' in morph:
            inner_list.append('M')
        else:
            inner_list.append('O')
    gits_morph_MO[i] = inner_list

In [113]:
gits_morph_BIES = defaultdict(list)
 
MO_tags = list(gits_morph_MO.values())

B = "BEGIN"
I = "INSIDE"
E = "END"
S = "SINGLE"

for morpheme_list, tag_list in zip(gits_morphemes, MO_tags):
    BIES_list = []
    for morpheme, tag in zip(morpheme_list, tag_list):
        if tag == 'O':
            for i in range(len(morpheme)):
                if i == 0:
                    BIES_list.append((morpheme[i], B

[['M', 'M', 'O', 'M', 'M', 'O', 'M', 'O', 'M', 'O'],
 ['O', 'O', 'M', 'M', 'M', 'O', 'M'],
 ['O', 'M', 'M', 'O'],
 ['O', 'O', 'M', 'O', 'O', 'O', 'M', 'M', 'M', 'M', 'M', 'O', 'O', 'M'],
 ['O', 'O', 'O', 'O', 'M', 'O', 'M', 'M', 'O'],
 ['O', 'O', 'M', 'O', 'O', 'M', 'M', 'O'],
 ['O', 'O', 'M', 'M', 'O', 'O', 'M'],
 ['O', 'M', 'O', 'M', 'M', 'O', 'O', 'M', 'M', 'M', 'M', 'M'],
 ['O', 'M', 'M', 'M'],
 ['O', 'M', 'M'],
 ['O', 'M', 'O', 'O', 'M'],
 ['O', 'M', 'M', 'M', 'M', 'O'],
 ['O', 'M', 'O', 'M', 'O', 'O', 'M', 'M', 'O', 'O'],
 ['O', 'M', 'M', 'O', 'M', 'M', 'O'],
 ['O', 'O', 'M', 'O', 'M', 'O', 'M', 'O', 'M', 'O'],
 ['O', 'O', 'M', 'O', 'O', 'O', 'M', 'M', 'O', 'O', 'M'],
 ['O', 'O', 'O', 'M', 'O', 'M'],
 ['M', 'O', 'O', 'M'],
 ['O', 'O', 'M', 'O', 'M', 'O', 'O', 'M', 'O', 'M', 'M'],
 ['M', 'O', 'M', 'M', 'M', 'O', 'M', 'M', 'O', 'O', 'M', 'M', 'O'],
 ['O', 'M', 'O', 'O', 'M'],
 ['M', 'M', 'O', 'M', 'M', 'O', 'O', 'M'],
 ['O', 'M', 'M', 'O', 'O', 'O', 'M', 'M', 'M', 'O'],
 ['O', 'M',