# Decision Tree model

Features extracted:
- Window feature
- Suffix feature
- Shape feature

Best performance: accuracy **92.26%** on corpus $fr.ftb$

In [2]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict

Import the data sets and choose one as our corpus.


In [3]:
train_set_ftb = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))
test_set_ftb = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))
train_set_gsd = json.load(open('./corpus/fr/fr.gsd.train.json', encoding = 'utf-8'))
test_set_gsd = json.load(open('./corpus/fr/fr.gsd.test.json', encoding = 'utf-8'))
train_set_partut = json.load(open('./corpus/fr/fr.partut.train.json', encoding = 'utf-8'))
test_set_partut = json.load(open('./corpus/fr/fr.partut.test.json', encoding = 'utf-8'))
train_set_pud = json.load(open('./corpus/fr/fr.pud.train.json', encoding = 'utf-8'))
test_set_pud = json.load(open('./corpus/fr/fr.pud.test.json', encoding = 'utf-8'))
train_set_sequoia = json.load(open('./corpus/fr/fr.sequoia.train.json', encoding = 'utf-8'))
test_set_sequoia = json.load(open('./corpus/fr/fr.sequoia.test.json', encoding = 'utf-8'))
train_set_spoken = json.load(open('./corpus/fr/fr.spoken.train.json', encoding = 'utf-8'))
test_set_spoken = json.load(open('./corpus/fr/fr.spoken.test.json', encoding = 'utf-8'))

test_set_foot = json.load(open('./corpus/fr/fr.foot.test.json', encoding = 'utf-8'))
test_set_natdis = json.load(open('./corpus/fr/fr.natdis.test.json', encoding = 'utf-8'))

In [4]:
train_set = train_set_ftb
test_set = test_set_ftb

**words_and_labels**: Seperate the words and the labels from the corpus:

In [5]:
def words_and_labels(data_set):
    
    words = []
    labels = []
    for sentence,label in data_set:
        for w,l in zip(sentence,label):
            words.append(w)
            labels.append(l)
    
    return words,labels

train_words,train_label = words_and_labels(train_set)
test_words,test_label = words_and_labels(test_set)

**suffix_one_hot**: Build a dictionary of suffix for train set, the keys of the dictionary are the suffix in type string and the values are their id number. 

In [6]:
def suffix_one_hot(train_words):
    
    suffix_dict = defaultdict(int)
    
    for word in train_words:
        for k in range(1,len(word)):
            suffix_dict[word[k:]] += 1
    return suffix_dict

suffix_dict = suffix_one_hot(train_words)

**train_one_hot**: Build a dictionary of words for train set words_dict { 'word' : id }, the keys of the dictionary are the words in type string and the values are their id number.

In [7]:
def train_one_hot(train_data):
    
    words_set = set()
    words_dict = defaultdict(int)
    for word in train_data:
        words_set.add(word)
    words_set = list(words_set)
    
    for id,word in enumerate(words_set):
        words_dict[word] = id
    
    return words_dict
    
train_words_dict = train_one_hot(train_words)
train_labels_dict = train_one_hot(train_label)

**test_one_hot**: Build a dictionary of words for test set, the same id numbers from the dictionary created with the train set are used to represent the word in test set. The words which only appear in test set (the OOV words) are also marked -1.

In [8]:
def test_one_hot(test_data,train_dict):
    words_dict = defaultdict(int)
    for word in test_data:
        if word in train_dict:
            words_dict[word] = train_dict[word]
        else:
            words_dict[word] = -1
    return words_dict

test_words_dict = test_one_hot(test_words,train_words_dict)
test_labels_dict = test_one_hot(test_label,train_labels_dict)

**feature_window**: Return a list of the window features.

In [9]:
def feature_window(i, sentence, words_dict, l=2):
    
    res = []
    
    word = words_dict[sentence[i]]
    res.append(word)
    
    for k in range(1,l+1):
        
        res.append(words_dict[sentence[i-k]] if i-k>=0 else -1)
        res.append(words_dict[sentence[i+k]] if i+k<len(sentence) else -1)
        
    return res

**feature_suffix**: Return a list of the suffix features. The number of suffix is fix to 10, the words which doesn't have enough suffix will own some -1.

In [10]:
def feature_suffix(i,sentence):
    
    res = []
    
    for k in range(1,10):
        if sentence[i][-k:] in suffix_dict:
            res.append(suffix_dict[sentence[i][-k:]])
        else:
            res.append(-1)
    return res

**feature_shape**: Return a list of the shape features.

In [22]:
def feature_shape(i,sentence):
    
    word = sentence[i]

    def has_digit(s):

        return any(c.isdigit() for c in s)
    
    res = []
    
    res.append(1 if word.istitle() else 0)
    
    res.append(1 if word.isupper() else 0)
    
    res.append(1 if has_digit(word) else 0)

    res.append(1 if '-' in word else 0)
    
    res.append(1 if '_' in word else 0)
    
    res.append(1 if not word.isalnum() else 0)
    
    res.append(1 if len(word) > 3 else 0)

    res.append(1 if '\'' in word else 0)

    return res

**collect_features_and_labels**: The 'main' function for extracting features.

In [23]:
def collect_features_and_labels(data_set,words_dict,labels_dict):
    
    data = []
    label = []
    
    for sentence,labels in data_set:
        
        for i in range(len(sentence)):
            
            data_of_word = []

            data_of_word += feature_window(i, sentence, words_dict)
            data_of_word += feature_suffix(i, sentence)
            data_of_word += feature_shape(i,sentence)

            data.append(np.array(data_of_word))
        
            label.append(labels_dict[labels[i]])
            
    return data,label

**oov_features_and_labels**: Construct the data set for the OOV words.

In [24]:
def oov_features_and_labels(train_data,test_data,test_label):
    

    data = []
    labels = []

    for word,label in zip(test_data,test_label):
        if word[0] == -1:
            data.append(word)
            labels.append(label)
            
    return data,labels

**ambiguous_features_and_labels**: Construct the data set for the ambiguous words.

In [25]:
def ambiguous_features_and_labels(input_data,input_label):
    
    words = defaultdict(lambda: set())
    data = []
    labels = []
    
    for word,label in zip(input_data,input_label):
        words[word[0]].add(label)
    
    for word,label in zip(input_data,input_label):
        if(len(words[word[0]]) > 1):
            data.append(word)
            labels.append(label)
    
    return data,labels
        

Collect all the features from train set and test set:

In [26]:
begin = time.time()
train_data,train_label = collect_features_and_labels(train_set,train_words_dict,train_labels_dict)
end = time.time()
print('total time = ',end - begin)

test_data,test_label = collect_features_and_labels(test_set,test_words_dict,test_labels_dict)

total time =  3.9818809032440186


In [27]:
print(train_data[10])

[19392 12844  2620 12444 21417 97943 12341  1055   166     7     7    -1
    -1    -1     0     0     0     0     0     0     1     0]


Construct the **OOV** set and the **Ambuguous** set:

In [28]:
oov_data,oov_label = oov_features_and_labels(train_data,test_data,test_label)
ambiguous_data,ambiguous_label = ambiguous_features_and_labels(train_data,train_label)

In [29]:
train_data = np.array(train_data)
train_label = np.array(train_label)
test_data = np.array(test_data)
test_label = np.array(test_label)
oov_data = np.array(oov_data)
oov_label = np.array(oov_label)
ambiguous_data = np.array(ambiguous_data)
ambiguous_label = np.array(ambiguous_label)

In [30]:
print(train_data.shape)
print(test_data.shape)
print(oov_data.shape)
print(ambiguous_data.shape)

(442228, 22)
(75073, 22)
(2529, 22)
(212787, 22)


Import the Decision Tree model, training and prediction.

In [31]:
from sklearn import tree
tagger = tree.DecisionTreeClassifier()
tagger.fit(train_data,train_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [32]:
from sklearn.metrics import accuracy_score

train_hat = tagger.predict(train_data)
print('train accuracy:', accuracy_score(train_hat,train_label))

train accuracy: 0.9993668424432646


In [33]:
test_hat = tagger.predict(test_data)
print('test accuracy:', accuracy_score(test_hat,test_label))
amb_hat = tagger.predict(ambiguous_data)
print('amb accuracy:', accuracy_score(amb_hat,ambiguous_label))
oov_hat = tagger.predict(oov_data)
print('oov accuracy:', accuracy_score(oov_hat,oov_label))

test accuracy: 0.9242603865570844
amb accuracy: 0.9986841301395292
oov accuracy: 0.5599051008303677
