In [2]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict

In [3]:
train_set_ftb = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))
test_set_ftb = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))
train_set_gsd = json.load(open('./corpus/fr/fr.gsd.train.json', encoding = 'utf-8'))
test_set_gsd = json.load(open('./corpus/fr/fr.gsd.test.json', encoding = 'utf-8'))
train_set_partut = json.load(open('./corpus/fr/fr.partut.train.json', encoding = 'utf-8'))
test_set_partut = json.load(open('./corpus/fr/fr.partut.test.json', encoding = 'utf-8'))
train_set_pud = json.load(open('./corpus/fr/fr.pud.train.json', encoding = 'utf-8'))
test_set_pud = json.load(open('./corpus/fr/fr.pud.test.json', encoding = 'utf-8'))
train_set_sequoia = json.load(open('./corpus/fr/fr.sequoia.train.json', encoding = 'utf-8'))
test_set_sequoia = json.load(open('./corpus/fr/fr.sequoia.test.json', encoding = 'utf-8'))
train_set_spoken = json.load(open('./corpus/fr/fr.spoken.train.json', encoding = 'utf-8'))
test_set_spoken = json.load(open('./corpus/fr/fr.spoken.test.json', encoding = 'utf-8'))

test_set_foot = json.load(open('./corpus/fr/fr.foot.test.json', encoding = 'utf-8'))
test_set_natdis = json.load(open('./corpus/fr/fr.natdis.test.json', encoding = 'utf-8'))

In [233]:
train_set = train_set_ftb
test_set = test_set_natdis

In [234]:
def words_and_labels(data_set):
    
    words = []
    labels = []
    for sentence,label in data_set:
        for w,l in zip(sentence,label):
            words.append(w)
            labels.append(l)
    
    return words,labels

train_words,train_label = words_and_labels(train_set)
test_words,test_label = words_and_labels(test_set)

In [235]:
def suffix_one_hot(train_words):
    
    suffix_dict = defaultdict(int)
    
    for word in train_words:
        for k in range(1,len(word)):
            suffix_dict[word[k:]] += 1
    return suffix_dict

suffix_dict = suffix_one_hot(train_words)

In [236]:
def train_one_hot(train_data):
    
    words_set = set()
    words_dict = defaultdict(int)
    for word in train_data:
        words_set.add(word)
    words_set = list(words_set)
    
    for id,word in enumerate(words_set):
        words_dict[word] = id
    
    return words_dict
    
train_words_dict = train_one_hot(train_words)
train_labels_dict = train_one_hot(train_label)

In [237]:
def test_one_hot(test_data,train_dict):
    words_dict = defaultdict(int)
    for word in test_data:
        if word in train_dict:
            words_dict[word] = train_dict[word]
        else:
            words_dict[word] = -1
    return words_dict

test_words_dict = test_one_hot(test_words,train_words_dict)
test_labels_dict = test_one_hot(test_label,train_labels_dict)

In [238]:
def feature_window(i, sentence, words_dict, l=2):
    
    res = []
    
    word = words_dict[sentence[i]]
    res.append(word)
    
    for k in range(1,l+1):
        
        res.append(words_dict[sentence[i-k]] if i-k>=0 else -1)
        res.append(words_dict[sentence[i+k]] if i+k<len(sentence) else -1)
        
    return res

In [239]:
def feature_suffix(i,sentence):
    
    res = []
    
    for k in range(1,10):
        if sentence[i][-k:] in suffix_dict:
            res.append(suffix_dict[sentence[i][-k:]])
        else:
            res.append(-1)
    return res

In [240]:
x = 'jialinbao'
print(x[-10:])

jialinbao


In [241]:
def feature_shape(word):
    '''
    i : the index of the word in the context
    context : the sentence
    
    return : list of features which are tuple (feature_name, value)
    '''
    def has_digit(s):
        '''
        check if a string has digit or nor
        '''
        return any(c.isdigit() for c in s)
    
    res = []
    
       ## different orthographic
    # banary feature indicating whether the word starts with a capital letter or not, 1:yes, 0:not
    res.append(1 if word.istitle() else 0)
    # banary feature indicating whether the word is made of all capital letters or not, 1:yes, 0:not
    res.append(1 if word.isupper() else 0)
    # banary feature indicating whether the word has a digit or not, 1:yes, 0:not
    res.append(1 if has_digit(word) else 0)
    # banary feature indicating whether the word has a hyphen or not, 1:yes, 0:not
    res.append(1 if '-' in word else 0)
    # banary feature indicating whether the word has a low hyphen or not, 1:yes, 0:not
    res.append(1 if '_' in word else 0)
    # banary feature indicating whether the letters in the word are all alphanumeric or not, 1:yes, 0:not
    res.append(1 if not word.isalnum() else 0)
    # binary feature indicating whether the length of word is more than 3
    res.append(1 if len(word) > 3 else 0)

    res.append(1 if '\'' in word else 0)

    return res

In [242]:
def collect_features_and_labels(data_set,words_dict,labels_dict):
    
    data = []
    label = []
    
    for sentence,labels in data_set:
        
        for i in range(len(sentence)):
            
            data_of_word = []

            data_of_word += feature_window(i, sentence, words_dict)
            data_of_word += feature_suffix(i, sentence)
            data_of_word += feature_shape(sentence[i])


            data.append(np.array(data_of_word))
        
            label.append(labels_dict[labels[i]])
            
    return data,label

In [243]:
def oov_features_and_labels(train_data,test_data,test_label):
    

    data = []
    labels = []

    for word,label in zip(test_data,test_label):
        if word[0] == -1:
            data.append(word)
            labels.append(label)
            
    return data,labels

In [244]:
def ambiguous_features_and_labels(input_data,input_label):
    
    words = defaultdict(lambda: set())
    data = []
    labels = []
    
    for word,label in zip(input_data,input_label):
        words[word[0]].add(label)
    
    for word,label in zip(input_data,input_label):
        if(len(words[word[0]]) > 1):
            data.append(word)
            labels.append(label)
    
    return data,labels
        

In [245]:
begin = time.time()
train_data,train_label = collect_features_and_labels(train_set,train_words_dict,train_labels_dict)
end = time.time()
print('total time = ',end - begin)

test_data,test_label = collect_features_and_labels(test_set,test_words_dict,test_labels_dict)

total time =  4.211343050003052


In [246]:
print(train_data[10])

[ 5009 17227 11960 11501 23709 97943 12341  1055   166     7     7    -1
    -1    -1     0     0     0     0     0     0     1     0]


In [247]:
oov_data,oov_label = oov_features_and_labels(train_data,test_data,test_label)
ambiguous_data,ambiguous_label = ambiguous_features_and_labels(train_data,train_label)

In [248]:
train_data = np.array(train_data)
train_label = np.array(train_label)
test_data = np.array(test_data)
test_label = np.array(test_label)
oov_data = np.array(oov_data)
oov_label = np.array(oov_label)
ambiguous_data = np.array(ambiguous_data)
ambiguous_label = np.array(ambiguous_label)

In [249]:
print(train_data.shape)
print(test_label[10])
print(test_data.shape)
print(oov_data.shape)

(442228, 22)
5
(12044, 22)
(2235, 22)


In [250]:
from sklearn import tree
tagger = tree.DecisionTreeClassifier()
tagger.fit(train_data,train_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [251]:
from sklearn.metrics import accuracy_score

train_hat = tagger.predict(train_data)
print('train accuracy:', accuracy_score(train_hat,train_label))

train accuracy: 0.9993668424432646


In [252]:
test_hat = tagger.predict(test_data)
print('test accuracy:', accuracy_score(test_hat,test_label))
amb_hat = tagger.predict(ambiguous_data)
print('amb accuracy:', accuracy_score(amb_hat,ambiguous_label))
oov_hat = tagger.predict(oov_data)
print('oov accuracy:', accuracy_score(oov_hat,oov_label))

test accuracy: 0.7467618731318498
amb accuracy: 0.9986841301395292
oov accuracy: 0.18165548098434003
