In [1]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict

In [43]:
train_set = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))[:1000]
test_set = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))[:1000]

In [44]:
def words_and_labels(data_set):
    
    words = []
    labels = []
    for sentence,label in data_set:
        for w,l in zip(sentence,label):
            words.append(w)
            labels.append(l)
    
    return words,labels

train_words,train_label = words_and_labels(train_set)
test_words,test_label = words_and_labels(test_set)

In [45]:
def sentences_and_labels(data_set):
    
    sentences = []
    labels = []
    for sentence,label in data_set:
        sentences.append(sentence)
        labels.append(label)
    
    return sentences,labels

train_sentences,_ = sentences_and_labels(train_set)

In [46]:
def train_one_hot(train_data):
    
    words_set = set()
    words_dict = defaultdict(int)
    for word in train_data:
        words_set.add(word)
    words_set = list(words_set)
    
    for id,word in enumerate(words_set):
        words_dict[word] = id
    
    return words_dict
    
train_words_dict = train_one_hot(train_words)
train_labels_dict = train_one_hot(train_label)

print(train_labels_dict)

defaultdict(<class 'int'>, {'ADP+PRON': 0, 'PUNCT': 1, 'SCONJ': 2, 'ADV': 3, 'PROPN': 4, 'INTJ': 5, 'X': 6, 'DET': 7, 'PRON': 8, 'ADP': 9, 'VERB': 10, 'NOUN': 11, 'NUM': 12, 'CCONJ': 13, 'ADJ': 14, 'AUX': 15, 'PART': 16, 'ADP+DET': 17})


In [47]:
def feature_window(i, sentence,words_dict,l=2):
    '''
    i : the index of the word in the context
    context : the sentence
    l : a window of size is 2*l+1
    
    return : list of features which are tuple (feature_name, value)
    '''
    
    res = np.zeros(len(words_dict)*5)
    
    word = words_dict[sentence[i]]
    
    res[words_dict[sentence[i]]+len(words_dict)*2] = 1
    
    for k in range(1,l+1):
        
        if i-k >= 0:
            res[words_dict[sentence[i-k]]+len(words_dict)*(2-k)]
            
        if i+k<len(sentence):
            res[words_dict[sentence[i+k]]+len(words_dict)*(2+k)]
     
    return res

res = feature_window(7,train_sentences[4],train_words_dict)
print(res.shape)

(29190,)


In [48]:
def collect_features_and_labels(data_set,words_dict,labels_dict):
    
    data = []
    label = []
    
    #punct = pick_out_punct(data_set)
    
    #bigram_left,bigram_right = get_bigram(data_set)
    
    for sentence,labels in data_set:
        
        for i in range(len(sentence)):
              
            data_of_word = []

            data_of_word.append(list(feature_window(i, sentence, words_dict)))
            #data_of_word += feature_suffix(i, sentence)
            #data_of_word += feature_shape(i, sentence)
            #data_of_word += feature_distributional(i, sentence, bigram_left,'_left_')

            data += data_of_word
            label.append(labels_dict[labels[i]])
            
    return data,label

In [None]:
datas,labels = collect_features_and_labels(train_set,train_words_dict,train_labels_dict)

In [28]:
print(datas[10])

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

In [31]:
datas = np.array(datas)
labels = np.array(labels)

In [33]:
print(datas.shape)
print(labels.shape)

(4442, 6860)
(4442,)


In [40]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC()
clf.fit(datas,labels)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [41]:
train_hat = clf.predict(datas)
print('train accuracy:', accuracy_score(train_hat,labels))

train accuracy: 0.2075641602881585


In [42]:
print(train_hat[:200])
print(labels[:20])

[11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11]
[ 3  1  8  3 10  2  7 14 11  9 11  9  7 11  9 11  1 10  7 11]
