In [119]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
import math
from collections import defaultdict
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

In [125]:
train_set = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))
test_set = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))

train_set = train_set[:100]

In [126]:
def words_and_labels(data_set):
    
    words = []
    labels = []
    for sentence,label in data_set:
        for w,l in zip(sentence,label):
            words.append(w)
            labels.append(l)
    
    return words,labels

train_words,train_label = words_and_labels(train_set)
test_words,test_label = words_and_labels(test_set)

In [127]:
def sentences_and_labels(data_set):
    
    sentences = []
    labels = []
    for sentence,label in data_set:
        sentences.append(sentence)
        labels.append(label)
    
    return sentences,labels

train_sentences,_ = sentences_and_labels(train_set)

In [128]:
def train_one_hot(train_data):
    
    words_set = set()
    words_dict = defaultdict(int)
    for word in train_data:
        words_set.add(word)
    words_set = list(words_set)
    
    for id,word in enumerate(words_set):
        words_dict[word] = id+1
    
    return words_dict
    
train_words_dict = train_one_hot(train_words)
train_labels_dict = train_one_hot(train_label)

print(train_labels_dict)

defaultdict(<class 'int'>, {'X': 1, 'ADJ': 2, 'ADP+DET': 3, 'PUNCT': 4, 'VERB': 5, 'INTJ': 6, 'ADP+PRON': 7, 'PROPN': 8, 'SCONJ': 9, 'ADV': 10, 'CCONJ': 11, 'PRON': 12, 'ADP': 13, 'DET': 14, 'NUM': 15, 'AUX': 16, 'NOUN': 17})


In [129]:
def test_one_hot(test_data,train_dict):
    words_dict = defaultdict(int)
    for word in test_data:
        if word in train_dict:
            words_dict[word] = train_dict[word]
        else:
            words_dict[word] = 0
    return words_dict

test_words_dict = test_one_hot(test_words,train_words_dict)
test_labels_dict = test_one_hot(test_label,train_labels_dict)

print(test_labels_dict)

defaultdict(<class 'int'>, {'DET': 14, 'NOUN': 17, 'ADP+DET': 3, 'VERB': 5, 'ADP': 13, 'NUM': 15, 'ADJ': 2, 'PUNCT': 4, 'AUX': 16, 'ADV': 10, 'CCONJ': 11, 'PRON': 12, 'SCONJ': 9, 'PROPN': 8, 'PART': 0, 'X': 1, 'ADP+PRON': 7, 'INTJ': 6, 'ADP+ADP': 0})


In [130]:
def feature_window(i, sentence,words_dict,l=2):
    '''
    i : the index of the word in the context
    context : the sentence
    l : a window of size is 2*l+1
    
    return : list of features which are tuple (feature_name, value)
    '''
    
    res = []
    
    word = words_dict[sentence[i]]
    
    for k in range(1,l+1):
        
        if i-k >= 0:
            res.append((words_dict[sentence[i-k]],word))
            
        if i+k<len(sentence):
            res.append((word,words_dict[sentence[i+k]]))
        
    return res

print(feature_window(7,train_sentences[4],train_words_dict))

[(76, 528), (528, 775), (781, 528), (528, 657)]


In [131]:
def feature_suffix(i,sentence):
    
    res = []
    
    for k in range(1,len(sentence[i])):
        res.append('suffix_'+sentence[i][k:])
        
    return res

In [132]:
def feature_shape(i, sentence):
    '''
    i : the index of the word in the context
    context : the sentence
    
    return : list of features which are tuple (feature_name, value)
    '''
    def has_digit(s):
        '''
        check if a string has digit or nor
        '''
        return any(c.isdigit() for c in s)
    
    res = []
    
    word = sentence[i]
    
       ## different orthographic
    # banary feature indicating whether the word starts with a capital letter or not, 1:yes, 0:not
    if word.istitle():
        res.append('start_capital')
    # banary feature indicating whether the word is made of all capital letters or not, 1:yes, 0:not
    if word.isupper():
        res.append('only_capital')
    # banary feature indicating whether the word has a digit or not, 1:yes, 0:not
    if has_digit(word):
        res.append('has_digit')
    # banary feature indicating whether the word has a hyphen or not, 1:yes, 0:not
    if '-' in word:
        res.append('has_hyphen')
    # banary feature indicating whether the word has a low hyphen or not, 1:yes, 0:not
    if '_' in word:
        res.append('has_hyphen_low')
    # banary feature indicating whether the letters in the word are all alphanumeric or not, 1:yes, 0:not
    if not word.isalnum():
        res.append('not_alnum')
    # binary feature indicating whether the length of word is more than 3
    if len(word) > 3:
        res.append('word_len_>_3')
    
    if '\'' in word:
        res.append('abbr')
    
    return res

In [133]:
def collect_features_and_labels(data_set,words_dict,labels_dict):
    
    data = []
    label = []
    
    #punct = pick_out_punct(data_set)
    
    #bigram_left,bigram_right = get_bigram(data_set)
    
    for sentence,labels in data_set:
        
        for i in range(len(sentence)):
              
            data_of_word = []

            data_of_word += feature_window(i, sentence, words_dict)
            #data_of_word += feature_suffix(i, sentence)
            #data_of_word += feature_shape(i, sentence)
            #data_of_word += feature_distributional(i, sentence, bigram_left,'_left_')

            data += data_of_word
            label.append(labels_dict[labels[i]])
            
    return data,label

In [134]:
train_data,train_labels = collect_features_and_labels(train_set,train_words_dict,train_labels_dict)
#test_data,test_labels = collect_features_and_labels(test_set,test_words_dict,test_labels_dict)

In [137]:
print(len(train_data))
print(len(train_labels))

17168
4442


In [138]:
print(len(train_words_dict))

1372


In [145]:
word_size = len(train_words_dict)
batch_size = len(train_data)
embedding_size = 20

context_pair = train_data

inputs = [train_words_dict[x[0]] for x in context_pair]
labels = [train_words_dict[x[1]] for x in context_pair]

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size,])

embeddings = tf.Variable(
    tf.random_uniform([word_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

nce_weights = tf.Variable(
    tf.truncated_normal([word_size,embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))

nce_biases = tf.Variable(tf.zeros([word_size]))

prediction = tf.add(tf.matmul(embed, tf.transpose(nce_weights)), nce_biases)
train_labels_vector = tf.one_hot(train_labels,word_size)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=train_labels_vector))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)
session = tf.Session()
init = tf.global_variables_initializer()
session.run(init)
for iteration in range(0,10):
    total_loss = 0

    feed_dict = {train_inputs: inputs, train_labels: labels}
    _, cur_loss,pred= session.run([optimizer, loss, prediction], feed_dict=feed_dict)
    print('%s: loss: %s' %(iteration,cur_loss))



    
    
    
    

0: loss: 9.017371
1: loss: 1.463558
2: loss: 9.417523e-06
3: loss: 9.417523e-06
4: loss: 9.417523e-06
5: loss: 9.417523e-06
6: loss: 9.417523e-06
7: loss: 9.417523e-06
8: loss: 9.417523e-06
9: loss: 9.417523e-06


In [146]:
print(pred[:2])

[[19.70553     0.32988912 -0.9683957  ... -0.07941938  0.59390706
  -0.46528682]
 [19.70553     0.32988912 -0.9683957  ... -0.07941938  0.59390706
  -0.46528682]]


In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC(C=1)
clf.fit(train_data[:1000],train_labels[:1000])

In [83]:
train_hat = clf.predict(train_data[:1000])
test_hat = clf.predict(test_data[:1000])
print(train_data[:10])
print(test_data[:10])

[[200, -1, 548, -1, 25196], [548, 200, 25196, -1, 12252], [25196, 548, 12252, 200, 14927], [12252, 25196, 14927, 548, 6316], [14927, 12252, 6316, 25196, 1], [6316, 14927, 1, 12252, 2608], [1, 6316, 2608, 14927, 16389], [2608, 1, 16389, 6316, 6688], [16389, 2608, 6688, 1, 1489], [6688, 16389, 1489, 2608, 18644]]
[[26658, -1, 17841, -1, 19210], [17841, 26658, 19210, -1, 10374], [19210, 17841, 10374, 26658, 19210], [10374, 19210, 19210, 17841, 18832], [19210, 10374, 18832, 19210, 20491], [18832, 19210, 20491, 10374, 6012], [20491, 18832, 6012, 19210, 16471], [6012, 20491, 16471, 18832, 21630], [16471, 6012, 21630, 20491, 19950], [21630, 16471, 19950, 6012, 21630]]


In [82]:
print('train accuracy:', accuracy_score(train_hat,train_labels[:1000]))
print('test accuracy', accuracy_score(test_hat,test_labels[:1000]))

train accuracy: 1.0
test accuracy 0.275


In [40]:
clf.support_vectors_

array([[ 2.6080e+03,  1.0000e+00,  1.6389e+04,  6.3160e+03,  6.6880e+03],
       [ 1.2002e+04,  1.9286e+04,  2.3506e+04,  1.4503e+04,  1.6160e+04],
       [ 7.4520e+03,  1.9220e+03,  8.3860e+03,  1.9286e+04, -1.0000e+00],
       [ 1.3742e+04,  2.0916e+04,  1.2252e+04,  9.6400e+03,  1.1479e+04],
       [ 1.9106e+04,  8.9810e+03,  2.4400e+04,  1.9945e+04,  2.4867e+04],
       [ 4.5150e+03,  1.5617e+04,  1.0869e+04,  3.1080e+03,  5.4800e+02],
       [ 1.4092e+04,  2.2350e+04,  2.4400e+04,  1.9391e+04,  1.9490e+03],
       [ 4.4240e+03,  2.6975e+04,  2.1630e+04,  1.7663e+04,  2.2514e+04],
       [ 1.9210e+04,  5.8480e+03,  1.0374e+04,  1.9286e+04,  1.9210e+04],
       [ 1.9210e+04,  1.0374e+04,  1.3138e+04,  1.9210e+04,  3.6580e+03],
       [ 1.9490e+03,  2.4400e+04,  8.6420e+03,  1.4092e+04,  1.5671e+04],
       [ 1.9210e+04,  1.5671e+04,  8.1590e+03,  8.6420e+03,  6.6880e+03],
       [ 5.4800e+02,  2.0000e+02,  2.5196e+04, -1.0000e+00,  1.2252e+04],
       [ 5.4800e+02,  5.2490e+03,  8.0