In [245]:
import json
import numpy as np
import matplotlib.pyplot as plt
import time
from collections import defaultdict
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior() 

In [246]:
train_set = json.load(open('./corpus/fr/fr.ftb.train.json', encoding = 'utf-8'))
test_set = json.load(open('./corpus/fr/fr.ftb.test.json', encoding = 'utf-8'))

train_set = train_set

In [247]:
def words_and_labels(data_set):
    
    words = []
    labels = []
    for sentence,label in data_set:
        for w,l in zip(sentence,label):
            words.append(w)
            labels.append(l)
    
    return words,labels

train_words,train_label = words_and_labels(train_set)
test_words,test_label = words_and_labels(test_set)

In [248]:
def sentences_and_labels(data_set):
    
    sentences = []
    labels = []
    for sentence,label in data_set:
        sentences.append(sentence)
        labels.append(label)
    
    return sentences,labels

train_sentences,_ = sentences_and_labels(train_set)

In [249]:
def train_one_hot(train_data):
    
    words_set = set()
    words_dict = defaultdict(int)
    for word in train_data:
        words_set.add(word)
    words_set = list(words_set)
    
    for id,word in enumerate(words_set):
        words_dict[word] = id
    
    return words_dict
    
train_words_dict = train_one_hot(train_words)
train_labels_dict = train_one_hot(train_label)

print(train_labels_dict)

defaultdict(<class 'int'>, {'INTJ': 0, 'PUNCT': 1, 'NUM': 2, 'ADP+PRON': 3, 'ADJ': 4, 'ADV': 5, 'PRON': 6, 'AUX': 7, 'X': 8, 'SCONJ': 9, 'VERB': 10, 'ADP': 11, 'ADP+DET': 12, 'CCONJ': 13, 'PART': 14, 'PROPN': 15, 'DET': 16, 'NOUN': 17})


In [250]:
'''
label_of_words_dict = {}
for key in train_words_dict.keys():
    for word,label in zip(train_words,train_label):
        if key == word:
            label_of_words_dict[train_words_dict[word]] = train_labels_dict[label]
            
print(label_of_words_dict[117])
'''

'\nlabel_of_words_dict = {}\nfor key in train_words_dict.keys():\n    for word,label in zip(train_words,train_label):\n        if key == word:\n            label_of_words_dict[train_words_dict[word]] = train_labels_dict[label]\n            \nprint(label_of_words_dict[117])\n'

In [251]:
def test_one_hot(test_data,train_dict):
    words_dict = defaultdict(int)
    for word in test_data:
        if word in train_dict:
            words_dict[word] = train_dict[word]
        else:
            words_dict[word] = 0
    return words_dict

test_words_dict = test_one_hot(test_words,train_words_dict)
test_labels_dict = test_one_hot(test_label,train_labels_dict)

print(test_labels_dict)

defaultdict(<class 'int'>, {'DET': 16, 'NOUN': 17, 'ADP+DET': 12, 'VERB': 10, 'ADP': 11, 'NUM': 2, 'ADJ': 4, 'PUNCT': 1, 'AUX': 7, 'ADV': 5, 'CCONJ': 13, 'PRON': 6, 'SCONJ': 9, 'PROPN': 15, 'PART': 14, 'X': 8, 'ADP+PRON': 3, 'INTJ': 0, 'ADP+ADP': 0})


In [252]:
def feature_window(i, sentence,words_dict,l=2):
    '''
    i : the index of the word in the context
    context : the sentence
    l : a window of size is 2*l+1
    
    return : list of features which are tuple (feature_name, value)
    '''
    
    res = []
    
    word = words_dict[sentence[i]]
    
    for k in range(1,l+1):
        
        if i-k >= 0:
            res.append((words_dict[sentence[i-k]],word))
            
        if i+k<len(sentence):
            res.append((word,words_dict[sentence[i+k]]))
        
    return res

print(feature_window(7,train_sentences[4],train_words_dict))

[(8985, 26780), (26780, 21921), (13683, 26780), (26780, 7898)]


In [253]:
def feature_suffix(i,sentence):
    
    res = []
    
    for k in range(1,len(sentence[i])):
        res.append('suffix_'+sentence[i][k:])
        
    return res

In [254]:
def feature_shape(word):
    '''
    i : the index of the word in the context
    context : the sentence
    
    return : list of features which are tuple (feature_name, value)
    '''
    def has_digit(s):
        '''
        check if a string has digit or nor
        '''
        return any(c.isdigit() for c in s)
    
    res = []
    
       ## different orthographic
    # banary feature indicating whether the word starts with a capital letter or not, 1:yes, 0:not
    res.append(1 if word.istitle() else 0)
    # banary feature indicating whether the word is made of all capital letters or not, 1:yes, 0:not
    res.append(1 if word.isupper() else 0)
    # banary feature indicating whether the word has a digit or not, 1:yes, 0:not
    res.append(1 if has_digit(word) else 0)
    # banary feature indicating whether the word has a hyphen or not, 1:yes, 0:not
    res.append(1 if '-' in word else 0)
    # banary feature indicating whether the word has a low hyphen or not, 1:yes, 0:not
    res.append(1 if '_' in word else 0)
    # banary feature indicating whether the letters in the word are all alphanumeric or not, 1:yes, 0:not
    res.append(1 if not word.isalnum() else 0)
    # binary feature indicating whether the length of word is more than 3
    res.append(1 if len(word) > 3 else 0)

    res.append(1 if '\'' in word else 0)

    return res

In [255]:
def collect_features_and_labels(data_set,words_dict,labels_dict):
    
    data = []
    label = []
    
    #punct = pick_out_punct(data_set)
    
    #bigram_left,bigram_right = get_bigram(data_set)
    
    for sentence,labels in data_set:
        
        for i in range(len(sentence)):
              
            data_of_word = []
            
            data_of_word += feature_window(i, sentence, words_dict)

            data += data_of_word
            label.append(labels_dict[labels[i]])
            
    return data,label

In [256]:
def collect_other_features(words_dict):
    
    other_features = []
    #punct = pick_out_punct(data_set)
    
    #bigram_left,bigram_right = get_bigram(data_set)
    
    for word in words_dict.keys():
        
        other_feature = []

        other_feature += feature_shape(str(word))

        other_features.append(other_feature)
            
    return np.array(other_features)

In [257]:
train_data,train_labels = collect_features_and_labels(train_set,train_words_dict,train_labels_dict)
#test_data,test_labels = collect_features_and_labels(test_set,test_words_dict,test_labels_dict)
other_feature = collect_other_features(train_words_dict)


In [258]:
print(len(train_data))
print(len(train_labels))

1680394
442228


In [259]:
print(len(train_words_dict))
#print(train_words_dict)
print(train_data[0])


27127
(4153, 19757)


In [261]:
word_size = len(train_words_dict)
batch_size = 10000
embedding_size = 100

context_pair = train_data

inputs = [x[0] for x in context_pair]
labels = [x[1] for x in context_pair]

print(inputs[:10])
print(labels[:10])

emb_train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
emb_train_labels = tf.placeholder(tf.int32, shape=[batch_size,])

embeddings = tf.Variable(
    tf.random_uniform([word_size, embedding_size], -1.0, 1.0))

embed = tf.nn.embedding_lookup(embeddings, emb_train_inputs)

nce_weights = tf.Variable(
    tf.truncated_normal([word_size,embedding_size],
                        stddev=1.0 / np.sqrt(embedding_size)))

nce_biases = tf.Variable(tf.zeros([word_size]))

prediction = tf.add(tf.matmul(embed, tf.transpose(nce_weights)), nce_biases)

train_labels_vector = tf.one_hot(emb_train_labels,word_size)

loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=train_labels_vector))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

session = tf.Session()
init = tf.global_variables_initializer()
session.run(init)

for iteration in range(0,5):
    total_loss = 0
    
    for i in range(int(len(inputs)/batch_size)):
        feed_dict = {emb_train_inputs: inputs[i:i+batch_size], emb_train_labels: labels[i:i+batch_size]}
        _, cur_loss,pred= session.run([optimizer, loss, prediction], feed_dict=feed_dict)
        print('%s: loss: %s' %(iteration,cur_loss))



[4153, 4153, 4153, 19757, 19757, 19757, 4836, 4153, 4836, 4836]
[19757, 4836, 19757, 4836, 15401, 4836, 15401, 4836, 12764, 15401]
0: loss: 10.348618
0: loss: 10.318377
0: loss: 10.288097
0: loss: 10.257918
0: loss: 10.227615
0: loss: 10.197393
0: loss: 10.16708
0: loss: 10.136783
0: loss: 10.106353
0: loss: 10.076146
0: loss: 10.045656
0: loss: 10.015247
0: loss: 9.98475
0: loss: 9.954364
0: loss: 9.923791
0: loss: 9.893293
0: loss: 9.862786
0: loss: 9.832128
0: loss: 9.801491
0: loss: 9.770954
0: loss: 9.740245
0: loss: 9.709492
0: loss: 9.678748
0: loss: 9.647994
0: loss: 9.617119
0: loss: 9.586285
0: loss: 9.555525
0: loss: 9.524768
0: loss: 9.494012
0: loss: 9.463584
0: loss: 9.433148
0: loss: 9.40297
0: loss: 9.372975
0: loss: 9.343113
0: loss: 9.313692
0: loss: 9.28453
0: loss: 9.255855
0: loss: 9.227308
0: loss: 9.199173
0: loss: 9.17132
0: loss: 9.143838
0: loss: 9.116605
0: loss: 9.089972
0: loss: 9.063699
0: loss: 9.037311
0: loss: 9.011711
0: loss: 8.986027
0: loss: 8.96064

2: loss: 5.059038
2: loss: 5.053519
2: loss: 5.048362
2: loss: 5.043108
2: loss: 5.0374236
2: loss: 5.0326514
2: loss: 5.0269866
2: loss: 5.0218697
2: loss: 5.0167165
2: loss: 5.0109315
2: loss: 5.006338
2: loss: 5.0006633
2: loss: 4.995759
2: loss: 4.990439
2: loss: 4.9846706
2: loss: 4.9794564
2: loss: 4.973837
2: loss: 4.969036
2: loss: 4.9638352
2: loss: 4.958871
2: loss: 4.953597
2: loss: 4.9480724
2: loss: 4.9433055
2: loss: 4.938396
2: loss: 4.933521
2: loss: 4.928733
2: loss: 4.923167
2: loss: 4.9176393
2: loss: 4.9127493
2: loss: 4.9074316
2: loss: 4.902587
2: loss: 4.8973517
2: loss: 4.8922186
2: loss: 4.8868876
2: loss: 4.8813267
2: loss: 4.875934
2: loss: 4.8708053
2: loss: 4.8654513
2: loss: 4.860354
2: loss: 4.855289
2: loss: 4.8498406
2: loss: 4.84492
2: loss: 4.8394947
2: loss: 4.8341517
2: loss: 4.8293834
2: loss: 4.825291
2: loss: 4.8204055
2: loss: 4.8162394
2: loss: 4.8110876
2: loss: 4.806621
2: loss: 4.80226
2: loss: 4.7983584
2: loss: 4.794093
2: loss: 4.7897186


In [105]:
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings),1,keepdims = True))
normalized_embeddings = embeddings/norm
final_embeddings = normalized_embeddings.eval(session=session)

In [106]:
x = np.array(final_embeddings)
#print(x[:10])
print(x[0])

[ 0.07144159 -0.02048308 -0.18019179 -0.17500421  0.04086721 -0.09685335
  0.08644658  0.04763339  0.04065529  0.12077523 -0.17735511 -0.0179803
  0.03753648  0.07633837 -0.12286776 -0.03059785 -0.07418384  0.01390408
 -0.0608811   0.11169364  0.04537367 -0.14498189  0.14247872  0.02494019
 -0.06840494  0.11314536  0.01086279  0.05335548  0.02231991  0.0577679
  0.0774535  -0.01736649  0.1810616  -0.07715011  0.06870876 -0.15158826
 -0.07663272  0.05579735  0.09992765  0.08475394  0.00346566  0.00508084
  0.17807399  0.08736991 -0.17152134 -0.05012795  0.10294256 -0.01720108
 -0.1170314  -0.17135015  0.06319413  0.16413617  0.10353651 -0.02969321
 -0.12981857 -0.1381176   0.02662657 -0.1492878   0.07284576  0.01076832
  0.13512233  0.08191831  0.1385327  -0.09846588  0.05624063  0.11234745
 -0.10418097 -0.01521527 -0.09857556 -0.07638319  0.02226683  0.0150952
 -0.11109056 -0.03853367  0.17157394 -0.15474941 -0.0029797   0.00135735
 -0.01586181 -0.14694291 -0.03453944  0.12328327  0.05

In [262]:
y = np.array(embeddings.eval(session=session))

of = np.array(other_feature)
print(y.shape)
print(of.shape)

y_final = np.hstack((y,of))
print(y_final.shape)

(27127, 100)
(27127, 8)
(27127, 108)


In [229]:
'''
kind = defaultdict(list)
for i in range(x.shape[0]):
    for j in range(len(test_labels_dict)):
        if label_of_words_dict[i] == j:
            kind[j].append(i)

plt.figure(figsize=(5,5))
plt.scatter(x[kind[17],0],x[kind[17],1],alpha=0.6)
'''

'\nkind = defaultdict(list)\nfor i in range(x.shape[0]):\n    for j in range(len(test_labels_dict)):\n        if label_of_words_dict[i] == j:\n            kind[j].append(i)\n\nplt.figure(figsize=(5,5))\nplt.scatter(x[kind[17],0],x[kind[17],1],alpha=0.6)\n'

In [263]:
real_data = []
for word in train_words:
    real_data.append(y_final[train_words_dict[word]])

print(len(real_data))
print(len(train_labels))

442228
442228


In [236]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC(C=1)
clf.fit(real_data,train_labels)



SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [240]:
train_hat = clf.predict(real_data)
#test_hat = clf.predict(test_data[:1000])
#print(real_data[:10])
print(train_hat[0:15])
print(train_labels[0:15])
#print(test_data[:10])

[15  1 17  5 17  9 16 17 17 11 17 11 16 17 11]
[5, 1, 6, 5, 10, 9, 16, 4, 17, 11, 17, 11, 16, 17, 11]


In [241]:
print(train_hat[0:50])
print(train_labels[0:50])

[15  1 17  5 17  9 16 17 17 11 17 11 16 17 11 17  1 17 16 17 12  2 17  1
  5  7  5  7 16 17 13 16 17  1 17 16 17  1  5 10  5 16  5 17  1 15 10  6
  9 16]
[5, 1, 6, 5, 10, 9, 16, 4, 17, 11, 17, 11, 16, 17, 11, 17, 1, 10, 16, 17, 12, 2, 17, 1, 5, 7, 5, 10, 16, 4, 13, 16, 17, 1, 10, 16, 17, 1, 5, 10, 5, 16, 5, 4, 1, 5, 10, 6, 9, 16]


In [243]:
print('train accuracy:', accuracy_score(train_hat,train_labels))
#print('test accuracy', accuracy_score(test_hat,test_labels[:1000]))

train accuracy: 0.8025891735401759
