In [1]:
def loadLibFolder (folder):
    import os, sys
    if folder not in sys.path:
        sys.path.insert(1, os.path.join(sys.path[0], folder))

# Experimenting with POS dependency parser
To be able to predict a category out of a sentence/text it is assumed that the POS tags and the dependency tree could have an inpact on the result. Here we investigate that relation

In [2]:
from urllib import request, parse
import json
url = 'http://localhost:1337/sentence/'

## Sample text to try out the parser

In [3]:
def parseSentence(sentence):
    try:
        sentence = request.quote(sentence)
        f =  request.urlopen(url + sentence)
        res = json.loads(f.read().decode('latin1'))
        return res
    except:
        return {'sentenceData': []}
def onlyNounsAndVerbs(data):
    return {
        'sentenceData': [word for word in data['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')]
    }
def untilLevel(level, data):
    return {
        'sentenceData': [word for word in data['sentenceData'] if (int)(word['parent']) <= level]
    }
def toWordArray(data):
    return [word['base_word'] for word in data['sentenceData']]

In [4]:
res = parseSentence('Han ler mot henne och hela hans ansikte säger att han älskar henne med hela sitt hjärta')

In [5]:
# Example filtering
print ("Raw data:")
print (res)
print ("All words:")
print ([word['word'] for word in res['sentenceData']])
print ("Level three data:")
print ([word['word']+ '::' + word['tag'].split('|')[0] for word in res['sentenceData'] if (int)(word['parent']) <= 3])
print ("Only nouns and verbs:")
print ([word['word'] for word in res['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')])

print(" ".join(toWordArray(untilLevel(3, onlyNounsAndVerbs(parseSentence(res))))))

Raw data:
{'sentenceData': [{'identifier': '1', 'word': 'han', 'base_word': 'han', 'tag': 'PN|UTR|SIN|DEF|SUB', 'parent': '2'}, {'identifier': '2', 'word': 'ler', 'base_word': 'le', 'tag': 'VB|PRS|AKT', 'parent': '0'}, {'identifier': '3', 'word': 'mot', 'base_word': 'mot', 'tag': 'PP', 'parent': '2'}, {'identifier': '4', 'word': 'henne', 'base_word': 'hon', 'tag': 'PN|UTR|SIN|DEF|OBJ', 'parent': '3'}, {'identifier': '5', 'word': 'och', 'base_word': 'och', 'tag': 'KN', 'parent': '8'}, {'identifier': '6', 'word': 'hela', 'base_word': 'hel', 'tag': 'JJ|POS|UTR/NEU|SIN|DEF|NOM', 'parent': '8'}, {'identifier': '7', 'word': 'hans', 'base_word': 'hans', 'tag': 'PS|UTR/NEU|SIN/PLU|DEF', 'parent': '8'}, {'identifier': '8', 'word': 'ansikte', 'base_word': 'ansikte', 'tag': 'NN|NEU|SIN|IND|NOM', 'parent': '9'}, {'identifier': '9', 'word': 'säger', 'base_word': 'säga', 'tag': 'VB|PRS|AKT', 'parent': '3'}, {'identifier': '10', 'word': 'att', 'base_word': 'att', 'tag': 'SN', 'parent': '9'}, {'identi

## Classification experiment

In [6]:
loadLibFolder('../gensim')

import os
import gensim
import gensim_documents
import dotenv
import numpy as np
dotenv.load()

Using TensorFlow backend.


In [68]:
limit_per_category = 2000
use_cache = False
use_all_data = True

In [69]:
categories = []
x_data = []
y_data = []
timesteps = 20
model = gensim.models.Doc2Vec.load(dotenv.get('DOC2VEC_MODEL'))

if use_cache and os.path.isfile('data/tmp_dependency_data'):
    with open('data/tmp_dependency_data_lvl4', 'r', encoding='utf-8', errors='ignore') as tmp_cache_file:
        for category in tmp_cache_file:
            category = category[:-1]
            if category == "\n": continue
            if category not in categories:
                print ("TT", category)
                categories.append(category)
            sentVecs = []
            while True:
                sentence = tmp_cache_file.readline()[:-1]
                if sentence == "":
                    break
                artvec = model.infer_vector(doc_words=sentence.split())
                sentVecs.append(gensim.matutils.unitvec(artvec))
            
            stepsToAdd = max([0, timesteps - len(sentVecs)])
            for i in range(stepsToAdd):
                if stepsToAdd <= 0: break
                sentVecs.append(np.zeros(300))
            y_data.append(categories.index(category))
            x_data.append(sentVecs[:timesteps])
else:
    data = gensim_documents.MMDBDocumentLists(dotenv.get('ARTICLE_PATH', '.') + '/csv_by_category/', useHeading=True, limit=limit_per_category)
    tmp_cache_file = None
    if not use_all_data: tmp_cache_file =  open('data/tmp_dependency_data', 'w', encoding='utf-8', errors='ignore')
    for i, doc in enumerate(data):
        if not doc.category in categories:
            categories.append(doc.category)
        if tmp_cache_file != None:
            tmp_cache_file.write(doc.category + "\n")

        sentences = doc.content.split(".")
        sentVecs = []
        for j in range(timesteps):
            if j >= len(sentences): 
                sentVecs.append(np.zeros(300))
                continue
            if use_all_data:
                sentence = sentences[j]
            else:
                sentence = " ".join(toWordArray(untilLevel(3, onlyNounsAndVerbs(parseSentence(sentences[j])))))
            if sentence == "":
                sentVecs.append(np.zeros(300))
                continue
            artvec = model.infer_vector(doc_words=sentence.split())
            sentVecs.append(gensim.matutils.unitvec(artvec))
            if tmp_cache_file != None:
                tmp_cache_file.write(sentence + "\n")
        
        if tmp_cache_file != None:
            tmp_cache_file.write("\n")
        x_data.append(sentVecs)
        y_data.append(categories.index(doc.category))

        if i % (limit_per_category/4) == 0:
            print ("New epoch started, nr.", i+1, " of ", len(categories) * limit_per_category, " epochs")


New epoch started, nr. 1  of  2000  epochs
New epoch started, nr. 501  of  14000  epochs
New epoch started, nr. 1001  of  14000  epochs
New epoch started, nr. 1501  of  14000  epochs
New epoch started, nr. 2001  of  14000  epochs
New epoch started, nr. 2501  of  14000  epochs
New epoch started, nr. 3001  of  14000  epochs
New epoch started, nr. 3501  of  14000  epochs
New epoch started, nr. 4001  of  14000  epochs
New epoch started, nr. 4501  of  14000  epochs
New epoch started, nr. 5001  of  14000  epochs
New epoch started, nr. 5501  of  14000  epochs
New epoch started, nr. 6001  of  14000  epochs
New epoch started, nr. 6501  of  14000  epochs
New epoch started, nr. 7001  of  14000  epochs
New epoch started, nr. 7501  of  14000  epochs
New epoch started, nr. 8001  of  14000  epochs
New epoch started, nr. 8501  of  14000  epochs
New epoch started, nr. 9001  of  14000  epochs
New epoch started, nr. 9501  of  14000  epochs
New epoch started, nr. 10001  of  14000  epochs
New epoch started



In [70]:
min([len(x) for x in x_data])

20

#### Encode one hot vectors for the classes

In [71]:
y_data_one_hot = np.zeros((len(y_data), len(categories)))
y_data_one_hot[np.arange(len(y_data)), np.array(y_data)] = 1


### LSTM classification with keras LSTM cells

In [72]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
import numpy as np

In [73]:
data_dim = len(x_data[0][0])
timesteps = len(x_data[0])
num_classes = len(categories)
n_layers = 10

In [74]:
split = 0.4
limit_train = (int)(len(x_data) * split)
# Generate dummy training data
x_train = x_data[:limit_train]
y_train = y_data_one_hot[:limit_train]

# Generate dummy validation data
x_val = x_data[limit_train:]
y_val = y_data_one_hot[limit_train:]

In [75]:
print(len(x_val))
print(len(x_val[0]))
print(len(x_val[0][0]))

print(len(x_train))
print(len(x_train[0]))
print(len(x_train[0][0]))
print(len(categories))

print(y_train)

8396
20
300
5597
20
300
7
[[ 1.  0.  0. ...,  0.  0.  0.]
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 ..., 
 [ 0.  1.  0. ...,  0.  0.  0.]
 [ 0.  0.  1. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


To train a Sequential LSTM model that can classify a stacked sequence of words we need to define the input as follows:
 * batch_size - number of datapoints in the dataset
 * timesteps - the number of words per sequence
 * data_dim - the number of features per word instance

In [76]:
# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(50, return_sequences=True,
               input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
for layer in range(n_layers-2):
    model.add(LSTM(50, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(50))  # return a single vector of dimension 32
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val))

Train on 5597 samples, validate on 8396 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1984cb7ff28>

In [57]:
model.evaluate(x_val, y_val)



[1.4604822566760274, 0.68008575509308977]

In [109]:
prediction = model.predict(np.array(x_val))

In [101]:
copy_prediction = prediction

In [110]:
copy_prediction = [[1.0 if max(y) == i else 0.0 for i in y] for y in prediction]

In [111]:
copy_prediction

[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
 [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 

In [119]:
y_val

array([[ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.],
       ..., 
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [134]:
from sklearn.metrics import confusion_matrix
import pandas

In [135]:
conf_mat = confusion_matrix([categories[y.argmax()] for y in y_val], [categories[y.argmax()] for y in np.array(copy_prediction)])

In [138]:
pandas.DataFrame(conf_mat, columns=categories, index=categories)

Unnamed: 0,Allmänt,Blåljus,Ekonomi,Kultur,Nöje,Släkt o vänner,Sport
Allmänt,500,328,87,100,51,90,43
Blåljus,363,769,17,6,7,20,17
Ekonomi,179,11,800,34,40,107,28
Kultur,54,2,28,802,181,112,20
Nöje,38,7,21,243,804,48,39
Släkt o vänner,81,4,38,230,68,700,79
Sport,23,12,12,10,99,44,1000
