In [1]:
def loadLibFolder (folder):
    import os, sys
    if folder not in sys.path:
        sys.path.insert(1, os.path.join(sys.path[0], folder))

# Experimenting with POS dependency parser
To be able to predict a category out of a sentence/text it is assumed that the POS tags and the dependency tree could have an inpact on the result. Here we investigate that relation

In [2]:
from urllib import request, parse
import json
url = 'http://localhost:1337/sentence/'

## Sample text to try out the parser

In [3]:
def parseSentence(sentence):
    try:
        sentence = request.quote(sentence)
        f =  request.urlopen(url + sentence)
        res = json.loads(f.read().decode('latin1'))
        return res
    except:
        return {'sentenceData': []}
def onlyNounsAndVerbs(data):
    return {
        'sentenceData': [word for word in data['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')]
    }
def untilLevel(level, data):
    return {
        'sentenceData': [word for word in data['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')]
    }
def toWordArray(data):
    return [word['word'] for word in data['sentenceData']]

In [4]:
res = parseSentence('Han ler mot henne och hela hans ansikte säger att han älskar henne med hela sitt hjärta')

In [5]:
# Example filtering
print ("Raw data:")
print (res)
print ("All words:")
print ([word['word'] for word in res['sentenceData']])
print ("Level three data:")
print ([word['word']+ '::' + word['tag'].split('|')[0] for word in res['sentenceData'] if (int)(word['parent']) <= 3])
print ("Only nouns and verbs:")
print ([word['word'] for word in res['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')])

Raw data:
{'sentenceData': [{'identifier': '1', 'word': 'han', 'base_word': 'han', 'tag': 'PN|UTR|SIN|DEF|SUB', 'parent': '2'}, {'identifier': '2', 'word': 'ler', 'base_word': 'le', 'tag': 'VB|PRS|AKT', 'parent': '0'}, {'identifier': '3', 'word': 'mot', 'base_word': 'mot', 'tag': 'PP', 'parent': '2'}, {'identifier': '4', 'word': 'henne', 'base_word': 'hon', 'tag': 'PN|UTR|SIN|DEF|OBJ', 'parent': '3'}, {'identifier': '5', 'word': 'och', 'base_word': 'och', 'tag': 'KN', 'parent': '8'}, {'identifier': '6', 'word': 'hela', 'base_word': 'hel', 'tag': 'JJ|POS|UTR/NEU|SIN|DEF|NOM', 'parent': '8'}, {'identifier': '7', 'word': 'hans', 'base_word': 'hans', 'tag': 'PS|UTR/NEU|SIN/PLU|DEF', 'parent': '8'}, {'identifier': '8', 'word': 'ansikte', 'base_word': 'ansikte', 'tag': 'NN|NEU|SIN|IND|NOM', 'parent': '9'}, {'identifier': '9', 'word': 'säger', 'base_word': 'säga', 'tag': 'VB|PRS|AKT', 'parent': '3'}, {'identifier': '10', 'word': 'att', 'base_word': 'att', 'tag': 'SN', 'parent': '9'}, {'identi

## Classification experiment

In [None]:
loadLibFolder('../gensim')

import gensim_documents
import dotenv

Using Theano backend.


In [None]:
data = gensim_documents.MMDBDocumentLists(dotenv.get('ARTICLE_PATH', '.') + '/csv_by_category_uuid-filter/', useHeading=True, limit=1000)
dictionary = gensim_documents.VectorDictionary()
for doc in data: dictionary.addToDictionary(" ".join(untilLevel(3, onlyNounsAndVerbs(parseSentence(doc.content.split(".")[0])))), doc.category)

### LSTM classification with keras LSTM cells

In [10]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
import numpy as np

In [11]:
data_dim = len(dictionary[0])
timesteps = 8
num_classes = 10

In [12]:

# Generate dummy training data
x_train = np.random.random((3000, timesteps, data_dim))
y_train = np.random.random((3000, num_classes))

# Generate dummy validation data
x_val = np.random.random((2000, timesteps, data_dim))
y_val = np.random.random((2000, num_classes))

In [14]:
len(x_val[0][0])

16

To train a Sequential LSTM model that can classify a stacked sequence of words we need to define the input as follows:
 * batch_size - number of datapoints in the dataset
 * timesteps - the number of words per sequence
 * data_dim - the number of features per word instance

In [107]:
# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(32, return_sequences=True,
               input_shape=(timesteps, data_dim)))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(32))  # return a single vector of dimension 32
model.add(Dense(10, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val))

Train on 3000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x27517935908>

In [109]:
model.evaluate(x_val, y_val)



[11.532712387084961, 0.1045]

In [72]:
x_val

array([[[ 0.09628296,  0.67249097,  0.4890543 , ...,  0.29885446,
          0.89960575,  0.19898042],
        [ 0.94419746,  0.37159193,  0.47884219, ...,  0.86476085,
          0.78584042,  0.34626552],
        [ 0.55632149,  0.39933993,  0.77890599, ...,  0.0398117 ,
          0.98907483,  0.90146782],
        ..., 
        [ 0.20993563,  0.44732187,  0.23160779, ...,  0.56499002,
          0.32134186,  0.18139511],
        [ 0.67799434,  0.34190327,  0.13457328, ...,  0.3103999 ,
          0.07947997,  0.92366944],
        [ 0.5702733 ,  0.34225729,  0.81988819, ...,  0.05292903,
          0.37115567,  0.76821646]],

       [[ 0.00983867,  0.73681855,  0.29397279, ...,  0.41512319,
          0.36352249,  0.20193833],
        [ 0.58469941,  0.54892885,  0.74738472, ...,  0.83592186,
          0.92938897,  0.04370301],
        [ 0.28451859,  0.93110384,  0.07664203, ...,  0.12766366,
          0.50895263,  0.12491126],
        ..., 
        [ 0.47032118,  0.39299207,  0.89978608, ...,

In [116]:
dotenv.get('ARTICLE_PATH', '.')

NameError: name 'dotenv' is not defined