In [1]:
def loadLibFolder (folder):
    import os, sys
    if folder not in sys.path:
        sys.path.insert(1, folder)

# Experimenting with POS dependency parser
To be able to predict a category out of a sentence/text it is assumed that the POS tags and the dependency tree could have an inpact on the result. Here we investigate that relation

In [2]:
from urllib import request, parse
import json
url = 'http://localhost:1337/sentence/'

## Sample text to try out the parser

In [3]:
def parseSentence(sentence):
    try:
        sentence = request.quote(sentence)
        f =  request.urlopen(url + sentence)
        res = json.loads(f.read().decode('latin1'))
        return res
    except:
        return {'sentenceData': []}
def onlyNounsAndVerbs(data):
    return {
        'sentenceData': [word for word in data['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')]
    }
def untilLevel(level, data):
    return {
        'sentenceData': [word for word in data['sentenceData'] if (int)(word['parent']) <= level]
    }
def toWordArray(data):
    return [word['base_word'] for word in data['sentenceData']]

In [4]:
res = parseSentence('Han ler mot henne och hela hans ansikte säger att han älskar henne med hela sitt hjärta')

In [5]:
# Example filtering
print ("Raw data:")
print (res)
print ("All words:")
print ([word['word'] for word in res['sentenceData']])
print ("Level three data:")
print ([word['word']+ '::' + word['tag'].split('|')[0] for word in res['sentenceData'] if (int)(word['parent']) <= 3])
print ("Only nouns and verbs:")
print ([word['word'] for word in res['sentenceData'] if 'NN' in word['tag'].split('|') or 'VB' in word['tag'].split('|')])

print(" ".join(toWordArray(untilLevel(3, onlyNounsAndVerbs(parseSentence(res))))))

Raw data:
{'sentenceData': [{'identifier': '1', 'word': 'han', 'base_word': 'han', 'tag': 'PN|UTR|SIN|DEF|SUB', 'parent': '2'}, {'identifier': '2', 'word': 'ler', 'base_word': 'le', 'tag': 'VB|PRS|AKT', 'parent': '0'}, {'identifier': '3', 'word': 'mot', 'base_word': 'mot', 'tag': 'PP', 'parent': '2'}, {'identifier': '4', 'word': 'henne', 'base_word': 'hon', 'tag': 'PN|UTR|SIN|DEF|OBJ', 'parent': '3'}, {'identifier': '5', 'word': 'och', 'base_word': 'och', 'tag': 'KN', 'parent': '8'}, {'identifier': '6', 'word': 'hela', 'base_word': 'hel', 'tag': 'JJ|POS|UTR/NEU|SIN|DEF|NOM', 'parent': '8'}, {'identifier': '7', 'word': 'hans', 'base_word': 'hans', 'tag': 'PS|UTR/NEU|SIN/PLU|DEF', 'parent': '8'}, {'identifier': '8', 'word': 'ansikte', 'base_word': 'ansikte', 'tag': 'NN|NEU|SIN|IND|NOM', 'parent': '9'}, {'identifier': '9', 'word': 'säger', 'base_word': 'säga', 'tag': 'VB|PRS|AKT', 'parent': '3'}, {'identifier': '10', 'word': 'att', 'base_word': 'att', 'tag': 'SN', 'parent': '9'}, {'identi

## Classification experiment

In [5]:
import sys
print(sys.path)

['/usr/lib/python35.zip', '/usr/lib/python35.zip/../gensim', '/usr/lib/python35.zip/../gensim', '/usr/lib/python35.zip/../gensim', '/usr/lib/python3.5', '/usr/lib/python3.5/plat-x86_64-linux-gnu', '/usr/lib/python3.5/lib-dynload', '', '/home/johlin/Documents/Projects/auto-categorizer/venv/lib/python3.5/site-packages', '/home/johlin/Documents/Projects/auto-categorizer/venv/lib/python3.5/site-packages/IPython/extensions', '/home/johlin/.ipython']


In [2]:
loadLibFolder('../gensim')

import os
import gensim
import gensim_documents
import dotenv
import numpy as np
import random
dotenv.load()

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
limit_per_category = 5000
use_cache = False
use_all_data = True

In [4]:
categories = []
x_data = []
y_data = []

words = []
timesteps = 30
timestep_range = 300
model = gensim.models.Doc2Vec.load(dotenv.get('DOC2VEC_MODEL'))

if use_cache and os.path.isfile('data/tmp_dependency_data'):
    with open('data/tmp_dependency_data_lvl4', 'r', encoding='utf-8', errors='ignore') as tmp_cache_file:
        for category in tmp_cache_file:
            category = category[:-1]
            if category == "\n": continue
            if category == 'Allmänt': continue
            if category == 'Kultur': category = 'Nöje'
            if category not in categories:
                print ("TT", category)
                categories.append(category)
            sentVecs = []
            while True:
                sentence = tmp_cache_file.readline()[:-1]
                if sentence == "":
                    break
                _words = sentence.split()
                for w in _words:
                    if w not in words:
                        words.append(w)
                
                # sentVecs.append(gensim.matutils.unitvec(model.infer_vector(doc_words=sentence.split())))
                sentVecs.append([words.index(_words[iw]) + 1 if iw < len(_words) else 0 for iw in range(timestep_range)])
            
            stepsToAdd = max([0, timesteps - len(sentVecs)])
            for i in range(stepsToAdd):
                if stepsToAdd <= 0: break
                sentVecs.append(np.zeros(300))
            y_data.append(categories.index(category))
            x_data.append(sentVecs[:timesteps])
else:
    data = gensim_documents.MMDBDocumentLists(dotenv.get('ARTICLE_PATH', '.') + '/csv_by_category/', useHeading=True, limit=limit_per_category)
    tmp_cache_file = None
    ids = []
    if not use_all_data: tmp_cache_file =  open('data/tmp_dependency_data', 'w', encoding='utf-8', errors='ignore')
    for i, doc in enumerate(data):
        if doc.category == 'Allmänt': continue
        if not doc.category in categories:
            categories.append(doc.category)
        if doc.pageid in ids: 
            print("copy of article found", doc.pageid)
            continue
        else: ids.append(doc.pageid)
        if tmp_cache_file != None:
            tmp_cache_file.write(doc.category + "\n")

        sentences = doc.content.split(".")
        sentVecs = []
        for j in range(timesteps):
            if j >= len(sentences): 
                sentVecs.append(np.zeros(timestep_range))
                continue
            if use_all_data:
                sentence = sentences[j]
            else:
                sentence = " ".join(toWordArray(untilLevel(3, onlyNounsAndVerbs(parseSentence(sentences[j])))))
            if sentence == "":
                sentVecs.append(np.zeros(timestep_range))
                continue
            _words = sentence.split()
            for w in _words:
                if w not in words:
                    words.append(w)
            # sentVecs.append(gensim.matutils.unitvec(model.infer_vector(doc_words=_words)))
            sentVecs.append([words.index(_words[iw]) + 1 if iw < len(_words) else 0 for iw in range(timestep_range)])
            if tmp_cache_file != None:
                tmp_cache_file.write(sentence + "\n")
        
        if tmp_cache_file != None:
            tmp_cache_file.write("\n")
        x_data.append(sentVecs)
        y_data.append(categories.index(doc.category))

        if i % (limit_per_category/4) == 0 and i != 0:
            print ("New epoch started, nr.", i, "of", len(categories) * limit_per_category, "epochs", 100 * float(i) / float(len(categories) * limit_per_category), " %")


NotImplementedError: unknown URI scheme 'c' in 'C:\\Users\\desktop-godesity\\Documents\\Text2Abstract\\gensim\\trained-sources\\doc2vec_MM_2000a_allc.model'

In [7]:
data = gensim_documents.MMDBDocumentLists('../MM/csv_by_category/', useHeading=True, limit=5000)
articles = [(a.content, a.category) for a in data if a.category != 'Allmänt']
random.shuffle(articles)
categories = list(set(list(zip(*articles))[1]))

In [24]:

model = gensim.models.Doc2Vec.load('../gensim/trained-sources/doc2vec_MM_14000a_original_linear_allc.model')
articles_labels, articles_vectors = zip(*[
    (categories.index(article[1]),
      [gensim.matutils.unitvec(model.infer_vector(doc_words=sentence.split(' ')))
       for sentence in article[0].split('.')]
    ) for article in articles
])

In [25]:
timesteps = 30
articles_fixed_vectors = [
    [article[i] if len(article) > i else np.zeros(len(articles_vectors[0][0])) 
     for i in range(timesteps)] 
    for article in articles_vectors
]

In [26]:
y_data = articles_labels
x_data = articles_fixed_vectors

#### Encode one hot vectors for the classes

In [27]:
y_data_one_hot = np.zeros((len(y_data), len(categories)))
y_data_one_hot[np.arange(len(y_data)), np.array(y_data)] = 1


### LSTM classification with keras LSTM cells

In [28]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Embedding
from keras.layers import LSTM
import numpy as np

Using TensorFlow backend.


In [29]:
data_dim = len(x_data[0][0])
timesteps = len(x_data[0])
num_classes = len(categories)
n_layers = 4

In [52]:
split = 0.2
limit_train = (int)(len(x_data) * split)
# Generate dummy training data
x_train = x_data[:limit_train]
y_train = y_data_one_hot[:limit_train]

# Generate dummy validation data
x_val = x_data[limit_train:]
y_val = y_data_one_hot[limit_train:]

In [53]:
print(len(x_val))
print(len(x_val[0]))
print(len(x_val[0][0]))

print(len(x_train))
print(len(x_train[0]))
print(len(x_train[0][0]))
print(len(categories))

print(y_train)

9600
30
100
2400
30
100
6
[[0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]]


To train a Sequential LSTM model that can classify a stacked sequence of words we need to define the input as follows:
 * batch_size - number of datapoints in the dataset
 * timesteps - the number of words per sequence
 * data_dim - the number of features per word instance

In [54]:
##### expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
input_shape = (timesteps, data_dim,)
print(np.array(x_train).shape,np.array(x_val).shape)
print(input_shape, num_classes)
model.add(LSTM(50, return_sequences=True,
               input_shape=input_shape))  # returns a sequence of vectors of dimension 32
for layer in range(n_layers-2):
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=True))  # returns a sequence of vectors of dimension 32
model.add(LSTM(50))  # return a single vector of dimension 32
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit([x_train], [y_train], epochs=15, validation_data=([x_val], [y_val]))

(2400, 30, 100) (9600, 30, 100)
(30, 100) 6
Train on 2400 samples, validate on 9600 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7fd2ef6e7470>

In [55]:
prediction = model.predict(np.array(x_val))

In [56]:
copy_prediction = prediction

In [57]:
min_proba = min([max(y) for y in copy_prediction])

mean_proba = sum([sum(y) for y in copy_prediction]) / (len(copy_prediction) * len(copy_prediction[0]))

len([y for y in copy_prediction if max(y) > mean_proba])

9600

Treat all predictions above min_proba/avg_proba probability as correct answers

In [58]:
copy_prediction = [[1.0 if proba in [x for x in prediction_instance if (x >= min_proba and prediction_instance.tolist().index(x) == y_val[index].tolist().index(1.0))] \
                    else 0.0 \
                        for proba in prediction_instance] \
                    for index, prediction_instance in enumerate(prediction)]
copy_prediction = [[1.0 if prediction[i].tolist().index(max(prediction[i])) == j or proba == 1.0 else 0.0 \
                        for j, proba in enumerate(prediction_instance)] \
                   for i, prediction_instance in enumerate(copy_prediction)]

In [59]:
len([x for index, x in enumerate(copy_prediction)  if (x.index(1.0) == y_val[index].tolist().index(1.0))])/len(copy_prediction)

0.614375

In [60]:
from sklearn.metrics import confusion_matrix
import pandas

In [61]:
conf_mat = confusion_matrix([categories[y.argmax()] for y in y_val], [categories[y.argmax()] for y in np.array(copy_prediction)])

In [62]:
pandas.DataFrame(conf_mat, columns=categories, index=categories)

Unnamed: 0,Släkt o vänner,Nöje,Ekonomi,Blåljus,Kultur,Sport
Släkt o vänner,982,97,47,37,420,4
Nöje,94,1047,98,43,305,13
Ekonomi,45,74,1091,93,288,7
Blåljus,118,65,145,971,300,14
Kultur,49,91,123,34,1300,10
Sport,649,98,23,157,161,507


In [51]:
conf_mat = confusion_matrix([categories[y.argmax()] for y in y_val], [categories[0] for y in y_val])
tmp_pd = pandas.DataFrame(conf_mat, columns=['Count'] + categories[1:], index=categories)
pandas.DataFrame(tmp_pd['Count'], columns=['Count'])

Unnamed: 0,Count
Släkt o vänner,0
Nöje,0
Ekonomi,0
Blåljus,0
Kultur,0
Sport,0


In [2]:
loadLibFolder('../gensim')
import word2vec_train
word2vec_train.main()

Using Theano backend.
2018-05-22 00:16:12,181 : INFO : collecting all words and their counts
2018-05-22 00:16:12,182 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-05-22 00:16:12,721 : INFO : PROGRESS: at sentence #10000, processed 2257122 words, keeping 135772 word types
2018-05-22 00:16:12,936 : INFO : collected 167118 word types from a corpus of 3173357 raw words and 14000 sentences
2018-05-22 00:16:12,937 : INFO : Loading a fresh vocabulary
2018-05-22 00:16:13,138 : INFO : min_count=5 retains 34825 unique words (20% of original 167118, drops 132293)
2018-05-22 00:16:13,139 : INFO : min_count=5 leaves 2972284 word corpus (93% of original 3173357, drops 201073)
2018-05-22 00:16:13,235 : INFO : deleting the raw counts dictionary of 167118 items
2018-05-22 00:16:13,240 : INFO : sample=0.001 downsamples 36 most-common words
2018-05-22 00:16:13,241 : INFO : downsampling leaves estimated 2281922 word corpus (76.8% of prior 2972284)
2018-05-22 00:16:13,242 

In [3]:
loadLibFolder('../gensim')
import CNN_test

Using Theano backend.


AttributeError: Can't get attribute 'Word2VecKeyedVectors' on <module 'gensim.models.keyedvectors' from 'C:\\Users\\desktop-godesity\\Anaconda3\\lib\\site-packages\\gensim\\models\\keyedvectors.py'>

In [7]:
import gensim

d2v_model = gensim.models.Word2Vec.load('../gensim/trained-sources/word2vec_MM_180521.model')

2018-05-23 09:06:54,721 : INFO : loading Word2Vec object from ../gensim/trained-sources/word2vec_MM_180521.model
2018-05-23 09:06:55,524 : INFO : loading wv recursively from ../gensim/trained-sources/word2vec_MM_180521.model.wv.* with mmap=None
2018-05-23 09:06:55,525 : INFO : setting ignored attribute syn0norm to None
2018-05-23 09:06:55,526 : INFO : setting ignored attribute cum_table to None
2018-05-23 09:06:55,527 : INFO : loaded ../gensim/trained-sources/word2vec_MM_180521.model


In [8]:
d2v_model.wv.closer_than(['Sundsvall', 'Stockholm', 'Karlstad', 'bostad']) #d2v_model.vocab.keys

AttributeError: 'KeyedVectors' object has no attribute 'closer_than'

In [150]:
d2v_model.wv.most_similar_to_given('Sundsvall', 'Mittuniversitetet', 'MittMedia')

AttributeError: 'KeyedVectors' object has no attribute 'most_similar_to_given'

In [4]:
dir(d2v_model.wv.wv)

NameError: name 'd2v_model' is not defined