In [None]:
import random
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from keras.layers import Embedding, Dense, Input, Flatten, Concatenate, Dropout
from keras.models import Model
from keras.utils import to_categorical, plot_model
import pydot
import keras.optimizers as kop
from keras import backend as K
import json

In [2]:
# Lookup tables filled while reading the relations file:
#   movies_map    title    -> integer id
#   people_map    person   -> integer id
#   relations_map relation -> list of (title, person) pairs
movies_map, relations_map, people_map = {}, {}, {}

# Vocabulary: starts as a set of words and is turned into a word -> index
# mapping once all sources have been read.
dictionary = set()

# Upper bound on the number of lines read from the relations file.
# Use a small value (e.g. 1000) for a quick smoke run.
max_tuples = 1000000

# Prefix for every artefact written by this notebook (JSON, PDF, model file).
MODEL_NAME = "IMDB_ONTO_EMBEDING2"

In [3]:
# Read (title, relation, person) triples from the tab-separated relations file.
# While reading we:
#   * assign integer ids to movies and people in order of first appearance,
#   * group the (title, person) pairs by relation name,
#   * add every whitespace-separated token to the vocabulary set.
# The encoding is stated explicitly so the result does not depend on the
# platform default (the word-list readers below use utf8 as well).
with open('../data/relations.tsv', encoding='utf8') as fp:
    for i, line in enumerate(fp):
        # Cap on the number of lines read; skipped lines count towards it.
        if i >= max_tuples:
            break

        line = line.lower().strip()
        if not line:
            # A blank line (e.g. at the end of the file) carries no triple and
            # would otherwise break the three-way unpacking below.
            continue

        title, relation, person = line.split("\t")
        if relation == 'self':
            continue

        # setdefault evaluates len(...) before inserting, so a new key gets
        # the next free id and an existing key keeps its id.
        movies_map.setdefault(title, len(movies_map))
        people_map.setdefault(person, len(people_map))
        relations_map.setdefault(relation, []).append((title, person))

        dictionary.update(title.split(), relation.split(), person.split())

In [4]:
# The two entity types of the ontology; every relation links a movie to a person.
enties = ('movie', 'person')

In [5]:
# Extend the vocabulary with two general-purpose word lists (one word per line).
for wordlist_path in ('../data/american-english', '../data/cracklib-small'):
    with open(wordlist_path, encoding='utf8') as f:
        # Only words made of at least two distinct characters are kept, which
        # drops empty lines, single letters and entries such as "aa".
        dictionary.update(
            word for word in map(str.strip, f) if len(set(word)) > 1
        )

In [6]:
# Freeze the vocabulary: from here on `dictionary` maps each word to its rank
# in sorted order (0-based), which is also its column index in the input.
dictionary = dict(zip(sorted(dictionary), range(len(dictionary))))

In [7]:
# Metadata saved next to the model: the word -> index vocabulary, the
# (movie, person) entity pair linked by each relation, and the entity names.
# dumpp['relations'] is also passed to the network constructor further down.
dumpp = {
    'vocabulary': dictionary,
    'relations': {rel: (enties[0], enties[1]) for rel in relations_map},
    'entities': enties,
}

# Use a context manager so the file is flushed and closed deterministically;
# handing a bare open(...) to json.dump leaves the handle open.
with open(MODEL_NAME + '_data.json', 'w') as fp:
    json.dump(dumpp, fp)

In [8]:
from NNmodels import OntoEmbeding2

In [9]:
# network parameters
# The input is one vector per sentence with one position per vocabulary word.
sentence_size = len(dictionary)
neurons_per_ent = 10
# Relation layers get twice as many units as entity layers.
neurons_per_rel = 2*neurons_per_ent
# build the ontology network
# input
sentence_input = Input(shape=(sentence_size,), name='input')
# network
# NOTE(review): the meaning of the trailing positional `True` is defined by
# OntoEmbeding2 in NNmodels, which is not visible here; confirm before changing.
onto = OntoEmbeding2(enties,dumpp['relations'])(sentence_input,neurons_per_ent,neurons_per_rel,True)
# final model
model = Model(inputs=sentence_input, outputs=onto)
# Alternative optimiser that was tried and left disabled:
#opt = kop.SGD(lr=0.5,momentum=0.9,decay=0.9,nesterov=True)
model.compile(optimizer='RMSprop', loss='binary_crossentropy', metrics=['acc'])

In [10]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Render the model graph with Graphviz `dot` (left-to-right layout) as PDF
# bytes and write them to <MODEL_NAME>.pdf.
mm=model_to_dot(model, rankdir='LR').create(prog='dot', format='pdf')
with open(MODEL_NAME+'.pdf','wb') as f:
    f.write(mm)
# Inline SVG rendering of the same graph, left disabled:
#SVG(model_to_dot(model, rankdir='LR').create(prog='dot', format='svg'))

In [11]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 267178)       0                                            
__________________________________________________________________________________________________
movie (Dense)                   (None, 10)           2671790     input[0][0]                      
__________________________________________________________________________________________________
person (Dense)                  (None, 10)           2671790     input[0][0]                      
__________________________________________________________________________________________________
actor (Relation)                (None, 20)           640         movie[0][0]                      
                                                                 person[0][0]                     
__________

In [12]:
# Bag-of-words encoder pinned to the vocabulary built above, so the column of
# each word is the index stored for it in `dictionary`.
# binary=True requests presence/absence flags instead of occurrence counts.
vect = CountVectorizer(vocabulary = dictionary, binary=True)

In [13]:
# --- Train / held-out split -------------------------------------------------
# Titles and names are shuffled and the tail of each shuffled list is held out.
# The cut index is (-size) // 10, i.e. the held-out part is size/10 rounded up.
lm = len(movies_map)
lp = len(people_map)

moviet = list(movies_map)
random.shuffle(moviet)
movie_cut = (-lm) // 10
movietest, moviet = moviet[movie_cut:], moviet[:movie_cut]

peoplet = list(people_map)
random.shuffle(peoplet)
people_cut = (-lp) // 10
peopletest, peoplet = peoplet[people_cut:], peoplet[:people_cut]

# Relation names, in insertion order, used to pick a relation at random.
relations = tuple(relations_map)

# Position of every label in the target vector: the sorted entity names come
# first, followed by the sorted relation names.
out_map = {
    name: position
    for position, name in enumerate(sorted(enties) + sorted(relations_map))
}

# Shuffle each relation's pair list in place; the samplers slice these lists
# into a training head and a held-out tail.
for pairs in relations_map.values():
    random.shuffle(pairs)

def sample_p(n):
    """Pick ``n`` distinct training person names and their target vectors.

    Returns a pair ``(names, targets)``. Every target has ``len(out_map)``
    entries and is all zeros except for a 1 at index 1.
    """
    names = random.sample(peoplet, n)
    template = [0, 1] + [0] * (len(out_map) - 2)
    # Each sample gets its own copy so the rows never share a list object.
    targets = [list(template) for _ in range(n)]
    return names, targets

def sample_m(n):
    """Pick ``n`` distinct training movie titles and their target vectors.

    Returns a pair ``(titles, targets)``. Every target has ``len(out_map)``
    entries and is all zeros except for a 1 at index 0.
    """
    titles = random.sample(moviet, n)
    padding = len(out_map) - 1
    targets = []
    for _ in range(n):
        targets.append([1] + [0] * padding)
    return titles, targets
    
def sample_r(n):
    """Draw ``n`` training sentences of the form "<relation> <title> <person>".

    For every sample a relation is chosen uniformly at random and a
    (title, person) pair is drawn from the training part of that relation's
    pair list, i.e. everything before the held-out tail that starts at index
    ``(-len) // 10``.

    Relations whose training part is empty (a relation with a single pair
    keeps that pair in the held-out tail) are never chosen; previously such a
    relation made ``random.choice`` fail on an empty slice at random times.

    Returns:
        ``(sentences, targets)`` where each target has ``len(out_map)``
        entries with a 1 at indices 0 and 1 and at the relation's index in
        ``out_map``.

    Raises:
        IndexError: if ``n > 0`` and no relation has any training pair.
    """
    def train_size(rel):
        # Length of relations_map[rel][:-ll//10], computed without slicing so
        # that the (potentially very long) pair list is not copied per sample.
        total = len(relations_map[rel])
        return total + (-total) // 10

    candidates = [rel for rel in relations if train_size(rel) > 0]
    if n > 0 and not candidates:
        raise IndexError("no relation has any training pairs to sample from")

    sentences = []
    targets = []
    for _ in range(n):
        rel = random.choice(candidates)
        target = [1, 1] + [0] * (len(out_map) - 2)
        target[out_map[rel]] = 1
        title, person = relations_map[rel][random.randrange(train_size(rel))]
        sentences.append(' '.join([rel, title, person]))
        targets.append(target)
    return sentences, targets

def sample_pt(n):
    """Pick ``n`` distinct held-out person names and their target vectors.

    Same contract as the training sampler for people, but drawing from
    ``peopletest``: targets are all zeros except for a 1 at index 1.
    """
    chosen = random.sample(peopletest, n)
    zeros_after = len(out_map) - 2
    targets = [[0, 1] + [0] * zeros_after for _ in range(n)]
    return chosen, targets

def sample_mt(n):
    """Pick ``n`` distinct held-out movie titles and their target vectors.

    Same contract as the training sampler for movies, but drawing from
    ``movietest``: targets are all zeros except for a 1 at index 0.
    """
    chosen = random.sample(movietest, n)
    zeros_after = len(out_map) - 1
    targets = [[1] + [0] * zeros_after for _ in range(n)]
    return chosen, targets
    
def sample_rt(n):
    """Draw ``n`` held-out sentences of the form "<relation> <title> <person>".

    For every sample a relation is chosen uniformly at random and a
    (title, person) pair is drawn from the held-out tail of that relation's
    pair list (from index ``(-len) // 10`` to the end).

    Returns ``(sentences, targets)``; each target has a 1 at indices 0 and 1
    and at the relation's index in ``out_map``, zeros elsewhere.
    """
    sentences, targets = [], []
    for _ in range(n):
        rel = random.choice(relations)

        target = [1, 1] + [0] * (len(out_map) - 2)
        target[out_map[rel]] = 1

        pairs = relations_map[rel]
        cut = (-len(pairs)) // 10
        title, person = random.choice(pairs[cut:])

        sentences.append(' '.join((rel, title, person)))
        targets.append(target)
    return sentences, targets

def generate_p(batch_size=32,train=True):
    """Endless generator of person-only batches.

    Each step samples ``batch_size`` person names (training names when
    ``train`` is true, held-out names otherwise), encodes them with ``vect``
    and yields ``(features, targets)``: the dense array of the encoded names
    and the targets as a NumPy array.
    """
    while True:
        if train:
            names, targets = sample_p(batch_size)
        else:
            names, targets = sample_pt(batch_size)
        encoded = vect.transform(list(names))
        yield encoded.toarray(), np.array(list(targets))

def generate_e(batch_size=32,train=True):
    """Endless generator of entity batches mixing movies and people.

    Each step draws ``batch_size`` coin flips to decide how many movie titles
    and how many person names go into the batch, samples them (training data
    when ``train`` is true, held-out data otherwise) and yields
    ``(features, targets)``. Within a batch all movies come before all people.
    """
    while True:
        kinds = np.random.randint(2, size=batch_size)
        n_movies = int(np.count_nonzero(kinds == 0))
        n_people = int(np.count_nonzero(kinds == 1))

        texts, targets = [], []
        if n_movies:
            names, rows = sample_m(n_movies) if train else sample_mt(n_movies)
            texts.extend(names)
            targets.extend(rows)
        if n_people:
            names, rows = sample_p(n_people) if train else sample_pt(n_people)
            texts.extend(names)
            targets.extend(rows)

        yield vect.transform(texts).toarray(), np.array(targets)
        
def generate_r(batch_size=32,train=True):
    """Endless generator of relation-only batches.

    Each step samples ``batch_size`` relation sentences (from the training
    pairs when ``train`` is true, from the held-out pairs otherwise), encodes
    them with ``vect`` and yields ``(features, targets)``.
    """
    while True:
        if train:
            sentences, targets = sample_r(batch_size)
        else:
            sentences, targets = sample_rt(batch_size)
        encoded = vect.transform(list(sentences))
        yield encoded.toarray(), np.array(list(targets))


def generate_mix(batch_size=32,train=True):
    """Endless generator of batches mixing movies, people and relations.

    Each step draws ``batch_size`` values from {0, 1, 2} to decide how many
    movie titles, person names and relation sentences go into the batch,
    samples them (training data when ``train`` is true, held-out data
    otherwise) and yields ``(features, targets)``. Within a batch the order
    is movies, then people, then relation sentences.
    """
    while True:
        kinds = np.random.randint(3, size=batch_size)
        n_movies = int(np.count_nonzero(kinds == 0))
        n_people = int(np.count_nonzero(kinds == 1))
        n_relations = int(np.count_nonzero(kinds == 2))

        texts, targets = [], []
        if n_movies:
            part, rows = sample_m(n_movies) if train else sample_mt(n_movies)
            texts.extend(part)
            targets.extend(rows)
        if n_people:
            part, rows = sample_p(n_people) if train else sample_pt(n_people)
            texts.extend(part)
            targets.extend(rows)
        if n_relations:
            part, rows = sample_r(n_relations) if train else sample_rt(n_relations)
            texts.extend(part)
            targets.extend(rows)

        yield vect.transform(texts).toarray(), np.array(targets)

In [14]:
# Stage 1: train on entity samples only (movie titles and person names).
# Batches hold 100 samples; each of the 10 epochs runs 100 training batches
# and is validated on 10 batches drawn from the held-out titles and names.
model.fit_generator(generate_e(100), validation_data=generate_e(100,False), validation_steps=10, epochs=10, steps_per_epoch=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc4f4010b38>

In [15]:
# Checkpoint after the entity stage; the later stages save to this same path.
model.save(MODEL_NAME+'.model')

In [16]:
# Stage 2: continue training the same model on relation sentences only
# ("<relation> <title> <person>"), validating on the held-out pairs.
model.fit_generator(generate_r(100), validation_data=generate_r(100,False), validation_steps=10, epochs=10, steps_per_epoch=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc4f32d4ac8>

In [17]:
# Checkpoint after the relation stage (same path as the earlier checkpoint).
model.save(MODEL_NAME+'.model')

In [18]:
# Stage 3: continue training on mixed batches containing movies, people and
# relation sentences, validating on the corresponding held-out data.
model.fit_generator(generate_mix(100), validation_data=generate_mix(100,False), validation_steps=10, epochs=10, steps_per_epoch=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fc4f32d4c88>

In [19]:
# Final checkpoint after the mixed stage (same path as the earlier checkpoints).
model.save(MODEL_NAME+'.model')