In [321]:
import tensorflow as tf

from tensorflow.keras import layers, Sequential, Model

import nltk, re, string, collections
from nltk.util import ngrams # function for making ngrams

from pathlib import Path

import collections
import random

import numpy as np

In [226]:
def build_model(N=1000, n=100):
    
    m = Sequential()
    
    m.add(tf.keras.Input(shape=(N, )))
    
    # Capa escondida
    m.add(layers.Dense(n, activation='linear', use_bias=False, input_shape=(N, ), weight_constraings=tf.keras.constraints.UnitNorm(axis=1)))
    
    # Afterwards, we do automatic shape inference:
    m.add(layers.Dense(N, activation='softmax', use_bias=False))
    
    m.compile(optimizer='sgd', loss='binary_crossentropy')
    
    print(m.summary())
    
    return m

In [303]:
files = ['data/pg1619.txt', 'data/pg2000.txt']

text = ""
for file in files:
    with open(file) as fd:
        text += fd.read()

punctuationNoPeriod = "[" + re.sub("\.","",string.punctuation) + "]"
text = re.sub(punctuationNoPeriod, "", text)
text = text.replace('\n', ' ')

# Let's truncate this to just 1000000 words.
corpus = text.lower()

In [304]:
# first get individual words
tokenized = corpus.split()

# Count the words
words = collections.Counter(tokenized)

In [305]:
unique_words = list(set(tokenized))
unique_words_inv = {}
for i, w in enumerate(unique_words):
    unique_words_inv[w] = i

In [402]:
# Transform the tokens into indeces.
tokenized_index = [unique_words_inv[w] for w in tokenized]

# Lista de n-grams
esNgrams = ngrams(tokenized, 5)

# contar los n-grams
esNgramFreq = collections.Counter(esNgrams)

In [307]:
# what are the ten most popular ngrams in this Spanish corpus?
esNgramFreq.most_common(10)

[(('don', 'quijote', 'de', 'la', 'mancha'), 134),
 (('caballero', 'de', 'la', 'triste', 'figura'), 33),
 (('petrarca', 'de', 'remediis', 'utriusque', 'fortunae'), 30),
 (('la', 'señora', 'dulcinea', 'del', 'toboso'), 24),
 (('de', 'remediis', 'utriusque', 'fortunae', 'i'), 21),
 (('el', 'cura', 'y', 'el', 'barbero'), 21),
 (('en', 'todos', 'los', 'días', 'de'), 21),
 (('el', 'caballero', 'de', 'la', 'triste'), 21),
 (('de', 'don', 'quijote', 'de', 'la'), 17),
 (('caballero', 'don', 'quijote', 'de', 'la'), 17)]

In [416]:
def train_generator(tokens, n=3, batch_size=32):
    
    N = len(unique_words)
    
    c_range = list(range(-n, 0, 1)) + list(range(1, n+1))
    f_range = list(range(n, len(tokens) - n))
    
    while True:  
        
        i = 0 
        
        x_f = np.zeros((batch_size, N))
        x_c = np.zeros((batch_size, N))
                
        y = np.zeros((batch_size, 1))
        
        f_i_s = []
        f_n_s = []
        
        while i < batch_size:
                                    
            if len(f_i_s) == 0:
                f_i_s = random.choices(f_range, k=100 * batch_size)
                                
            if len(f_n_s) == 0:
                f_n_s = random.choices(f_range, k=100 * batch_size)

            # Focus.
            f_i = f_i_s.pop()

            # Pick another random word.
            f_n = f_n_s.pop()
            
            # Context
            for c in c_range:                
                
                if i >= batch_size:
                    break
                    
                x_f[i][tokens[f_i]] = 1.0
                x_c[i][tokens[f_i + c]] = 1.0
                y[i][0] = 1.0

                i += 1
            
            # Context
            for c in c_range:                
                
                if i >= batch_size:
                    break
                    
                x_f[i][tokens[f_n]] = 1.0
                x_c[i][tokens[f_i + c]] = 1.0
                y[i][0] = 0.0
                i += 1            
        
        yield ((x_f, x_c), y)
    
   

In [417]:
def build_model(N=1000, n=1000):
    
    # Focus word.
    focus = tf.keras.Input(shape=(N, ))
    
    # Context word.
    context = tf.keras.Input(shape=(N, ))
    
    # Matrix with repr and context.
    w_r = layers.Dense(n, activation='linear', use_bias=False, kernel_constraint=tf.keras.constraints.UnitNorm(axis=1), name='w_r')    
    w_c = layers.Dense(n, activation='linear', use_bias=False, kernel_constraint=tf.keras.constraints.UnitNorm(axis=1), name='w_c')
    
    # Dot product.
    y_ = tf.linalg.diag_part(tf.tensordot(w_r(focus), w_c(context), [[1],[1]]))
    y = tf.reshape(y_, [-1, 1])
    
    output = layers.Dense(1, activation='sigmoid', use_bias=False)(y)

    m = Model(inputs=[focus, context], outputs=output)
    
    m.compile(optimizer='adam', loss='binary_crossentropy')

    print(m.summary())
    
    return m
    

m = build_model(N=len(unique_words), n=100)

Model: "model_17"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_66 (InputLayer)           [(None, 35200)]      0                                            
__________________________________________________________________________________________________
input_67 (InputLayer)           [(None, 35200)]      0                                            
__________________________________________________________________________________________________
w_r (Dense)                     (None, 100)          3520000     input_66[0][0]                   
__________________________________________________________________________________________________
w_c (Dense)                     (None, 100)          3520000     input_67[0][0]                   
___________________________________________________________________________________________

In [None]:
m.fit(train_generator(tokenized_index, batch_size=256), steps_per_epoch=20, epochs=10000)

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000

In [396]:
def phi_d(w):
    W_r = np.array(m.get_layer('w_r').weights[0])
    return W_r[unique_words_inv[w]]


def closest(w, n=10):
    
    W_r = np.array(m.get_layer('w_r').weights[0])
    
    w = w / np.linalg.norm(w)
    
    sort_index = np.flip(np.dot(W_r, w).argsort())
    
    words = []
    for i in range(n):
        words.append(unique_words[sort_index[i]])
    
    return words

In [397]:
phi_d('mujer')

array([ 0.09154004, -0.0746439 , -0.07706562, -0.15104972, -0.20269592,
       -0.05375413,  0.09674148,  0.05664596,  0.03870582,  0.02036014,
       -0.05095919, -0.251195  ,  0.00127836, -0.21020204,  0.07367929,
       -0.06879904,  0.14934994,  0.07993663,  0.05136886, -0.02097218,
        0.08394454, -0.02790381,  0.04458955, -0.10188736,  0.10768676,
        0.00679626, -0.16269545,  0.07203142, -0.00715161, -0.09431475,
       -0.02478926,  0.0222859 , -0.03121722, -0.05354659, -0.02604924,
        0.00080278, -0.00987856,  0.00096193, -0.11782497, -0.00272015,
       -0.10738652, -0.11440382, -0.09793027, -0.03792438,  0.02099987,
       -0.06834219,  0.00841567, -0.17905857, -0.08828238,  0.10164704,
       -0.03792677,  0.06343715,  0.04233918, -0.09589901, -0.05871633,
        0.02461573, -0.1306635 ,  0.09981474, -0.20335914, -0.20875572,
        0.14069352,  0.15361135, -0.03220928,  0.0077925 ,  0.0203077 ,
        0.0426612 ,  0.23812217, -0.02554249, -0.10857997,  0.00

In [398]:
closest(phi_d('rey') - phi_d('hombre') + phi_d('mujer'))

['rey',
 'mujer',
 'tierra',
 'sombra',
 'belleza',
 'contentísimas',
 'capirotes',
 'edad',
 'mitad',
 'cumbre']

In [399]:
closest(phi_d('verde'))

['verde',
 'vestida',
 'vestido',
 'pastor',
 '195',
 'complexión',
 'industriadas',
 'estudiante',
 'lusitano',
 'principales']

In [400]:
words['reina']

59

1.0

0.9999999