tf.random.normal(
    shape,
    mean=0.0,
    stddev=1.0,
    dtype=tf.dtypes.float32,
    seed=None,
    name=None
) -> Outputs random values from a normal distribution.

tf.nn.softmax(
    logits,
    axis=None,
    name=None,
    dim=None
) Computes softmax activations along axis (dim is a deprecated alias for axis).

tf.math.reduce_sum(
    input_tensor,
    axis=None,
    keepdims=None,
    name=None,
    reduction_indices=None,
    keep_dims=None
) Computes the sum of elements across dimensions of a tensor.
Reduces input_tensor along the dimensions given in axis.

tf.math.reduce_mean(
    input_tensor,
    axis=None,
    keepdims=None,
    name=None,
    reduction_indices=None,
    keep_dims=None
) Computes the mean of elements across dimensions of a tensor.
Reduces input_tensor along the dimensions given in axis.

In [3]:
import tensorflow as tf
import numpy as np
import json
import random

import sys
sys.path.insert(0, '../Library')

from helpers import *

In [4]:
###
### Reads the JSON list of venues and builds the list of unique venue words.
### Writes the unique words, as a JSON list, to
### '../Date/Initiale/lista_unica_cuvinte_venues.txt'.
### Words are kept in first-seen order so the output file (and any word ids
### later derived from it) is identical from one run to the next.
### return: the number of unique words.
###
def creeaza_lista_cuvinte_unice():
    ## Read the list of venues; every venue is iterated to obtain its words
    with open('../Date/Initiale/lista_venue_completa.txt', 'r') as r:
        lista_venues = json.load(r)

    ## dict.fromkeys removes duplicates while keeping insertion order, unlike
    ## set(), whose iteration order for strings can change between processes
    lista_unica_cuvinte = list(dict.fromkeys(
        cuvant for venue in lista_venues for cuvant in venue
    ))

    with open('../Date/Initiale/lista_unica_cuvinte_venues.txt', 'w') as w:
        w.write(json.dumps(lista_unica_cuvinte))

    return len(lista_unica_cuvinte)

###
### Reads the article files 'file0.txt' .. 'file168.txt' from
### '../Date/Initiale/Complet/' (one JSON object per line) and gathers the
### unique words found under each article's 'abstract' key. Writes the result,
### as a JSON list, to '../Date/Initiale/lista_unica_cuvinte_abstract.txt'.
### return: the number of unique words.
###
def creeaza_lista_cuvinte_unice_abstract():
    lista_unica_cuvinte = []

    ## Read the abstracts, one file at a time
    for i in range(0, 169):
        # `with` closes the file even when a line fails to parse
        with open('../Date/Initiale/Complet/file' + str(i) + '.txt', 'r') as r:
            for line in r:
                art_crt = json.loads(line)
                lista_unica_cuvinte += art_crt['abstract']

        # De-duplicate after every file instead of once at the end.
        # NOTE(review): remove_duplicates comes from the star-import of
        # `helpers`; presumably it returns a duplicate-free collection - confirm.
        lista_unica_cuvinte = remove_duplicates(lista_unica_cuvinte)

    with open('../Date/Initiale/lista_unica_cuvinte_abstract.txt', 'w') as w:
        w.write(json.dumps(list(lista_unica_cuvinte)))

    return len(lista_unica_cuvinte)

###
### Reads the list of unique words and builds the dictionary {word: id},
### where a word's id is its position in the list. Writes the dictionary to a
### file as JSON.
### param file_lista_cuvinte: path of the file holding the JSON list of unique words.
### param out_file: path of the file that receives the result.
###
def creeaza_word2int(file_lista_cuvinte, out_file):
    # Read the word list; `with` closes the file even if parsing fails
    with open(file_lista_cuvinte, 'r') as r:
        lista_unica_cuvinte = json.load(r)

    # If a word occurred more than once, its last position would win
    word2int = {cuvant: i for i, cuvant in enumerate(lista_unica_cuvinte)}

    with open(out_file, 'w') as w:
        w.write(json.dumps(word2int))
    
    
###
### Reads the list of unique words and builds the dictionary {id: word},
### where a word's id is its position in the list. Writes the dictionary to a
### file as JSON; JSON object keys are strings, so the ids are stored as
### "0", "1", ... in the output file.
### param file_lista_cuvinte: path of the file holding the JSON list of unique words.
### param out_file: path of the file that receives the result.
###
def creeaza_int2word(file_lista_cuvinte, out_file):
    # Read the word list; `with` closes the file even if parsing fails
    with open(file_lista_cuvinte, 'r') as r:
        lista_unica_cuvinte = json.load(r)

    int2word = dict(enumerate(lista_unica_cuvinte))

    with open(out_file, 'w') as w:
        w.write(json.dumps(int2word))

###
### Builds a one-hot vector.
### param index: position that receives the value 1.
### param len_vect: length of the vector.
### return: the vector, zero everywhere except at `index`.
###
def one_hot(index, len_vect):
    # Start from an all-zero float vector, then switch on the requested slot
    vector = np.zeros(len_vect, dtype=np.float64)
    vector[index] = 1.0
    return vector

###
### Builds (word, neighbour) training pairs encoded as one-hot rows.
### For every word of every document, each word located at most DIM_FEREASTRA
### positions to its left or right is paired with it; neighbours that are
### equal to the word itself are skipped.
### param word2int_file: path of the JSON dictionary {word: id}.
### param docs_file: path of the JSON list of documents (each a sequence of words).
### param DIM_FEREASTRA: number of positions taken on each side of a word.
### param DIM_VOCABULAR: length of every one-hot row.
### return X_train, Y_train: two float arrays of shape
###        (number_of_pairs, DIM_VOCABULAR); row k of X_train encodes the word
###        and row k of Y_train encodes its neighbour. With no pairs at all the
###        arrays have shape (0, DIM_VOCABULAR).
###
def creeaza_lista_vecini(word2int_file, docs_file, DIM_FEREASTRA, DIM_VOCABULAR):
    # Read word2int
    with open(word2int_file, 'r') as r:
        word2int = json.load(r)

    # Read the list of documents
    with open(docs_file, 'r') as r:
        lista_doc = json.load(r)

    # Collect only the ids first; the one-hot matrices are filled in one
    # vectorised step below instead of allocating one array per pair.
    idx_cuvinte = []
    idx_vecini = []

    for doc in lista_doc:
        for word_index, word in enumerate(doc):
            start = max(word_index - DIM_FEREASTRA, 0)
            # The slice end is exclusive, hence the +1 to include the last neighbour
            stop = min(word_index + DIM_FEREASTRA, len(doc)) + 1
            for vecin in doc[start:stop]:
                if vecin != word:
                    idx_cuvinte.append(word2int[word])
                    idx_vecini.append(word2int[vecin])

    nr_perechi = len(idx_cuvinte)
    randuri = np.arange(nr_perechi)

    X_train = np.zeros((nr_perechi, DIM_VOCABULAR))
    Y_train = np.zeros((nr_perechi, DIM_VOCABULAR))
    # Explicit integer dtype keeps the indexing valid when there are no pairs
    X_train[randuri, np.asarray(idx_cuvinte, dtype=np.intp)] = 1
    Y_train[randuri, np.asarray(idx_vecini, dtype=np.intp)] = 1

    return X_train, Y_train

###
### Writes the vector of every word to a file as a JSON dictionary {word: vector}.
### param word2int_file: path of the JSON dictionary {word: id}.
### param vectors: container indexed by word id whose items provide .tolist()
###                (for example a 2-D numpy array with one row per id).
### param file: path of the file that receives the result.
###
def log(word2int_file, vectors, file):
    with open(word2int_file, 'r') as r:
        word2int = json.load(r)

    # .tolist() turns each row into plain Python numbers that json can serialise
    dict_ = {cuvant: vectors[idx].tolist() for cuvant, idx in word2int.items()}

    with open(file, 'w') as w:
        w.write(json.dumps(dict_))

## Creeaza dictionar cuvinte unice venue

In [5]:
# Build the venue vocabulary; the number of unique words it returns is used
# as the vocabulary size.
DIM_VOCABULAR = creeaza_lista_cuvinte_unice()
print(DIM_VOCABULAR)
# Derive both lookup tables (word -> id and id -> word) from the unique-word list.
# NOTE(review): these files are written under '../DATE/' as '*_venue.txt',
# while other cells read '../Date/Initiale/word2int_venues.txt' - confirm
# which location and file name are intended.
creeaza_word2int('../Date/Initiale/lista_unica_cuvinte_venues.txt','../DATE/word2int_venue.txt')
creeaza_int2word('../Date/Initiale/lista_unica_cuvinte_venues.txt','../DATE/int2word_venue.txt')

1416


## Antrenare

In [11]:
###
### Trains a two-layer network (input -> linear hidden layer -> output logits)
### on (word, neighbour) one-hot pairs and returns the learned word vectors.
### The loss of every step is printed and appended to 'loss.txt'.
### param DIM_VOCABULAR: vocabulary size (width of the one-hot rows).
### param DIM_FEREASTRA: window size passed to creeaza_lista_vecini.
### param learning_rate: learning rate of the gradient-descent optimizer.
### param epoci: number of mini-batch steps to run (one step per iteration,
###              not one full pass over the data).
### param word2int_path: path of the JSON dictionary {word: id}.
### param complete_path: path of the JSON list of documents.
### param embedding_dim: width of the hidden layer, i.e. of the word vectors.
### param batch_size: maximum number of pairs in a mini-batch.
### param batch_stride: how far the start of the mini-batch advances per step;
###                     with the defaults consecutive batches overlap.
### return vectors: array of shape (DIM_VOCABULAR, embedding_dim), W1 + b1.
###
def train (DIM_VOCABULAR, DIM_FEREASTRA, learning_rate, epoci, word2int_path, complete_path,
           embedding_dim=50, batch_size=5000, batch_stride=2000):
    X_train, Y_train = creeaza_lista_vecini(word2int_path, complete_path, DIM_FEREASTRA, DIM_VOCABULAR)
    nr_exemple = len(X_train)

    #### Build the network
    X = tf.placeholder(tf.float32, shape=(None, DIM_VOCABULAR))
    Y_eticheta = tf.placeholder(tf.float32, shape=(None, DIM_VOCABULAR))

    ## Input layer -> hidden layer
    W1 = tf.Variable(tf.random_normal([DIM_VOCABULAR, embedding_dim]))
    b1 = tf.Variable(tf.random_normal([embedding_dim]))  # bias
    layer_hidden = tf.add(tf.matmul(X, W1), b1)

    ## Hidden layer -> output layer. These are raw logits: the softmax is
    ## applied inside the loss op below.
    W2 = tf.Variable(tf.random_normal([embedding_dim, DIM_VOCABULAR]))
    b2 = tf.Variable(tf.random_normal([DIM_VOCABULAR]))
    output_layer = tf.add(tf.matmul(layer_hidden, W2), b2)

    ## The vector of word k is what the hidden layer outputs for the one-hot
    ## input k: row k of W1 plus the bias.
    embeddings = W1 + b1

    with tf.device('/gpu:0'):
        # Loss function -> cross entropy between labels and softmax(logits)
        cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_eticheta, logits=output_layer))

        # compute_gradients + apply_gradients = minimize
        train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cross_entropy)

        init = tf.global_variables_initializer()

    #### Run the model; the context manager releases the session when done
    with tf.Session() as sess:
        sess.run(init)

        print(nr_exemple)
        crt_poz = 0
        for i in range(1, epoci + 1):
            end_poz = min(crt_poz + batch_size, nr_exemple)
            size_mini_batch = end_poz - crt_poz

            # Make sure both batches are 2-D: (size_mini_batch, DIM_VOCABULAR)
            x_mini_batch = X_train[crt_poz:end_poz].reshape(size_mini_batch, DIM_VOCABULAR)
            y_mini_batch = Y_train[crt_poz:end_poz].reshape(size_mini_batch, DIM_VOCABULAR)
            feed = {X: x_mini_batch, Y_eticheta: y_mini_batch}

            sess.run(train_step, feed_dict=feed)
            # Loss measured on the same batch after the weight update
            l = sess.run(cross_entropy, feed_dict=feed)

            # Re-opened in append mode every step so the losses written so far
            # survive an interrupted run
            with open('loss.txt', 'a') as w:
                w.write(str(l))
                w.write(' ')
            print('loss is : ', l, i)

            # Once the end of the data is reached restart from the very first
            # pair (previously the restart skipped the first batch_stride pairs).
            urmatoarea_poz = crt_poz + batch_stride
            if end_poz == nr_exemple or urmatoarea_poz >= nr_exemple:
                crt_poz = 0
            else:
                crt_poz = urmatoarea_poz

        vectors = sess.run(embeddings)

    return vectors
            
            
            
           

In [35]:
# Export the trained word vectors as a JSON dictionary {word: vector}.
# `vectors` must already hold the array returned by train(...); this cell
# does not create it.
# NOTE(review): the path suggests window 3 / learning rate 0.02 ('w3l002') -
# confirm it matches the parameters the vectors were trained with.
f_log = '../Models/modele_tensorflow/venue/w3l002/model_tf_venue_3_002.txt'
log('../Date/Initiale/word2int_venues.txt', vectors, f_log )

## Antrenare venue

In [None]:
# Keep the embeddings returned by train(...): the export cell reads them from
# `vectors`, and without the assignment the result of the run is not bound to
# any name.
vectors = train (DIM_VOCABULAR, 4, 0.02, 2000, '../Date/Initiale/word2int_venues.txt', '../Date/Initiale/lista_venue_completa.txt')

37784
loss is :  32.104477 1
loss is :  31.604868 2
loss is :  30.983782 3
loss is :  30.433119 4
loss is :  29.511333 5
loss is :  28.662159 6
loss is :  29.079218 7
loss is :  29.754992 8
loss is :  29.546263 9
loss is :  29.020296 10
loss is :  29.419363 11
loss is :  30.207441 12
loss is :  30.523144 13
