In [81]:
import logging; logging.basicConfig(level=logging.INFO)
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import logictensornetworks as ltn

# Notebook-wide matplotlib defaults: base font size and axes border width.
plt.rcParams.update({'font.size': 12, 'axes.linewidth': 1})

In [82]:
import nltk
# Fetch the 'mac_morpho' corpus package into the local nltk_data directory;
# the call reports the package as up to date when it is already present.
nltk.download('mac_morpho')

[nltk_data] Downloading package mac_morpho to
[nltk_data]     /home/castro/nltk_data...
[nltk_data]   Package mac_morpho is already up-to-date!


True

In [83]:
import gensim
import os
from nltk.corpus import mac_morpho
import glob
# Folders holding the statement files (*.txt). The position of a folder in this
# list is what the loading cell turns into the label (index 0 -> False, index 1 -> True).
path = ['./false_text/','./true_text/']

# Word2Vec embedding, cached on disk as 'words.embedding'.
# NOTE(review): the cache is reused whenever the file exists, even if the text
# files changed since it was written; delete 'words.embedding' to force a rebuild.
model = None
if not os.path.isfile('words.embedding'):
    # No cache yet: start from a model trained on the Mac-Morpho sentences.
    model = gensim.models.Word2Vec(mac_morpho.sents())

    # Tokenize every statement file so its words can be added to the vocabulary.
    new_texts = []
    for folder in path:
        # sorted(): glob returns files in arbitrary order; sorting keeps the
        # training sentence order stable across machines and runs.
        for file_name in sorted(glob.glob(folder+"*.txt")):
            # Explicit encoding: the texts contain accented characters and must
            # not be decoded with whatever the platform locale happens to be.
            with open(file_name, 'r', encoding='utf-8') as file:
                text = list(gensim.utils.tokenize(file.read()))
            new_texts.append(text)

    # min_count is lowered before the vocabulary update, presumably so that
    # words occurring only once in the small statement corpus are not dropped.
    model.min_count = 1
    model.build_vocab(new_texts, update=True)
    model.train(new_texts, total_examples=len(new_texts), epochs=model.epochs)

    model.save('words.embedding')
else:
    model = gensim.models.Word2Vec.load('words.embedding')

INFO:gensim.utils:loading Word2Vec object from words.embedding
INFO:gensim.utils:loading wv recursively from words.embedding.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': 'words.embedding', 'datetime': '2021-12-03T18:02:28.947187', 'gensim': '4.1.2', 'python': '3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]', 'platform': 'Linux-5.10.60.1-microsoft-standard-WSL2-x86_64-with-glibc2.29', 'event': 'loaded'}


In [84]:
# Build the raw dataset: one list of word vectors per statement file, plus a
# boolean label derived from the folder the file came from.
data = []
labels = []
qtd_words = 10          # not referenced by the visible cells
null_word = [0]*100     # not referenced by the visible cells
big_word_count = 0      # length (in tokens) of the longest statement seen so far
for i, folder in enumerate(path):
    # sorted(): glob order is arbitrary; a stable order keeps the dataset
    # (and anything derived from it) identical between runs.
    for file_name in sorted(glob.glob(folder+"*.txt")):
        # Explicit encoding so accented characters decode the same on every platform.
        with open(file_name, 'r', encoding='utf-8') as file:
            text = list(gensim.utils.tokenize(file.read()))
        print(file_name, text)
        # model.wv[word] raises KeyError for a token missing from the embedding
        # vocabulary (possible when a cached 'words.embedding' is stale).
        vec_words = [model.wv[word] for word in text]
        data.append(vec_words)
        # First folder in `path` -> False, any later folder -> True.
        labels.append(i>0)

        big_word_count = max(big_word_count, len(text))
len(data)

./false_text/2.txt ['Eu', 'passei', 'oito', 'anos', 'na', 'prefeitura', 'e', 'você', 'não', 'viu', 'aumento', 'de', 'IPTU', 'você', 'não', 'viu', 'aumento', 'de', 'imposto']
./false_text/6.txt ['Nunca', 'se', 'investiu', 'tanto', 'na', 'saúde', 'quanto', 'o', 'nosso', 'governo']
./false_text/5.txt ['Eu', 'quero', 'informar', 'a', 'você', 'que', 'nos', 'assiste', 'hoje', 'que', 'nós', 'vamos', 'recontratar', 'os', 'profissionais', 'que', 'o', 'Crivella', 'demitiu', 'e', 'recuperar', 'todas', 'as', 'clínicas', 'da', 'família']
./false_text/4.txt ['O', 'Rodrigo', 'Bethlem', 'esse', 'que', 'te', 'assessora', 'aqui', 'no', 'intervalo', 'quando', 'ele', 'confessou', 'que', 'tomava', 'dinheiro', 'da', 'prefeitura', 'eu', 'botei', 'ele', 'pra', 'correr', 'Virou', 'seu', 'assessor']
./false_text/1.txt ['E', 'aliás', 'quando', 'ele', 'fala', 'do', 'aumento', 'na', 'nota', 'do', 'Ideb', 'que', 'melhorou', 'não', 'foi', 'a', 'nota', 'da', 'prova', 'O', 'que', 'melhorou', 'dele', 'foi', 'a', 'aprov

14

In [85]:

import random

# same dimensions: pad every statement to big_word_count rows of 100 values.
# A new list is built instead of extending the entries of `data` in place, so
# the cell can be re-run after `data` has become a numpy array.
# NOTE(review): the padding rows are filled with 1.0 while `null_word` (all
# zeros) is defined in the loading cell and never used — confirm which value
# is intended before changing it.
pad_row = np.ones(100, dtype=np.float32)
padded = [list(val) + [pad_row] * (big_word_count - len(val)) for val in data]

# shuffle: a dedicated, seeded generator keeps the train/test split
# reproducible without touching the global `random` state.
tmp = list(zip(padded, labels))
random.Random(42).shuffle(tmp)
data, labels = zip(*tmp)

# to numpy array
data = np.array(data)
labels = np.array(labels)

In [86]:
import math

# The first ceil(80%) of the samples are used for training, the rest for testing.
nr_samples_train = math.ceil(0.8*len(data))
batch_size = 12
print(nr_samples_train, batch_size)

train_split = (data[:nr_samples_train], labels[:nr_samples_train])
test_split = (data[nr_samples_train:], labels[nr_samples_train:])

# Batched (features, labels) datasets for the two splits.
ds_train = tf.data.Dataset.from_tensor_slices(train_split).batch(batch_size)
ds_test = tf.data.Dataset.from_tensor_slices(test_split).batch(batch_size)


12 12


In [87]:
# Fuzzy-logic operators used to write the axioms, all taken from ltn.fuzzy_ops.
fuzzy = ltn.fuzzy_ops

# Connectives
Not = ltn.Wrapper_Connective(fuzzy.Not_Std())
And = ltn.Wrapper_Connective(fuzzy.And_Prod())
Or = ltn.Wrapper_Connective(fuzzy.Or_ProbSum())
Implies = ltn.Wrapper_Connective(fuzzy.Implies_Reichenbach())

# Quantifiers (both aggregate with p=2)
Forall = ltn.Wrapper_Quantifier(fuzzy.Aggreg_pMeanError(p=2), semantics="forall")
Exists = ltn.Wrapper_Quantifier(fuzzy.Aggreg_pMean(p=2), semantics="exists")

# Aggregator applied to the list of axioms to obtain one satisfaction value
formula_aggregator = ltn.Wrapper_Formula_Aggregator(fuzzy.Aggreg_pMeanError(p=2))

In [88]:
# Predicate scored by an MLP with two hidden layers of 16 units. Its single
# input is one padded statement: big_word_count rows of 100 values each.
valid_input_shape = (big_word_count, 100)
Valid = ltn.Predicate.MLP([valid_input_shape], hidden_layer_sizes=(16, 16))

In [89]:
@tf.function
def axioms(features, labels):
    """Return the aggregated satisfaction level of the knowledge base on a batch.

    Args:
        features: batch of samples; indexed with `labels` as a boolean mask.
        labels: boolean tensor, True for samples that should satisfy `Valid`.

    Returns:
        The `.tensor` of the formula aggregator applied to the two axioms:
        every True-labelled sample is Valid, every other sample is not Valid.
    """
    # NOTE(review): a batch containing a single class leaves one of these
    # variables empty — confirm how the quantifier behaves in that case.
    positives = ltn.Variable("Valid_ex", features[labels])
    negatives = ltn.Variable("Invalid_ex", features[tf.logical_not(labels)])
    return formula_aggregator([
        Forall(positives, Valid(positives)),
        Forall(negatives, Not(Valid(negatives))),
    ]).tensor

In [90]:
mean_metrics = tf.keras.metrics.Mean()


def _test_sat_level():
    """Return the mean axiom satisfaction level over the batches of ds_test."""
    mean_metrics.reset_states()
    for test_features, test_labels in ds_test:
        mean_metrics(axioms(test_features, test_labels))
    return mean_metrics.result()


trainable_variables = Valid.trainable_variables
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
for epoch in range(2000):
    # One pass over the training batches; the loss is the unsatisfied part of the axioms.
    for _data, _labels in ds_train:
        with tf.GradientTape() as tape:
            loss = 1. - axioms(_data, _labels)
        grads = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(grads, trainable_variables))
    # Report the satisfaction level on the test split every 100 epochs.
    if epoch % 100 == 0:
        print("Epoch %d: Sat Level %.3f"%(epoch, _test_sat_level()))
print("Training finished at Epoch %d with Sat Level %.3f"%(epoch, _test_sat_level()))