In [1]:
!pip install rowordnet

Collecting rowordnet
  Downloading rowordnet-1.1.0-py3-none-any.whl (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 8.3 MB/s 
Installing collected packages: rowordnet
Successfully installed rowordnet-1.1.0


In [2]:
import rowordnet as rwn
wn = rwn.RoWordNet()

# Create dataset

In [18]:
import pickle
import os
import math

In [4]:
!gdown --id 1IV_nodlm-dw-EWl1DtngkATgAldEdAGO # download pickle

Downloading...
From: https://drive.google.com/uc?id=1IV_nodlm-dw-EWl1DtngkATgAldEdAGO
To: /content/dataset.pickle
100% 93.3M/93.3M [00:03<00:00, 25.5MB/s]


In [5]:
with open("dataset.pickle", "rb") as pickleFile:
    db = pickle.load(pickleFile)

for word in db:
  synsets = db[word][0]['synsets'].split()
  for s in synsets:
      if len(s) > 3:
          try:
              os.makedirs("data/" + word + "/" + s)
          except:
              continue
  for i in range(len(db[word])):
      correct_synset = db[word][i]['correct_synset_id']
      if correct_synset != "-1":
        sentence = db[word][i]['sentence']
        with open("data/" + word + "/" + correct_synset  + "/" + str(i) + ".txt", "wt") as f:
            f.write(sentence)

In [15]:
walk = list(os.walk("data"))
for path, _, _ in walk[::-1]:
  if len(os.listdir(path)) == 0:
    os.rmdir(path)

In [6]:
!cat data/complice/ENG30-00452773-a/5.txt

Din ce am auzit, ea ar fi putut fi complicele evadatilor, a deschis usa care le permitea să iasă afară.

# Tensorflow model

In [7]:
import io
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import TextVectorization

In [8]:
def train_model_for_word(word, logging=False):

  classes_count = 0
  classes = []
  synsets = db[word][0]['synsets'].split()
  for s in synsets:
      if len(s) > 3:
        classes_count += 1
        classes.append(s)

  classes.sort()

  batch_size = 4
  seed = 123
  train_ds = tf.keras.utils.text_dataset_from_directory(
      'data/' + word, batch_size=batch_size, validation_split=0.2,
      subset='training', seed=seed)
  val_ds = tf.keras.utils.text_dataset_from_directory(
      'data/' + word, batch_size=batch_size, validation_split=0.2,
      subset='validation', seed=seed)
  
  AUTOTUNE = tf.data.AUTOTUNE
  train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
  val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)


  def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation), '')


  # Vocabulary size and number of words in a sequence.
  vocab_size = 10000
  sequence_length = 100

  # Use the text vectorization layer to normalize, split, and map strings to
  # integers. Note that the layer uses the custom standardization defined above.
  # Set maximum_sequence length as all samples are not of the same length.
  vectorize_layer = TextVectorization(
      standardize=custom_standardization,
      max_tokens=vocab_size,
      output_mode='int',
      output_sequence_length=sequence_length)

  # Make a text-only dataset (no labels) and call adapt to build the vocabulary.
  text_ds = train_ds.map(lambda x, y: x)
  vectorize_layer.adapt(text_ds)
  embedding_dim=64

  model = Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dense(classes_count, activation='softmax')
  ])

  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
  early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

  model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
  
  model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    callbacks=[tensorboard_callback, early_stop_callback]
    )
  
  
  evaluation = model.evaluate(val_ds)
  pred = []
  if logging is True:
    predictions = model.predict(val_ds)
    for i in range(len(predictions)):
      pred.append(predictions[i].tolist().index(max(predictions[i].tolist())))

    for i in val_ds.enumerate():
      counts = 0
      for s in list(i[1][0].numpy()):
        sentence = s.decode('utf-8')
        prediction = pred[counts]
        print("Sentence: " + sentence)
        print("Prediction: " + wn.synset(classes[pred[counts]]).definition)
        for k in range(len(db[word])):
          if db[word][k]['sentence'] == sentence:
            correct = wn.synset(db[word][k]['correct_synset_id'])
            print("Marked correct: " + correct.definition)
            break
        counts += 1


  return evaluation[1]

  #docs_infra: no_execute
  # %load_ext tensorboard
  # %tensorboard --logdir logs

In [9]:
train_model_for_word('bancă', logging=True)

Found 60 files belonging to 6 classes.
Using 48 files for training.
Found 60 files belonging to 6 classes.
Using 12 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Sentence: În al doilea rând, Italia neagă că există, de fapt, vreo dovadă de interferență din partea Băncii Italiei.
Prediction: Întreprindere financiară care efectuează operații de plată și de credit (și organizează circulația bănească).
Marked correct: Întreprindere financiară care efectuează operații de plată și de credit (și organizează circulația bănească).
Sentence: Asta este echipa administrativă din bancă.
Prediction: Întreprindere financiară care efectuează operații de plată și de credit (și organizează circulația bănească).
Marked correct: Întreprindere financiară care efectuează operații de plată și de credit (și organizează circulația bănească).
Sentence: Da, pentru că stând pe bancă, depui un efort istovitor.
Prediction: Întreprindere financiară care efectuează operații de plată și de credi

0.6666666865348816

In [17]:
words = ['secol', 'județ', 'comună', 'uniune', 'muncă', 'persoană', 'locuitor', 'biserică', 'nord', 'teritoriu', 'armată', 'film', 'stat', 'majoritate', 'activitate', 'rol', 'oraș', 'mod', 'echipă', 'război', 'dată', 'perioadă', 'companie', 'om', 'an', 'fapt', 'problemă', 'lună', 'membru', 'plan', 'măsură', 'interior', 'prezent', 'urmare', 'familie', 'nevoie', 'lume', 'regiune', 'apă', 'piață', 'sistem', 'limbă', 'ban', 'nivel', 'grup', 'vedere', 'caz', 'zonă', 'dezvoltare', 'ajutor', 'nume', 'timp', 'casă', 'număr', 'viață', 'valoare', 'conducere', 'drum', 'schimbare', 'apărare', 'lucru', 'sfârșit', 'urmă', 'fel', 'program', 'zi', 'joc', 'cadru', 'formă', 'forță', 'gol', 'bătaie', 'fals', 'schimb', 'acord', 'cădere', 'aripă', 'atac', 'calm', 'poziție', 'fin', 'serviciu', 'mediu', 'masă', 'putere', 'adânc', 'prost', 'semn', 'loc', 'mare', 'față', 'cap', 'liber', 'linie', 'bază', 'rău', 'parte', 'bun', 'legătură', 'drept']

acc_sum = 0
for word in words:
  try:
    acc = train_model_for_word(word)
    acc_sum += acc
  except:
    continue
acc_sum /= len(words)

print("Global accuracy: ")
print(acc_sum)

Found 20 files belonging to 2 classes.
Using 16 files for training.
Found 20 files belonging to 2 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 20 files belonging to 1 classes.
Using 16 files for training.
Found 20 files belonging to 1 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 10 files belonging to 2 classes.
Using 8 files for training.
Found 10 files belonging to 2 classes.
Using 2 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 10 files belonging to 2 classes.
Using 8 files for training.
Found 10 files belonging to 2 classes.
Using 2 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 20 files belonging to 2 classes.
Using 16 files for training.
Found 20 files belonging to 2 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 30 files belonging to 2 classes.
Using 24 files for training.
Found 30 

In [35]:
acc_sum = 0
limit = 1000
for word in db:
  try:
    acc = train_model_for_word(word)
    acc_sum += acc
    limit-=1
    if limit == 0:
      break
  except:
    continue
acc_sum /= len(words)

print("Global accuracy: ")
print(acc_sum)

[1;30;43mDatele de ieșire de afișat au fost trunchiate la ultimele 5000 linii.[0m
Found 20 files belonging to 1 classes.
Using 16 files for training.
Found 20 files belonging to 1 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 20 files belonging to 1 classes.
Using 16 files for training.
Found 20 files belonging to 1 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 30 files belonging to 3 classes.
Using 24 files for training.
Found 30 files belonging to 3 classes.
Using 6 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 20 files belonging to 2 classes.
Using 16 files for training.
Found 20 files belonging to 2 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 20 files belonging to 2 classes.
Using 16 files for training.
Found 20 files belonging to 2 classes.
Using 4 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epo

In [38]:
acc_sum = 6.339660405926407
acc_sum *= len(words)
acc_sum /= 1000

print("Global accuracy: ")
print(acc_sum)

Global accuracy: 
0.6339660405926406


# Antrenare cu dataset train - dev - test

In [29]:
!rm -rf train
!rm -rf validate

In [30]:
with open("dataset.pickle", "rb") as pickleFile:
    db = pickle.load(pickleFile)

for word in db:
  synsets = db[word][0]['synsets'].split()
  for s in synsets:
      if len(s) > 3:
          try:
              os.makedirs("train/" + word + "/" + s)
              os.makedirs("validate/" + word + "/" + s)
          except:
              continue

  sent_by_syn = dict()
  for i in range(len(db[word])):
      correct_synset = db[word][i]['correct_synset_id']
      if correct_synset != "-1":
        sentence = db[word][i]['sentence']
        if correct_synset not in sent_by_syn:
          sent_by_syn[correct_synset] = []
        sent_by_syn[correct_synset].append(sentence)

  for syn in sent_by_syn:
    list_sent = sent_by_syn[syn]
    l = len(list_sent)
    if(l >= 2):
      no_val = math.ceil(0.1 * l)
      for i in range(no_val):
        with open("validate/" + word + "/" + syn  + "/" + str(i) + ".txt", "wt") as f:
          f.write(sentence)
      for i in range(no_val, l):
        with open("train/" + word + "/" + syn  + "/" + str(i) + ".txt", "wt") as f:
          f.write(sentence)

walk = list(os.walk("train"))
for path, _, _ in walk[::-1]:
  if len(os.listdir(path)) == 0:
    os.rmdir(path)

walk = list(os.walk("validate"))
for path, _, _ in walk[::-1]:
  if len(os.listdir(path)) == 0:
    os.rmdir(path)

In [31]:
def train_model_for_word_with_val(word, logging=False):

  classes_count = 0
  classes = []
  synsets = db[word][0]['synsets'].split()
  for s in synsets:
      if len(s) > 3:
        classes_count += 1
        classes.append(s)

  classes.sort()

  batch_size = 4
  seed = 123
  train_ds = tf.keras.utils.text_dataset_from_directory(
      'train/' + word, batch_size=batch_size, validation_split=0.1,
      subset='training', seed=seed)
  test_ds = tf.keras.utils.text_dataset_from_directory(
      'train/' + word, batch_size=batch_size, validation_split=0.1,
      subset='validation', seed=seed)
  val_ds = tf.keras.utils.text_dataset_from_directory(
      'validate/' + word, batch_size=batch_size, validation_split=0.1,
      subset='validation', seed=seed)
  
  AUTOTUNE = tf.data.AUTOTUNE
  train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
  test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)
  val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)


  def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                    '[%s]' % re.escape(string.punctuation), '')


  # Vocabulary size and number of words in a sequence.
  vocab_size = 10000
  sequence_length = 100

  # Use the text vectorization layer to normalize, split, and map strings to
  # integers. Note that the layer uses the custom standardization defined above.
  # Set maximum_sequence length as all samples are not of the same length.
  vectorize_layer = TextVectorization(
      standardize=custom_standardization,
      max_tokens=vocab_size,
      output_mode='int',
      output_sequence_length=sequence_length)

  # Make a text-only dataset (no labels) and call adapt to build the vocabulary.
  text_ds = train_ds.map(lambda x, y: x)
  vectorize_layer.adapt(text_ds)
  embedding_dim=64

  model = Sequential([
    vectorize_layer,
    Embedding(vocab_size, embedding_dim, name="embedding"),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dense(classes_count, activation='softmax')
  ])

  tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")
  early_stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

  model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics=['accuracy'])
  
  model.fit(
    train_ds,
    validation_data=test_ds,
    epochs=100,
    callbacks=[tensorboard_callback, early_stop_callback]
    )
  
  
  evaluation = model.evaluate(val_ds)
  pred = []
  if logging is True:
    predictions = model.predict(val_ds)
    for i in range(len(predictions)):
      pred.append(predictions[i].tolist().index(max(predictions[i].tolist())))

    for i in val_ds.enumerate():
      counts = 0
      for s in list(i[1][0].numpy()):
        sentence = s.decode('utf-8')
        prediction = pred[counts]
        print("Sentence: " + sentence)
        print("Prediction: " + wn.synset(classes[pred[counts]]).definition)
        for k in range(len(db[word])):
          if db[word][k]['sentence'] == sentence:
            correct = wn.synset(db[word][k]['correct_synset_id'])
            print("Marked correct: " + correct.definition)
            break
        counts += 1


  return evaluation[1]

In [32]:
train_model_for_word_with_val('bancă', logging=True)

Found 51 files belonging to 3 classes.
Using 46 files for training.
Found 51 files belonging to 3 classes.
Using 5 files for validation.
Found 7 files belonging to 3 classes.
Using 0 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Sentence: Teritoriul ocuăpat de satul Banca s-a extins în timp și cu alte zone locuite de clăcașii aduși de familiile Costache și Lambrino.
Prediction: Scaun lung pentru două sau mai multe persoane așezată în grădini, parcuri etc.
Marked correct: Clădire unde își desfășoară activitatea o bancă comercială
Sentence: Teritoriul ocuăpat de satul Banca s-a extins în timp și cu alte zone locuite de clăcașii aduși de familiile Costache și Lambrino.
Prediction: Scaun lung pentru două sau mai multe persoane așezată în grădini, parcuri etc.
Marked correct: Clădire unde își desfășoară activitatea o bancă comercială
Sentence: Teritoriul ocuăpat de satul Banca s-a extins în timp și cu alte zone locuite de clăcașii aduși de familiile Costache și Lambr

0.5714285969734192

In [34]:
words = ['secol', 'județ', 'comună', 'uniune', 'muncă', 'persoană', 'locuitor', 'biserică', 'nord', 'teritoriu', 'armată', 'film', 'stat', 'majoritate', 'activitate', 'rol', 'oraș', 'mod', 'echipă', 'război', 'dată', 'perioadă', 'companie', 'om', 'an', 'fapt', 'problemă', 'lună', 'membru', 'plan', 'măsură', 'interior', 'prezent', 'urmare', 'familie', 'nevoie', 'lume', 'regiune', 'apă', 'piață', 'sistem', 'limbă', 'ban', 'nivel', 'grup', 'vedere', 'caz', 'zonă', 'dezvoltare', 'ajutor', 'nume', 'timp', 'casă', 'număr', 'viață', 'valoare', 'conducere', 'drum', 'schimbare', 'apărare', 'lucru', 'sfârșit', 'urmă', 'fel', 'program', 'zi', 'joc', 'cadru', 'formă', 'forță', 'gol', 'bătaie', 'fals', 'schimb', 'acord', 'cădere', 'aripă', 'atac', 'calm', 'poziție', 'fin', 'serviciu', 'mediu', 'masă', 'putere', 'adânc', 'prost', 'semn', 'loc', 'mare', 'față', 'cap', 'liber', 'linie', 'bază', 'rău', 'parte', 'bun', 'legătură', 'drept']

acc_sum = 0
for word in words:
  try:
    acc = train_model_for_word_with_val(word)
    acc_sum += acc
  except:
    continue
acc_sum /= len(words)

print("Global accuracy: ")
print(acc_sum)

Found 17 files belonging to 2 classes.
Using 16 files for training.
Found 17 files belonging to 2 classes.
Using 1 files for validation.
Found 3 files belonging to 2 classes.
Using 0 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 18 files belonging to 1 classes.
Using 17 files for training.
Found 18 files belonging to 1 classes.
Using 1 files for validation.
Found 2 files belonging to 1 classes.
Using 0 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 8 files belonging to 2 classes.
Using 8 files for training.
Found 8 files belonging to 2 classes.
Using 8 files for training.
Found 17 files belonging to 2 classes.
Using 16 files for training.
Found 17 files belonging to 2 classes.
Using 1 files for validation.
Found 3 files belonging to 2 classes.
Using 0 files for validation.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Found 26 files belonging to 1 classes.
Using 24 files for training.
Found 26 files belonging to 1 classes.
Us