In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import tqdm
from joblib import Parallel, delayed
import string
import pickle
import random

In [None]:
# Seed every RNG the data pipeline draws from. random.shuffle uses the stdlib
# generator, while the sub-sampling and negative-sampling draws in get_gen use
# np.random.rand(), which random.seed() does not affect.
random.seed(25)
np.random.seed(25)

In [None]:
# Short alias for tf.data's auto-tuning sentinel; get_gen passes it to Dataset.prefetch.
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
def load_keep_probs(keep_prob_path):
    """Load the pickled word -> keep-probability mapping, ordered by probability.

    Args:
        keep_prob_path: path to a pickle file holding a dict of word -> probability.

    Returns:
        A new dict with the same entries, ordered by ascending probability
        (ties keep the order they had in the pickled dict).
    """
    with open(keep_prob_path, 'rb') as handle:
        raw_probs = pickle.load(handle)
    # Lowest keep probability first (the original author's note: most frequent to least).
    ordered_words = sorted(raw_probs, key=raw_probs.get)
    return {word: raw_probs[word] for word in ordered_words}

In [None]:
def get_gen(lines_path, keep_prob_path, char_set, max_word_len, window_size=2, neg_sample_size = 5,
            line_fraction=0.50, lines_per_batch=200):
    """Build a tf.data pipeline of character-encoded skip-gram training batches.

    Args:
        lines_path: path to a pickled list of text lines (strings split on " ").
        keep_prob_path: path to the pickled word -> keep-probability dict read
            by load_keep_probs; its keys double as the negative-sampling vocabulary.
        char_set: sequence of characters; a character is encoded as the index of
            its first occurrence, anything else (and padding) as -1.
        max_word_len: every word is truncated / right-padded to this many indices.
        window_size: number of neighbours on each side used as positive contexts.
        neg_sample_size: negatives drawn per target word that produced at least
            one positive pair.
        line_fraction: fraction of the shuffled lines that is kept.
        lines_per_batch: number of lines whose samples are grouped into one batch.

    Returns:
        A prefetched tf.data.Dataset yielding ((targets, contexts), labels) with
        int32 shapes (n, max_word_len), (n, max_word_len) and (n, 1); n varies
        from batch to batch.

    Raises:
        ValueError: if lines_per_batch is smaller than 1.
    """
    if lines_per_batch < 1:
        raise ValueError("lines_per_batch must be a positive integer")

    # NOTE(security): pickle.load can execute arbitrary code; only use trusted files here.
    with open(lines_path, 'rb') as f:
        lines = pickle.load(f)
    keep_probs = load_keep_probs(keep_prob_path)
    words = list(keep_probs.keys())
    word_count = len(words)

    random.shuffle(lines)
    lines = lines[:int(len(lines) * line_fraction)]
    print("Number of lines {}".format(len(lines)))

    # One-off lookup table instead of a linear char_set.index() scan per character.
    # setdefault keeps the FIRST position of a repeated character, like index() does.
    char_to_idx = {}
    for position, char in enumerate(char_set):
        char_to_idx.setdefault(char, position)

    def to_idx(word):
        """Encode a word as exactly max_word_len character indices (-1 = unknown/pad)."""
        encoded = [char_to_idx.get(c, -1) for c in word[:max_word_len]]
        encoded += [-1] * (max_word_len - len(encoded))
        return encoded

    def to_batch(targets, contexts, labels):
        """Convert the accumulated sample lists into the arrays the signature expects."""
        return ((np.array(targets, dtype=np.int32),
                 np.array(contexts, dtype=np.int32)),
                np.expand_dims(np.array(labels, dtype=np.int32), axis=-1))

    def gen():
        targets, contexts, labels = [], [], []
        for line_no, line in enumerate(lines, start=1):
            # Strip every token once so targets and contexts are cleaned the same way.
            tokens = [token.rstrip() for token in line.split(" ")]
            for i, word in enumerate(tokens):
                # Words without a keep probability produce no samples at all.
                if word not in keep_probs:
                    continue
                prob = keep_probs[word]
                target_idx = to_idx(word)

                # Positive pairs: each in-window neighbour is kept with probability `prob`.
                kept_any = False
                first = max(0, i - window_size)
                last = min(len(tokens), i + window_size + 1)
                for j in range(first, last):
                    if j == i or np.random.rand() >= prob:
                        continue
                    targets.append(target_idx)
                    contexts.append(to_idx(tokens[j]))
                    labels.append(1)
                    kept_any = True

                # Negatives only accompany a word that contributed a positive pair;
                # the decision is local to this word, never left over from a previous one.
                if not kept_any:
                    continue
                for _ in range(neg_sample_size):
                    # |u - 0.25| with u in [0, 1) lies in [0, 0.75): the first quarter of
                    # `words` is hit twice as often as the next half, the last quarter never.
                    idx = int(abs(np.random.rand() - 0.25) * word_count)
                    targets.append(target_idx)
                    contexts.append(to_idx(words[idx]))
                    labels.append(0)

            # An empty batch would have shape (0,) and violate the output signature,
            # so keep accumulating until there is something to emit.
            if line_no % lines_per_batch == 0 and targets:
                yield to_batch(targets, contexts, labels)
                targets, contexts, labels = [], [], []

        # Emit the samples of the trailing, partially filled group of lines too.
        if targets:
            yield to_batch(targets, contexts, labels)

    generator = tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            (tf.TensorSpec(shape=(None, max_word_len), dtype=tf.int32),
             tf.TensorSpec(shape=(None, max_word_len), dtype=tf.int32)),
            tf.TensorSpec(shape=(None, 1), dtype=tf.int32),
        ))
    generator = generator.prefetch(AUTOTUNE)
    return generator


In [None]:
# Pickled inputs passed to get_gen: the text lines and the word -> keep-probability dict.
# NOTE(review): the ".picke" extension looks like a typo for ".pickle" — confirm against the
# actual file names on disk before renaming anything.
line_path = "data/lines.picke"
keep_probs_path = "data/keep_probs.picke"

In [None]:
# Character vocabulary: ASCII printable characters plus the Turkish letters.
# The Turkish suffix used to repeat ASCII "i" (already in string.printable), which
# left one slot unreachable and dotless "ı" unencodable. "ı" now takes that slot;
# every other character keeps its index and the vocabulary size is unchanged.
data_gen = get_gen(
    line_path,
    keep_probs_path,
    char_set=string.printable + "üÜıİöÖğĞşŞçÇ",
    max_word_len=15,
    window_size=1,
    neg_sample_size=5,
)

Number of lines 4575670


In [None]:
# Run the notebook up to this cell, then copy the printed "Number of lines" value into num_lines below.

In [None]:
# Character vocabulary used to size the one-hot encoding: ASCII printable characters
# plus the Turkish letters. Dotless "ı" replaces a second ASCII "i" that duplicated an
# entry of string.printable; the size is the same as before.
char_set = string.printable + "üÜıİöÖğĞşŞçÇ"
char_size = len(char_set)
# Must equal the "Number of lines" value printed by get_gen (4575670 in the saved
# output); a larger value makes steps_per_epoch exceed the batches the generator yields.
num_lines = 4575670

In [None]:
class Submodel(tf.keras.Model):
  """Subclassed Keras model that wraps a single 100-unit LSTM layer.

  NOTE(review): nothing in the visible cells instantiates this class; the
  encoders used for training come from get_submodel().
  """

  def __init__(self, input_shape, **kwargs):
    """Create the LSTM; `input_shape` is forwarded to the layer, `kwargs` to Model."""
    super().__init__(**kwargs)
    self.LSTM = tf.keras.layers.LSTM(units=100, input_shape=input_shape)

  def call(self, inputs):
    """Run the inputs through the LSTM and return its output."""
    encoded = self.LSTM(inputs)
    return encoded

In [None]:
def get_submodel(max_word_len=15, units=100):
  """Build one word encoder: a single LSTM over one-hot encoded characters.

  Args:
    max_word_len: number of character positions per word (the LSTM time steps).
    units: size of the vector the LSTM returns for each word.

  Returns:
    A tf.keras.Model taking float32 input of shape
    (batch, max_word_len, len(char_set)) and returning the LSTM output.
  """
  # Named `chars` rather than `input` so the Python builtin is not shadowed.
  chars = tf.keras.layers.Input(shape=(max_word_len, len(char_set)), dtype=tf.float32)
  encoded = tf.keras.layers.LSTM(units)(chars)
  return tf.keras.Model(inputs=chars, outputs=encoded)

In [None]:
# Two separately built encoders (each call creates its own layers): one for target
# words and one for context words. Both are used inside create_model below.
target_net = get_submodel()
context_net = get_submodel()

def create_model():
  """Build and compile the two-encoder skip-gram classifier.

  Both inputs are int32 character-index sequences of length 15. Each is one-hot
  encoded to `char_size` channels, passed through its own encoder (the
  module-level `target_net` / `context_net`), and the two word vectors are
  combined with a dot product. The output is a raw logit, not a probability.

  Returns:
    A compiled tf.keras.Model with inputs [target, context].
  """
  # The length 15 must match the encoders' input shape and the generator's max_word_len.
  target_input = tf.keras.layers.Input(shape=(15,), name="target", dtype=tf.int32)
  context_input = tf.keras.layers.Input(shape=(15,), name="context", dtype=tf.int32)

  # Index -1 (padding / unknown character) is outside [0, char_size) and becomes an
  # all-zero one-hot row.
  target = tf.one_hot(target_input, char_size)
  context = tf.one_hot(context_input, char_size)

  target = target_net(target)
  context = context_net(context)

  output = tf.keras.layers.Dot(axes=-1)([target, context])
  model = tf.keras.Model(inputs=[target_input, context_input], outputs=output)
  model.compile(optimizer='adam',
                loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                # The model outputs logits, so the decision boundary is 0.0
                # (sigmoid(0) == 0.5). The plain 'accuracy' metric would threshold
                # the raw logit at 0.5 instead.
                metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

  return model

# Build the compiled model that wraps target_net and context_net.
model = create_model()

In [None]:
# Print the layer table (names, output shapes, parameter counts, connections).
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
target (InputLayer)             [(None, 15)]         0                                            
__________________________________________________________________________________________________
context (InputLayer)            [(None, 15)]         0                                            
__________________________________________________________________________________________________
tf.one_hot (TFOpLambda)         (None, 15, 112)      0           target[0][0]                     
__________________________________________________________________________________________________
tf.one_hot_1 (TFOpLambda)       (None, 15, 112)      0           context[0][0]                    
____________________________________________________________________________________________

In [None]:
# get_gen emits one batch per 200 lines, so one epoch is num_lines // 200 steps.
model.fit(data_gen, epochs=1, steps_per_epoch=num_lines // 200)

    3/32029 [..............................] - ETA: 68:43:07 - loss: 0.6929 - accuracy: 0.7220

KeyboardInterrupt: ignored

In [None]:
# Save the target-word encoder on its own, as an HDF5 (.h5) file.
target_net.save('target.h5')

In [None]:
# Save the context-word encoder on its own, as an HDF5 (.h5) file.
context_net.save('context.h5')

In [None]:
# Save the full two-encoder model, as an HDF5 (.h5) file.
model.save('model.h5')

In [None]:
# Colab-only helper, imported here so the cells above still run outside Colab.
from google.colab import files

# Download the three saved model files to the local machine, in this order.
for saved_file in ('target.h5', 'context.h5', 'model.h5'):
    files.download(saved_file)