In [1]:
!pip install apache-beam

Collecting apache-beam
  Downloading apache_beam-2.31.0-cp37-cp37m-manylinux2010_x86_64.whl (9.7 MB)
[K     |████████████████████████████████| 9.7 MB 22.1 MB/s 
Collecting hdfs<3.0.0,>=2.1.0
  Downloading hdfs-2.6.0-py3-none-any.whl (33 kB)
Collecting requests<3.0.0,>=2.24.0
  Downloading requests-2.26.0-py2.py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 838 kB/s 
Collecting fastavro<2,>=0.21.4
  Downloading fastavro-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 39.1 MB/s 
[?25hCollecting future<1.0.0,>=0.18.2
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 43.1 MB/s 
Collecting dill<0.3.2,>=0.3.1.1
  Downloading dill-0.3.1.1.tar.gz (151 kB)
[K     |████████████████████████████████| 151 kB 48.0 MB/s 
Collecting avro-python3!=1.9.2,<1.10.0,>=1.8.1
  Downloading avro-python3-1.9.2.1.tar.gz (37 kB)
Building wheels for collected packages: 

In [2]:
import argparse
import logging
import re

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions

import numpy as np
import tensorflow as tf

In [3]:
!head /content/drive/MyDrive/experiment_nlp/glove_initializer/glove.6B/glove.6B.50d.txt >> glove_sample.txt

In [5]:
inputs = '/content/glove_sample.txt'
OUTPUT_PATH = '/content/glove_sample.tfrecord'


def process(element):
  element = element.strip().split(' ')
  return element[0], ' '.join(element[1:]).strip()

class Key_value_combiner(beam.CombineFn):
  def create_accumulator(self):
    return {'key': [], 'value': []}

  def add_input(self, accumulator, input):
    accumulator['key'].append(input[0].encode())
    accumulator['value'].append(input[1].encode())
    return accumulator

  def merge_accumulators(self, accumulators):

      merged = {}
      for accum in accumulators:
        for item, value in accum.items():
          if item not in merged:
            merged[item] = []
          merged[item].append(value)
      return merged

  @staticmethod
  def _bytes_feature(value):
    if isinstance(value, list):
      return tf.train.Feature(bytes_list=tf.train.BytesList(value = value))  
    return tf.train.Feature(bytes_list=tf.train.BytesList(value = [value]))

  def extract_output(self, accumulator):
    accumulator =  {key :  self._bytes_feature(value[0]) for key, value in accumulator.items()} 
    example = tf.train.Example(
                features=tf.train.Features(
                      feature=accumulator))
    return example.SerializeToString()

with beam.Pipeline() as p:
    # Read the text file[pattern] into a PCollection.
    lines = (p | 'Read' >> ReadFromText(inputs)
               | 'MapKeyPare' >> beam.Map(process)
               | 'CombineAll' >> beam.CombineGlobally(Key_value_combiner())
               | 'write' >> beam.io.tfrecordio.WriteToTFRecord(
                            file_path_prefix=OUTPUT_PATH,
                            num_shards=1)
                                        )





In [27]:
import tensorflow as tf
import numpy as np


class GloveEmbedding:
    def __init__(self, vocab, glove_file, size):
        self._vocab = vocab
        self.glove_file = glove_file
        self.size = size
    
    def __load_glove_model(self, glove_file):
        dataset = tf.data.TFRecordDataset(self.glove_file)
        raw_data = tf.data.experimental.get_single_element(dataset)
        keypair = tf.io.parse_example(raw_data, {'key' : tf.io.VarLenFeature(dtype = tf.string), 
                               'value': tf.io.VarLenFeature(dtype = tf.string)})
        keypair = {key : tf.sparse.to_dense(value) for key, value in keypair.items()}
        print(keypair['key'])
        initializer = tf.lookup.KeyValueTensorInitializer(
            keys = keypair['key'], values = keypair['value'])
        
        self.embedding_lookup = tf.lookup.StaticHashTable(initializer=initializer, default_value=tf.constant(''))
        del keypair

    def get_embeddings(self):
      def convert_to_tensor(x):
        if x ==  b'':
          return tf.random.normal(size = self.size,dtype=tf.float32)
        else:
          print('in')
          return tf.strings.to_number(tf.strings.split(tf.strings.strip(x), ' '))

      self.__load_glove_model(self.glove_file)
      embeddings = self.embedding_lookup.lookup(tf.constant(self._vocab))
      embeddings = tf.map_fn(convert_to_tensor, embeddings, tf.float32)
      return tf.compat.v1.constant(embeddings, tf.float32)

In [28]:
gv = GloveEmbedding(['"', ',', 'a'], '/content/glove_sample.tfrecord-00000-of-00001', (50, ))

In [30]:
embd = gv.get_embeddings()

tf.Tensor([b'the' b',' b'.' b'of' b'to' b'and' b'in' b'a' b'"' b"'s"], shape=(10,), dtype=string)
in
in
in


In [32]:
embd

<tf.Tensor: shape=(3, 50), dtype=float32, numpy=
array([[ 0.25769 ,  0.45629 , -0.76974 , -0.37679 ,  0.59272 , -0.063527,
         0.20545 , -0.57385 , -0.29009 , -0.13662 ,  0.32728 ,  1.4719  ,
        -0.73681 , -0.12036 ,  0.71354 , -0.46098 ,  0.65248 ,  0.48887 ,
        -0.51558 ,  0.039951, -0.34307 , -0.014087,  0.86488 ,  0.3546  ,
         0.7999  , -1.4995  , -1.8153  ,  0.41128 ,  0.23921 , -0.43139 ,
         3.6623  , -0.79834 , -0.54538 ,  0.16943 , -0.82017 , -0.3461  ,
         0.69495 , -1.2256  , -0.17992 , -0.057474,  0.030498, -0.39543 ,
        -0.38515 , -1.0002  ,  0.087599, -0.31009 , -0.34677 , -0.31438 ,
         0.75004 ,  0.97065 ],
       [ 0.013441,  0.23682 , -0.16899 ,  0.40951 ,  0.63812 ,  0.47709 ,
        -0.42852 , -0.55641 , -0.364   , -0.23938 ,  0.13001 , -0.063734,
        -0.39575 , -0.48162 ,  0.23291 ,  0.090201, -0.13324 ,  0.078639,
        -0.41634 , -0.15428 ,  0.10068 ,  0.48891 ,  0.31226 , -0.1252  ,
        -0.037512, -1.5179  ,  0