In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import TextVectorization


In [7]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
max_features = 5000  # Maximum vocab size.
max_len = 4  # Sequence length to pad the outputs to.

embedding_dims = 2

# Create the layer.  
vectorize_layer = TextVectorization(
 max_tokens=max_features,
 output_mode='int',
 output_sequence_length=max_len)

# Now that the vocab layer has been created, call `adapt` on the text-only  
# dataset to create the vocabulary. You don't have to batch, but for large  
# datasets this means we're not keeping spare copies of the dataset.  
vectorize_layer.adapt(text_dataset.batch(64))

# Create the model that uses the vectorize text layer  
model = tf.keras.models.Sequential()

# Start by creating an explicit input layer. It needs to have a shape of  
# (1,) (because we need to guarantee that there is exactly one string  
# input per batch), and the dtype needs to be 'string'.  
model.add(tf.keras.Input(shape=(1,), dtype=tf.string))

# The first layer in our model is the vectorization layer. After this  
# layer, we have a tensor of shape (batch_size, max_len) containing vocab  
# indices.  
model.add(vectorize_layer)

# Now, the model can map strings to integers, and you can add an embedding  
# layer to map these integers to learned embeddings.  
input_data = [["foo qux bar"], ["qux baz"]]
model.predict(input_data)
#array([[2, 1, 4, 0],
#       [1, 3, 0, 0]])


2023-07-09 16:45:51.097347: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




array([[2, 1, 4, 0],
       [1, 3, 0, 0]])

In [3]:
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import TextVectorization

MAX_LEN = 26
MAX_TOKENS = 20000
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])
preprocessing_layer = TextVectorization(output_sequence_length=26, max_tokens=MAX_TOKENS)
preprocessing_layer
model = Sequential([
    Input(shape = (1, ), dtype=tf.string),
    preprocessing_layer
])
input_data = [["foo qux bar"], ["qux baz"]]
#model.predict(input_data)

In [10]:
text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"])

#input_data = [["foo qux bar"], ["qux baz"]]
#model.predict(input_data)

In [4]:
text_dataset = tf.data.Dataset.from_tensor_slices(["titi toto avion tutu tata tztz xylophone"])#(["foo foo", "bar", "baz"])
preprocessing_layer = TextVectorization(max_tokens=9, output_sequence_length=4, output_mode='int')
preprocessing_layer.adapt(text_dataset.batch(32))
model = Sequential(
[
    Input(shape = (1, ), dtype=tf.string),
    preprocessing_layer
])
phrases = [["titi toto tutu"], ["tata tztz"], ["titi toto tutu"], ["toto tutu titi"], ["chacha manoir"], 
["avion titi"] ,["xylophone"]]
res = model.predict(phrases)
print(res.shape)
res


(7, 4)


2023-07-12 10:13:33.707408: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


array([[6, 5, 4, 0],
       [7, 3, 0, 0],
       [6, 5, 4, 0],
       [5, 4, 6, 0],
       [1, 1, 0, 0],
       [8, 6, 0, 0],
       [2, 0, 0, 0]])

In [5]:
wrd_i = {}
for phi,ph in enumerate(phrases):
    wrds = ph[0].split(" ")
    print(wrds, res[phi][:len(wrds)])
    for wi in range(len(wrds)):

        if res[phi][wi] not in wrd_i.keys():
            wrd_i[res[phi][wi]] = set()
        wrd_i[res[phi][wi]].add(wrds[wi])
wrd_i

['titi', 'toto', 'tutu'] [6 5 4]
['tata', 'tztz'] [7 3]
['titi', 'toto', 'tutu'] [6 5 4]
['toto', 'tutu', 'titi'] [5 4 6]
['chacha', 'manoir'] [1 1]
['avion', 'titi'] [8 6]
['xylophone'] [2]


{6: {'titi'},
 5: {'toto'},
 4: {'tutu'},
 7: {'tata'},
 3: {'tztz'},
 1: {'chacha', 'manoir'},
 8: {'avion'},
 2: {'xylophone'}}

In [7]:
from tensorflow.keras.layers import Embedding, Dense

N_CLASSES = 3

text_dataset = tf.data.Dataset.from_tensor_slices(["titi toto avion tutu tata tztz xylophone"])#(["foo foo", "bar", "baz"])
preprocessing_layer = TextVectorization(max_tokens=9, output_sequence_length=4, output_mode='int')
preprocessing_layer.adapt(text_dataset.batch(32))

# embedding a 1000 words vocab in 5 words
embedding_layer = Embedding(1000, 5)

model = Sequential(
[
    Input(shape = (1, ), dtype=tf.string),
    preprocessing_layer,
    embedding_layer,
    Dense(16, activation='relu'),
    Dense(N_CLASSES, activation='softmax')
])

phrases = [["titi toto tutu"], ["tata tztz"], ["titi toto tutu"], ["toto tutu titi"], ["chacha manoir"], 
["avion titi"] ,["xylophone"]]
res = model.predict(phrases)
print(res.shape)
res


#text_dataset = tf.data.Dataset.from_tensor_slices(["titi toto avion tutu tata tztz xylophone"])
#preprocessing_layer(text_dataset)

(7, 4, 5)


array([[[ 0.03037127,  0.00637014,  0.00581019, -0.01834431,
         -0.04895836],
        [-0.03744565, -0.02089571,  0.04181359,  0.04433841,
         -0.02651073],
        [-0.02376949, -0.04465935, -0.01542781, -0.00890397,
         -0.01439738],
        [-0.00313711, -0.04297994, -0.00152006,  0.03151139,
          0.04764477]],

       [[ 0.03911985,  0.01901734, -0.01242948,  0.00561763,
         -0.01996814],
        [-0.04283087,  0.02748943, -0.04682617, -0.04292084,
         -0.02758493],
        [-0.00313711, -0.04297994, -0.00152006,  0.03151139,
          0.04764477],
        [-0.00313711, -0.04297994, -0.00152006,  0.03151139,
          0.04764477]],

       [[ 0.03037127,  0.00637014,  0.00581019, -0.01834431,
         -0.04895836],
        [-0.03744565, -0.02089571,  0.04181359,  0.04433841,
         -0.02651073],
        [-0.02376949, -0.04465935, -0.01542781, -0.00890397,
         -0.01439738],
        [-0.00313711, -0.04297994, -0.00152006,  0.03151139,
          0

In [11]:
from tensorflow_hub import KerasLayer
SWIVEL = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1"

swivel_module = KerasLayer(SWIVEL)

In [19]:
swweight = swivel_module.get_weights()
swweight

[array([[ 0.01362136,  0.36039677,  0.4090194 , ..., -0.1157171 ,
         -0.59178233,  0.16092265],
        [-0.07641025, -0.26564464,  0.44310385, ..., -0.37381256,
         -0.1619345 ,  0.0031434 ],
        [ 0.55277616,  0.34195077, -0.45784146, ...,  0.18172583,
          0.05937071, -0.13633412],
        ...,
        [ 0.92592955,  0.43907115, -0.13908613, ..., -0.1368552 ,
          0.58673716, -0.29491594],
        [ 1.1792232 ,  0.08465543,  0.6624093 , ..., -0.48487076,
         -0.98878735, -0.57055944],
        [ 1.1646788 , -0.20625216,  0.07113393, ..., -0.33484292,
         -0.9706867 , -0.4425568 ]], dtype=float32)]

In [22]:
print(len(swweight))
swweight[0].shape

1


(19469, 20)

In [25]:
swivel_module(["the car is big"])

<tf.Tensor: shape=(1, 20), dtype=float32, numpy=
array([[ 0.6402509 , -0.320366  ,  0.39814392,  0.10049738, -0.20162806,
         0.04523668, -0.702852  , -0.1462781 , -0.18485829, -1.2666786 ,
        -0.72979474,  1.0004414 , -0.22212121, -0.56271666, -0.58339417,
         1.1507376 ,  0.70721537,  0.01393132, -0.54767364, -0.05641793]],
      dtype=float32)>

In [26]:
swivel_module(["the car is big", "the dress is red"])

<tf.Tensor: shape=(2, 20), dtype=float32, numpy=
array([[ 0.6402509 , -0.320366  ,  0.39814392,  0.10049738, -0.20162806,
         0.04523668, -0.702852  , -0.1462781 , -0.18485829, -1.2666786 ,
        -0.72979474,  1.0004414 , -0.22212121, -0.56271666, -0.58339417,
         1.1507376 ,  0.70721537,  0.01393132, -0.54767364, -0.05641793],
       [ 1.0415261 ,  0.9814507 , -0.27379286,  0.30891693, -0.97580093,
         1.0505939 , -1.1736104 ,  0.8999038 ,  1.1177965 , -0.50735945,
        -0.9517901 ,  1.0200392 , -0.7684101 , -0.31442934, -0.57985973,
        -0.72479   , -0.6812028 , -0.8634784 , -1.30937   , -0.39851046]],
      dtype=float32)>

In [40]:
from tensorflow.keras.layers import Embedding, Dense


model_sw = Sequential(
[
    #Input(shape = (None, 1, ), dtype=tf.string),
    #tf.data.Dataset.from_tensor_slices(),
    swivel_module,
    Dense(16, activation='relu'),
    Dense(2, activation='softmax')
    
])

In [43]:
model_sw.predict(["the car is big"])



array([[0.14964916, 0.8503508 ]], dtype=float32)