In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# data and saved-model files used by the later cells can be reached by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd drive/MyDrive/

/content/drive/MyDrive


In [None]:
# Ask TensorFlow which GPU devices it can see. Being the last expression of the
# cell, the returned list is displayed as the cell output.
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

batch_size = 62  # batch size
latent_dim = 128  # latent dimensionality of the encoding space
num_samples = 10000  # number of samples to train on (not referenced in this cell)
max_target_words = 6  # only the first 6 words of a line are labelled in the target

# data path
data_path = "train_y.csv"

# Read the corpus once: every line yields both the input and the target.
# Blank lines (for example the empty string that split("\n") leaves after a
# trailing newline) are skipped so they do not become empty training samples.
with open(data_path, "r", encoding="utf-8") as f:
    lines = [line for line in f.read().split("\n") if line.strip()]
# alias kept because the original cell exposed both names
target_lines = lines

# vectorize the data
input_texts = []
target_texts = []

# sets of unique characters; " " is always included because the encoding step
# uses it as the padding symbol
input_characters = {" "}
target_characters = {" "}

for line in lines:
    words = line.split()

    # train x: the sentence with all whitespace removed
    input_text = "".join(words)

    # train y: per character, '1' marks the first letter of a word and '0' any
    # other letter, for the first `max_target_words` words only
    target_text = "".join(
        "1" + "0" * (len(word) - 1) for word in words[:max_target_words]
    )

    # "tab" as the "start sequence" character
    # for the targets, and "\n" as "end sequence" character
    target_text = "\t" + target_text + "\n"

    input_texts.append(input_text)
    target_texts.append(target_text)
    input_characters.update(input_text)
    target_characters.update(target_text)

# convert the alphabet sets to sorted lists of characters
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)

# number of distinct input and output tokens
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)

# max input and output length in the train set
max_encoder_seq_length = max(len(txt) for txt in input_texts)
# the decoded characters are 1 signalling the start of a word, and 0 signalling inside a word
max_decoder_seq_length = max(len(txt) for txt in target_texts)

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

# look-up tables from a character to its index
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# empty arrays for the one-hot encoding:
# (samples, longest sequence, number of tokens)
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

# one hot encoding
encoder_pad = input_token_index[" "]
decoder_pad = target_token_index[" "]
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    # Pad from the end of the text. The length is used rather than the loop
    # variable because `t` is stale (or undefined) when the text is empty.
    encoder_input_data[i, len(input_text):, encoder_pad] = 1.0

    for t, char in enumerate(target_text):
        # decoder_target_data is ahead of decoder_input_data by one timestep
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            # decoder_target_data will be ahead by one timestep
            # and will not include the start character
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0

    # pad the remainder of both decoder arrays; the target is one step shorter
    # because it omits the start character
    decoder_input_data[i, len(target_text):, decoder_pad] = 1.0
    decoder_target_data[i, len(target_text) - 1:, decoder_pad] = 1.0

# building the model
# NOTE: the inference cell further down picks layers out of the reloaded model
# by their position in model.layers, so the order in which the layers are
# created and connected here must stay stable.
# define an input sequence to process it
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
# Bidirectional LSTM encoder; with return_state=True the call returns the
# output plus four state tensors (h and c for each direction)
encoder = keras.layers.Bidirectional(keras.layers.LSTM(latent_dim, return_state=True))
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoder_inputs)
# join the forward and backward states so the decoder receives one h and one c
state_h = keras.layers.Concatenate()([forward_h, backward_h])
state_c = keras.layers.Concatenate()([forward_c, backward_c])

# discard encoder_outputs and only keep the states
encoder_states = [state_h, state_c]

# set up the decoder using encoder_states as initial state
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# we set up our decoder to return full output sequences, and to return internal states as well
# we don't use the return states in the training model, but we will use them in inference
# the decoder has 2*latent_dim units so its state size matches the concatenated encoder states
decoder_lstm = keras.layers.LSTM(2*latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# per-timestep softmax over the target vocabulary
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# define the model that will turn
# encoder_input_data and decoder_input_data into decoder_target_data
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# train the model
opt = keras.optimizers.Adam(0.003)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=[
        "accuracy",
        # Explicit names keep the history keys stable. Auto-generated names get
        # a numeric suffix when this cell is re-run in the same kernel, which
        # would break the 'val_recall' / 'val_precision' look-ups below.
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.Precision(name="precision"),
    ],
)
epochs = 40
model_data = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

# saving the model if need to access the best result
model.save("word_seg8")

# calculating the f score for the final epoch
# NOTE(review): no mask or sample weights are passed to fit(), so these metrics
# also cover the padded timesteps - confirm that this is the intended score.
Precision = model_data.history['val_precision'][-1]
Recall = model_data.history['val_recall'][-1]

# guard against 0/0 when the model predicts no positives at all
if Precision + Recall:
    f1score = 2*Precision*Recall/(Precision+Recall)
else:
    f1score = 0.0
print("F1 score ", f1score)

Number of samples: 49374
Number of unique input tokens: 27
Number of unique output tokens: 5
Max sequence length for inputs: 834
Max sequence length for outputs: 63
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40




INFO:tensorflow:Assets written to: word_seg8/assets


INFO:tensorflow:Assets written to: word_seg8/assets


F1 score  0.9871510562469401


In [None]:
# define sampling models
# reload the trained model from disk and rebuild a stand-alone encoder and decoder from its layers
model = keras.models.load_model("word_seg8")

# print the layer list and the Bidirectional layer's outputs; the positional
# look-ups below (1, 3, 4, 5, 6) refer to this layer list
print(model.layers)
bidirectional_outputs = model.layers[1].output
print(bidirectional_outputs)

# encoder: sentence in, concatenated (h, c) states out
encoder_inputs = model.input[0]  # input_1
encoder_outputs, _, _, _, _ = bidirectional_outputs
state_h_enc = model.layers[3].output
state_c_enc = model.layers[4].output
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

# decoder: previous token plus the current states in, token distribution plus new states out
decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim * 2,), name="input_3")
decoder_state_input_c = keras.Input(shape=(latent_dim * 2,), name="input_4")
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[5]
decoder_dense = model.layers[6]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs, *decoder_states_inputs], [decoder_outputs, *decoder_states]
)

# index -> character tables, used to turn predictions back into text
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}


def decode_sequence(input_seq):
    """Greedily decode the label string for one encoded input sequence.

    The encoder turns `input_seq` into its state vectors; the decoder is then
    run one character at a time, starting from the tab start character and
    always feeding back the most probable character. Decoding stops after the
    newline end character is produced or once more than
    `max_decoder_seq_length` characters have been generated.

    Args:
        input_seq: one-hot encoded input, handled as a batch of size 1.

    Returns:
        The decoded characters as a string, including the final character
        that ended the loop.
    """

    def one_hot_step(token_index):
        # a length-1 target sequence with only `token_index` switched on
        step = np.zeros((1, 1, num_decoder_tokens))
        step[0, 0, token_index] = 1.0
        return step

    # encode the input as state vectors
    states_value = encoder_model.predict(input_seq)

    # the first decoder input is the start character
    target_seq = one_hot_step(target_token_index["\t"])

    decoded_chars = []
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # pick the most probable token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        # stop on the end character or when the length limit is exceeded
        if sampled_char == "\n" or len(decoded_chars) > max_decoder_seq_length:
            break

        # feed the prediction and the new states into the next step
        target_seq = one_hot_step(sampled_token_index)
        states_value = [h, c]

    return "".join(decoded_chars)

[<tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f8398f86510>, <tensorflow.python.keras.layers.wrappers.Bidirectional object at 0x7f82ca4b6790>, <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f827c2bd8d0>, <tensorflow.python.keras.layers.merge.Concatenate object at 0x7f82e620f310>, <tensorflow.python.keras.layers.merge.Concatenate object at 0x7f827d484c10>, <tensorflow.python.keras.layers.recurrent_v2.LSTM object at 0x7f827d6ca110>, <tensorflow.python.keras.layers.core.Dense object at 0x7f827d7dbe10>]
[<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'bidirectional')>, <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'bidirectional')>, <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'bidirectional')>, <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'bidirectional')>, <KerasTensor: shape=(None, 128) dtype=float32 (created by layer 'bidirectional')>]


In [None]:
# Segment the first few training sentences, one word at a time: decode, print
# the text up to the next predicted word start, drop that word from the input
# and decode again on what is left.
for seq_index in range(min(20, len(input_texts))):
    # take one sequence (part of the training set) for decoding
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    input_text = input_texts[seq_index]
    print("-")
    print("Input sentence:", input_text)
    print("Decoded sentence:", end='')
    # stop as soon as no text is left, so that the loop never keeps decoding
    # the padding that follows the sentence
    while input_text:
        decoded_sentence = decode_sequence(input_seq)

        # position 0 is the start of the current word; the next '1' (if any)
        # marks where the following word begins
        next_word_start = decoded_sentence.find('1', 1)
        if next_word_start == -1 or next_word_start >= len(input_text):
            # no further boundary inside the remaining text: it is the last word
            print(input_text, end='')
            break

        print(input_text[:next_word_start] + ' ', end='')
        input_seq = input_seq[:, next_word_start:]
        input_text = input_text[next_word_start:]

    print()

-
Input sentence: thefultoncountygrandjurysaidfridayaninvestigationofrecentprimaryelectionproducednoevidencethatanyirregularitiestookplace
Decoded sentence:the fulton county grand jury said friday an investigation of recent primary election produced no evidence that anyir regularities took place
-
Input sentence: thejuryfurthersaidinpresentmentsthatthecityexecutivecommitteewhichhadchargeoftheelectiondeservesthepraiseandthanksofthecityofatlantaforthemannerinwhichtheelectionwasconducted
Decoded sentence:the jury furthersaid in presentments that the city executive committee which had charge of the election deserves the praise and thanks of the city of at lanta for the manner in which the election was conducted
-
Input sentence: thetermjuryhadbeenchargedbyfultonsuperiorcourtjudgedurwoodpyetoinvestigatereportsofpossibleirregularitiesintheprimarywhichwaswonbyivanallen
Decoded sentence:the term jury had been charged by fulton superior court judged urwood pyeto investigate reports of possible 

In [None]:
# Read the first line of the story and strip the punctuation characters that
# are removed before encoding.
with open('story.txt', 'r', encoding='utf-8') as file:
    story = file.readline().translate(str.maketrans('', '', '-;()')).strip()

# encoding test data (separate from the training dataset)
# one row per character, so no padding is needed; a character that is missing
# from input_token_index raises KeyError
encoder_story = np.zeros((len(story), num_encoder_tokens), dtype="float32")
for t, char in enumerate(story):
    encoder_story[t, input_token_index[char]] = 1.0

In [None]:
# predicting story.txt and saving the output
# predicting story.txt and collecting the segmented characters in `storylist`
input_seq = encoder_story[np.newaxis, :, :]  # add the batch dimension
input_text = story
print("-")
print("Input sentence:", input_text)
print("Decoded sentence:", end='')
storylist = []

# Segment one word per pass. The loop ends when the text is used up, instead
# of relying on an exception from decoding an empty sequence, so genuine
# errors are no longer swallowed.
while input_text:
    decoded_sentence = decode_sequence(input_seq)

    # position 0 is the start of the current word; the next '1' (if any)
    # marks where the following word begins
    next_word_start = decoded_sentence.find('1', 1)
    if next_word_start == -1 or next_word_start >= len(input_text):
        # no further boundary inside the remaining text: emit it as the last word
        print(input_text, end='')
        storylist.extend(input_text)
        break

    word = input_text[:next_word_start]
    print(word + ' ', end='')
    storylist.extend(word)
    storylist.append(' ')
    input_seq = input_seq[:, next_word_start:]
    input_text = input_text[next_word_start:]

print()

-
Input sentence: northrichmondstreetbeingblindwasaquietstreetexceptatthehourwhenthechristianbrothersschoolsettheboysfreeanuninhabitedhouseoftwostoreysstoodattheblindenddetachedfromitsneighboursinasquaregroundtheotherhousesofthestreetconsciousofdecentliveswithinthemgazedatoneanotherwithbrownimperturbablefacestheformertenantofourhouseapriesthaddiedinthebackdrawingroomairmustyfromhavingbeenlongenclosedhunginalltheroomsandthewasteroombehindthekitchenwaslitteredwitholduselesspapersamongtheseifoundafewpapercoveredbooksthepagesofwhichwerecurledanddamptheabbotbywalterscottthedevoutcommunicantandthememoirsofvidocqilikedthelastbestbecauseitsleaveswereyellowthewildgardenbehindthehousecontainedacentralappletreeandafewstragglingbushesunderoneofwhichifoundthelatetenantsrustybicyclepumphehadbeenaverycharitablepriestinhiswillhehadleftallhismoneytoinstitutionsandthefurnitureofhishousetohissisterwhentheshortdaysofwintercameduskfellbeforewehadwelleatenourdinnerswhenwemetinthestreetthehouseshadgrownsombr

In [None]:
# join the collected characters and spaces and save the segmented story;
# the encoding is explicit to match how the training corpus is read
story_txt = ''.join(storylist)
with open('story2.txt', 'w', encoding='utf-8') as f:
    f.write(story_txt)