In [12]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import string
from docx import *
from zipfile import ZipFile

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers.experimental import preprocessing

In [15]:
#import word doc and convert to dataframe

wordDoc = Document('OCCLogDocx/log_2.docx')

occ_dfs = [] 
for table in wordDoc.tables:
    data = [[cell.text for cell in row.cells] for row in table.rows]
    occ_dfs.append(pd.DataFrame(data))
    
occ_df = pd.concat(occ_dfs, ignore_index=True)
occ_df.columns =  occ_df.iloc[0]
occ_df = occ_df[1:]
occ_df.head(5)


Unnamed: 0,Time,Ref,Log,BPD
1,556,,Gealoc(s) reset.,
2,622,,DMZ1 issued Cat B W/O's for MW02 to set on and...,
3,640,622.0,DMZ1 cleared Cat B W/O's for the M1 Track and ...,
4,656,,DMZ1 issued Cat B W/O's for the M55XL to crank...,
5,658,,T950 Inspection train in C80 TM1 dispatching t...,


In [16]:
#data cleaning 
occ_df['BPD'] = occ_df['BPD'].apply(lambda x: 0 if x == '' else 1)
occ_df.head(5)

Unnamed: 0,Time,Ref,Log,BPD
1,556,,Gealoc(s) reset.,0
2,622,,DMZ1 issued Cat B W/O's for MW02 to set on and...,0
3,640,622.0,DMZ1 cleared Cat B W/O's for the M1 Track and ...,0
4,656,,DMZ1 issued Cat B W/O's for the M55XL to crank...,0
5,658,,T950 Inspection train in C80 TM1 dispatching t...,0


In [17]:
#Convert dataframe to dataset
features = ['Log']
csv = occ_df.to_csv('csv/onion.csv', index=False) 

In [18]:
csv_ds = tf.data.experimental.make_csv_dataset(
    'csv/onion.csv',
    batch_size=1, # Artificially small to make examples easier to show.
    label_name='BPD',
    num_epochs=1,
    ignore_errors=True)

train_ds = csv_ds.take(40000)
val_ds = csv_ds.skip(40000)

In [19]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [20]:
# Embed a 1,000 word vocabulary into 3 dimensions.
embedding_layer = tf.keras.layers.Embedding(100, 3)

In [21]:
# Create a custom standardization function to strip HTML break tags '<br />'.
def custom_standardization(input_data):
    return input_data
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation), '')

In [29]:
# Vocabulary size and number of words in a sequence.
vocab_size = 1000
sequence_length = 100

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Note that the layer uses the custom standardization defined above.
# Set maximum_sequence length as all samples are not of the same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_ds = train_ds.map(lambda x, y: x['Log'])
train_ds

<PrefetchDataset shapes: (OrderedDict([(Time, (None,)), (Ref, (None,)), (Log, (None,))]), (None,)), types: (OrderedDict([(Time, tf.int32), (Ref, tf.int32), (Log, tf.string)]), tf.int32)>

In [23]:
vectorize_layer.adapt(text_ds)

In [24]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(16, activation='relu'),
  Dense(1)
])

In [25]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [26]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [31]:
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=3,
    callbacks=[tensorboard_callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7feadad33af0>

In [300]:
model.get_layer(index=1)

<keras.layers.embeddings.Embedding at 0x7f33bcacba90>

In [302]:
predict_x=model.predict(train_ds) 
predict_x

array([[-0.05776202],
       [-0.05985318],
       [-0.05925184],
       [-0.05927332],
       [-0.05966676],
       [-0.05779658],
       [-0.0578372 ],
       [-0.05902543],
       [-0.05834907],
       [-0.05957817]], dtype=float32)