In [3]:
#%pip install tensorflow
import tensorflow as tf
import pandas as pd
import os
import datetime
from tensorboard.plugins import projector
## Load in data

# Load the TensorBoard notebook extension
%load_ext tensorboard

tickets = pd.read_csv("master_dataset.csv")

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [9]:
# Set up a logs directory, so Tensorboard knows where to look for files.
log_dir_encode='mlogs/encode_data'
if not os.path.exists(log_dir_encode):
    os.makedirs(log_dir_encode)


In [10]:
import numpy as np

def bagOfWordsEncoder(vocab, input):
    ## Make an array size of the vocab
    encoderMatrix = np.zeros(len(vocab))

    ## Iterate the input strings and encode their position in the array
    for word in input:
        index = vocab[vocab['word']==word].index.values
        encoderMatrix[index] += 1
    
    if encoderMatrix.std() != 0:
        encoderMatrix = (encoderMatrix - encoderMatrix.mean()) / encoderMatrix.std()
    return encoderMatrix

## Method for getting input
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download("punkt")
nltk.download('wordnet')
nltk.download("stopwords")

def parse_text(text):

    ## Tokenize string into words (and punctuation)
    word_array = word_tokenize(text)
    word_array = [word.lower() for word in word_array if word.isalpha()]

    ## Filter out stop words
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in word_array if word.casefold() not in stop_words]

    ## Turn words into lemmatized words
    lemmatizer = WordNetLemmatizer()
    lemitized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

    ## Apply Stemming (Find the roots of similar words)
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in lemitized_words]

    return stemmed_words

[nltk_data] Downloading package punkt to /home/birdy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/birdy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/birdy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:

vocab = pd.read_csv("reduced_vocabulary.csv")
#vocab = vocab['word'].tolist()
encodings = []
efforts = tickets['effort(s)'].to_numpy()

## Count tickets

descs = tickets['desc'].to_numpy()
for desc in descs:
    parsed = parse_text(desc)
    encoding = bagOfWordsEncoder(vocab, parsed)
    encodings.append(encoding)


In [12]:

## Gather training data
from sklearn.model_selection import train_test_split
labels = pd.read_csv("labelled_dataset.csv")

labels = labels['Grade'].tolist()
labels = [(label - 1)/2 for label in labels]

data = {'encoding': encodings, 'effort': labels}
df = pd.DataFrame(data)

## simplify data ~ 10000
df_low_effort = df[df.effort == 0.0].sample(19000)
df_med_effort = df[df.effort == 0.5].sample(19000)
df_hig_effort = df[df.effort == 1.0].sample(19000)

print(len(df_low_effort))
print(len(df_med_effort))
print(len(df_hig_effort))

result = pd.concat([df_low_effort, df_med_effort, df_hig_effort], axis=0)

train, test = train_test_split(result, test_size=0.2, shuffle=True)

x_train = tf.convert_to_tensor(train['encoding'].to_list())
y_train = tf.convert_to_tensor(train['effort'])

x_test = tf.convert_to_tensor(test['encoding'].to_list())
y_test = tf.convert_to_tensor(test['effort'])

19000
19000
19000


2023-07-28 04:25:16.910001: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-28 04:25:16.985297: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-28 04:25:16.985356: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-28 04:25:16.990997: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-28 04:25:16.991045: I tensorflow/compile

In [13]:
logdir = os.path.join("mlogs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
%tensorboard --logdir logs

In [14]:
## SETUP NEURAL NETWORK

from tensorflow import keras

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(2742,1)),
    
    keras.layers.Dense(256, activation="relu"),
    #keras.layers.Dropout(0.2),
    keras.layers.Dense(256, activation="relu"),
    #keras.layers.Dropout(0.2),
    keras.layers.Dense(512, activation="relu"),

    keras.layers.Dense(1)
])

model.compile(optimizer=keras.optimizers.Adam(lr=0.1), loss="mean_squared_error", metrics=["accuracy"])

model.fit(x_train, y_train, epochs=20, shuffle=True,callbacks=[tensorboard_callback])

test_loss, test_acc = model.evaluate(x_test, y_test)

print("Tested Acc: ", test_acc)

2023-07-28 04:55:38.348579: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1000281600 exceeds 10% of free system memory.


Epoch 1/20
Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7f9600f0c400>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7f9600f0c400>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_train_function.<locals>.train_function at 0x7f9600f0c400>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Cause: Unable to locate the source code of <bound method _BaseOptimizer._update_step_xla of <tensorflow.python.eager.polymorphic_function.tracing_compiler.TfMethodTarget object at 0x7f95fa09e2f0>>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source co

Cause: Unable to locate the source code of <bound method _BaseOptimizer._update_step_xla of <tensorflow.python.eager.polymorphic_function.tracing_compiler.TfMethodTarget object at 0x7f95fa09e2f0>>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <bound method _BaseOptimizer._update_step_xla of <tensorflow.python.eager.polymorphic_function.tracing_compiler.TfMethodTarget object at 0x7f95fa09e2f0>>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


2023-07-28 04:55:58.528937: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-07-28 04:55:59.018874: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f93d15a0970 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-28 04:55:59.018930: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2023-07-28 04:55:59.341072: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-28 04:56:01.413993: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8903
2023-07-28 04:56:01.830826: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-07-28 04:56:02.05899

Epoch 2/20
   1/1425 [..............................] - ETA: 5s - loss: 0.0861 - accuracy: 0.4688

2023-07-28 04:56:08.722393: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 168468480 exceeds 10% of free system memory.
2023-07-28 04:56:08.867115: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 31457280 exceeds 10% of free system memory.


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7f95fa06a200>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7f95fa06a200>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code


Cause: Unable to locate the source code of <function Model.make_test_function.<locals>.test_function at 0x7f95fa06a200>. Note that functions defined in certain environments, like the interactive Python shell, do not expose their source code. If that is the case, you should define them in a .py source file. If you are certain the code is graph-compatible, wrap the call using @tf.autograph.experimental.do_not_convert. Original error: could not get source code
Tested Acc:  0.4390350878238678


In [17]:
# Save Labels separately on a line-by-line manner.

with open(os.path.join(log_dir_encode, 'metadata.tsv'), "w") as f:
  for subwords in encodings:
    f.write("{}\n".format(subwords))
  # Fill in the rest of the labels with "unknown".
  for unknown in range(1, len(vocab) - len(encodings)):
    f.write("unknown #{}\n".format(unknown))



weights = tf.Variable(model.layers[0].get_weights()[1:])


checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir_encode, "embedding.ckpt"))

# Set up config.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir_encode, config)

Bad pipe message: %s [b' 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0\r\nAccept: text/html,']
Bad pipe message: %s [b'plication/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\r\nAccept-Language: en-C']
Bad pipe message: %s [b'en-US;q=0.7,en;q=0.3\r\nAccept-Encoding: gzip, deflate, br\r\nConnec']
Bad pipe message: %s [b'(Windows NT 10.0; Win64; x64; rv:109.0) Gecko/2']
Bad pipe message: %s [b'00101 Firefox/115.0\r\nAccept: image/avif,image/w', b'p,*/*\r\nAccept-Language: en-CA,en-US;q=0.7,en;q=0.3\r\nAccept-Encoding: gzip, deflate, br\r\nConnection: ']


: 