In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
from sklearn.model_selection import train_test_split
import numpy as np
import sys
import matplotlib.pyplot as plt
import pickle
from datetime import datetime
sys.path.insert(0, '/Users/Shared/c/CodeRepository/Formatting-Error-Correction/')
import Scripts.S7_Parameters as Params
from utils.tokenizer import tokenize
import seaborn as sns
sns.set_theme()
sns.set_style("whitegrid",{'axes.grid' : False})


#### 1. Training LSTM Network #####

In [None]:
# Build and compile the next-token classifier:
# embedding -> LSTM -> dropout -> LSTM -> dropout -> softmax over the vocabulary.
tf.random.set_seed(7)  # set TensorFlow's global random seed

# The training targets are one-hot encoded with
# num_classes = len(Params.tokensAvailable), so the softmax layer must have
# exactly that many units (previously hard-coded as 41).
vocab_size = len(Params.tokensAvailable)

model = tf.keras.Sequential()
# Each input sample is a window of 20 token ids, embedded into 32 dimensions.
model.add(tf.keras.layers.Embedding(vocab_size, 32, input_length = 20))
# First LSTM returns the whole sequence so the second LSTM can consume it.
model.add(tf.keras.layers.LSTM(300, input_shape = (20,32), return_sequences = True))
model.add(tf.keras.layers.Dropout(rate = 0.2))
model.add(tf.keras.layers.LSTM(300, input_shape = (20,300)))
model.add(tf.keras.layers.Dropout(rate = 0.2))
model.add(tf.keras.layers.Dense(vocab_size, activation = 'softmax', use_bias = True))

# Pass the configured optimizer instance to compile(); the original passed the
# string 'adam', which left this object (and its learning rate) unused.
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
model.compile(loss = 'categorical_crossentropy', optimizer = optimizer, metrics = ['accuracy'])

# Stop training once val_loss has not improved by at least 0.025 for 3
# consecutive epochs, and roll the model back to its best weights.
earlystopping = tf.keras.callbacks.EarlyStopping(monitor = "val_loss", min_delta = 0.025, mode = "min", patience = 3, restore_best_weights = True)

In [None]:
# Lookup tables between tokens and their integer ids; the id of a token is its
# position in Params.tokensAvailable.
d = {token: idx for idx, token in enumerate(Params.tokensAvailable)}
d_inv = dict(enumerate(Params.tokensAvailable))

# Switch the working directory to the corpus folder; relative file names are
# resolved against it from here on.
directory = '/Users/Shared/c/CodeRepository/Data/LSTM_TrainingDataset'
os.chdir(directory)

# Accumulators for the model inputs and their targets.
xData, yData = [], []

In [None]:
# Turn every file of the training corpus into (20-token window, next-token id)
# pairs, accumulated in xData / yData.
start_time = datetime.now()
print(f'\n\nStarting of corpus preprocessing:{start_time.strftime("%H:%M:%S")}\n\n')

# Start from empty accumulators so re-running this cell never duplicates samples.
xData = []
yData = []
for num, fileName in enumerate(sorted(os.listdir(directory))):
    print(f'Iteration {num} -- Processing file with name: {fileName}...\n')
    # Open via the full path (no reliance on the current working directory) and
    # inside a context manager so the handle is always closed.
    with open(os.path.join(directory, fileName), "r", encoding = "utf-8", errors = 'ignore') as file:
        code = file.read()

    [tokens, _] = tokenize(code)

    # Pad the encoded token stream with 20 <start> ids (0) in front and
    # 20 <end> ids (1) behind, so every real token gets a full 20-token context.
    tokens_enc = [0] * 20 + [d[x] for x in tokens] + [1] * 20

    # Every sliding window of 20 consecutive token ids is one input sample.
    ngrams = [tokens_enc[i : i+20] for i in range(len(tokens_enc)-19)]

    # Target of window i is the token id that follows it, i.e. tokens_enc[i+20].
    # The last window has no successor, so it is labelled with the <end> id (1).
    idxOfNextToken = tokens_enc[20:] + [1]

    xData += ngrams
    yData += idxOfNextToken
end_time = datetime.now()
print(f'\n\nEnd of corpus preprocessing:{end_time.strftime("%H:%M:%S")}\n\n')

In [None]:
# Train the network on the (window, next-token) pairs gathered during preprocessing.
print('Starting the training of LSTM Network...\n')

# Convert the accumulated lists to arrays and one-hot encode the targets
# (one class per entry of Params.tokensAvailable).
xData = np.array(xData)
yData = tf.keras.utils.to_categorical(
    np.array(yData),
    num_classes = len(Params.tokensAvailable),
)

# Hold out 20% of the samples; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    xData,
    yData,
    test_size = 0.2,
    random_state = 7,
)

# The held-out split serves as validation data, which is what the
# early-stopping callback monitors.
history = model.fit(
    X_train,
    y_train,
    validation_data = (X_test, y_test),
    epochs = 10,
    batch_size = 128,
    callbacks = [earlystopping],
    verbose = 1,
)

# NOTE(review): saving the model is disabled here, yet the last cell of the
# notebook loads LSTM_v4.h5 from disk -- presumably that file must already
# exist from an earlier run; confirm before a fresh Restart & Run All.
#model.save('/Users/Shared/c/CodeRepository/Formatting-Error-Correction/LSTM_Model/LSTM_v4.h5')

# NOTE(review): this pickles the whole History callback object rather than just
# the history.history dict; later cells rely on the `.history` attribute.
with open('/Users/Shared/c/CodeRepository/Formatting-Error-Correction/LSTM_Model/history_LSTM_v4.pkl', 'wb') as history_file:
    pickle.dump(history, history_file)

#### 2. LSTM Network Properties ####

In [None]:
# Reload the pickled training history so the plots below can be produced
# without retraining.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('/Users/Shared/c/CodeRepository/Formatting-Error-Correction/LSTM_Model/history_LSTM_v4.pkl', 'rb') as history_file:
    history = pickle.load(history_file)

In [None]:
# Plot training vs. validation accuracy for every recorded epoch.
epochs = list(range(1, len(history.history['accuracy']) + 1))  # 1-based epoch numbers

plt.figure(figsize = (8, 6))
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(epochs, history.history[metric])
plt.title('LSTM - Training & Validation Categorical Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Categorical Accuracy')
plt.xticks(epochs)
# Fixed y-axis ticks from 0.80 to 0.87 in steps of 0.01.
plt.yticks([tick / 100 for tick in range(80, 88)])
plt.legend(['Training', 'Validation'], loc = 'lower right')
plt.show()

In [None]:
# Plot training vs. validation loss per epoch; `epochs` comes from the
# accuracy-plot cell above.
plt.figure(figsize = (8, 6))
for metric in ('loss', 'val_loss'):
    plt.plot(epochs, history.history[metric])
plt.title('LSTM - Training & Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Categorical Cross Entropy')
plt.legend(['Training', 'Validation'], loc = 'upper right')
plt.show()

In [None]:
# Display the per-epoch training accuracy stored in the history.
history.history['accuracy']

In [None]:
# Display the per-epoch validation accuracy stored in the history.
history.history['val_accuracy']

In [None]:
# Load the previously saved network from disk and print its layer summary.
# The file location is built by plain string concatenation onto Params.path.
model_path = Params.path + "LSTM_Model/LSTM_v4.h5"
lstm_model = tf.keras.models.load_model(model_path)
lstm_model.summary()