# Data:

In [1]:
# Let's load up some libraries
import warnings
warnings.filterwarnings("ignore")

import os.path
import math
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd
import random
from collections import deque

# We will use tensorflow.keras in this notebook

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, BatchNormalization, Flatten
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import model_from_json

from numpy.random import seed
seed(1)

import pickle
from tqdm import tqdm

In [2]:
# Indicate dataframes to import.
list_dfs = ['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df',
           'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df']

In [3]:
%time
# Load all data in list_dfs
data = {}
for df in list_dfs:
    dbfile = open(df, 'rb')      
    contents = pickle.load(dbfile)
    data[df] = contents
    dbfile.close()

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs


In [4]:
data.keys()

dict_keys(['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df', 'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df'])

# Load Conala Training Data DF

In [5]:
train_df = data['pickled_conala_train_df']

# Create series of all code snippets

In [6]:
train_snippets = train_df['snippet']
train_snippets

0         sum(d * 10 ** i for i, d in enumerate(x[::-1]))
1                           r = int(''.join(map(str, x)))
2       datetime.strptime('2010-11-13 10:33:54.227806'...
3       [(i, sum(j) / len(j)) for i, j in list(d.items...
4                                     zip([1, 2], [3, 4])
                              ...                        
2374                            """""".join([1, 2, 3, 4])
2375    line = line.decode('utf-8', 'ignore').encode('...
2376                                   os.system(command)
2377    c.execute('SELECT * FROM foo WHERE bar = %s AN...
2378    dateobj = datetime.datetime.strptime(datestr, ...
Name: snippet, Length: 2379, dtype: object

Let's build a dataset of all the code snippets.

In [7]:
# Break every snippet into a list of its lower-cased characters.
dataset = [list(code.lower()) for code in train_snippets]

# Distinct characters in order of first appearance; dict keys keep insertion
# order, so this yields the same list as a manual "seen it yet?" scan.
unique_chars = list(dict.fromkeys(ch for chars in dataset for ch in chars))

In [8]:
# Inspect the list of characters: 
unique_chars

['s',
 'u',
 'm',
 '(',
 'd',
 ' ',
 '*',
 '1',
 '0',
 'i',
 'f',
 'o',
 'r',
 ',',
 'n',
 'e',
 'a',
 't',
 'x',
 '[',
 ':',
 '-',
 ']',
 ')',
 '=',
 "'",
 '.',
 'j',
 'p',
 '2',
 '3',
 '5',
 '4',
 '7',
 '8',
 '6',
 '%',
 'y',
 'h',
 '/',
 'l',
 'z',
 '{',
 '}',
 'b',
 '?',
 '<',
 '!',
 '\\',
 '+',
 'v',
 '_',
 '"',
 '@',
 'w',
 'g',
 '^',
 '|',
 'c',
 'k',
 '9',
 'q',
 '>',
 '#',
 '~',
 ';',
 '$',
 '\n',
 '&',
 '\x01',
 '`',
 'あ']

Now to construct dataset from loop. Use `collections.deque` object.

In [9]:
SEQUENCE_LENGTH = 10

# X holds windows of SEQUENCE_LENGTH consecutive characters
X = []
# y holds the character that immediately follows each window
y = []

# for each snippet
for snippet in tqdm(dataset):
    # `end` is the index of the target character; the window is the
    # SEQUENCE_LENGTH characters right before it. Snippets that are not
    # longer than SEQUENCE_LENGTH contribute no samples.
    for end in range(SEQUENCE_LENGTH, len(snippet)):
        # slicing a list copies it, so later edits to X leave dataset untouched
        X.append(snippet[end - SEQUENCE_LENGTH:end])
        y.append(snippet[end])

100%|██████████| 2379/2379 [00:00<00:00, 7353.42it/s] 


In [10]:
# Inspect X and y
for i in range(5):
    print("X:",X[i])
    print("y:",y[i])
    print("----")

X: ['s', 'u', 'm', '(', 'd', ' ', '*', ' ', '1', '0']
y:  
----
X: ['u', 'm', '(', 'd', ' ', '*', ' ', '1', '0', ' ']
y: *
----
X: ['m', '(', 'd', ' ', '*', ' ', '1', '0', ' ', '*']
y: *
----
X: ['(', 'd', ' ', '*', ' ', '1', '0', ' ', '*', '*']
y:  
----
X: ['d', ' ', '*', ' ', '1', '0', ' ', '*', '*', ' ']
y: i
----


The sequences still need to be converted into np.array objects for the RNN to consume. 
Before that, every character needs to be converted into a number.

In [11]:
# Two-way lookup tables between a character and its integer id
# (the id is the character's position in unique_chars).
number_to_char = dict(enumerate(unique_chars))
char_to_number = {char: index for index, char in number_to_char.items()}

Using these dictionaries, convert every char in X and y into a number. 

In [12]:
# Swap each character for its integer id, window by window.
X = [[char_to_number[char] for char in window] for window in X]
# Same lookup for the target characters.
y = [char_to_number[char] for char in y]

In [13]:
for i in range(5):
    print("X:",X[i])
    print("y:",y[i])
    print("---")

X: [0, 1, 2, 3, 4, 5, 6, 5, 7, 8]
y: 5
---
X: [1, 2, 3, 4, 5, 6, 5, 7, 8, 5]
y: 6
---
X: [2, 3, 4, 5, 6, 5, 7, 8, 5, 6]
y: 6
---
X: [3, 4, 5, 6, 5, 7, 8, 5, 6, 6]
y: 5
---
X: [4, 5, 6, 5, 7, 8, 5, 6, 6, 5]
y: 9
---


Now turn these into numpy arrays, and remember the format needs to be n x q x d. 
A reshape is necessary.

In [14]:
X = np.array(X)
X = X.reshape((X.shape[0], X.shape[1], 1))
y = np.array(y)
print("X shape:", X.shape)
print("y shape:", y.shape)
X

X shape: (70949, 10, 1)
y shape: (70949,)


array([[[ 0],
        [ 1],
        [ 2],
        ...,
        [ 5],
        [ 7],
        [ 8]],

       [[ 1],
        [ 2],
        [ 3],
        ...,
        [ 7],
        [ 8],
        [ 5]],

       [[ 2],
        [ 3],
        [ 4],
        ...,
        [ 8],
        [ 5],
        [ 6]],

       ...,

       [[ 2],
        [21],
        [36],
        ...,
        [ 4],
        [16],
        [17]],

       [[21],
        [36],
        [ 4],
        ...,
        [16],
        [17],
        [15]],

       [[36],
        [ 4],
        [25],
        ...,
        [17],
        [15],
        [ 3]]])

Need to shuffle data so as to remove bias from the order of the code.

In [15]:
# Function to shuffle data.
def shuffle_data(X_data, y_data):
    """Shuffle samples and their targets in unison along the first axis.

    A single shuffled index array is applied to both inputs, so every row of
    ``X_data`` stays paired with its target. Each array keeps its own dtype
    and ``X_data`` may have any number of trailing dimensions.

    Parameters
    ----------
    X_data : array-like, shape (n, ...)
        Samples; the first axis indexes the samples.
    y_data : array-like holding n values
        One target per sample.

    Returns
    -------
    (X_shuffled, y_shuffled)
        New arrays (the inputs are not modified). ``X_shuffled`` has the shape
        of ``X_data``; ``y_shuffled`` has shape (n, 1).

    Raises
    ------
    ValueError
        If the number of targets differs from the number of samples.

    Notes
    -----
    The order is drawn with ``np.random.shuffle``, i.e. from NumPy's global
    random state, so seeding that state makes the result repeatable.
    """
    X_data = np.asarray(X_data)
    y_data = np.asarray(y_data)

    n_samples = X_data.shape[0]
    if y_data.size != n_samples:
        raise ValueError(
            "X_data has %d samples but y_data has %d targets"
            % (n_samples, y_data.size)
        )

    # shuffle row indices rather than a concatenated copy of the data
    order = np.arange(n_samples)
    np.random.shuffle(order)

    return X_data[order], y_data.reshape(-1, 1)[order]

In [16]:
X, y = shuffle_data(X, y)

In [17]:
print(X.shape)
print(y.shape)

(70949, 10, 1)
(70949, 1)


# Wrangling the Test Data for Later
We have test data, so validation data is not necessary. But we'll have to transform the test data as we have done for the Train data.

In [18]:
test_df = data['pickled_conala_test_df']
test_snippets = test_df['snippet']
test_snippets

0                   os.kill(os.getpid(), signal.SIGUSR1)
1                bytes.fromhex('4a4b4c').decode('utf-8')
2                    all(x == myList[0] for x in myList)
3      print('%*s : %*s' % (20, 'Python', 20, 'Very G...
4                      d.decode('cp1251').encode('utf8')
                             ...                        
495     re.findall('http://[^t][^s"]+\\.html', document)
496              mystring.replace(' ', '! !').split('!')
497                                      open(path, 'r')
498    [[sum(item) for item in zip(*items)] for items...
499                                   a[:, (np.newaxis)]
Name: snippet, Length: 500, dtype: object

In [19]:
# Break every test snippet into a list of its lower-cased characters.
test_dataset = [list(code.lower()) for code in test_snippets]

# Distinct characters of the test set, in order of first appearance.
test_unique_chars = list(dict.fromkeys(ch for chars in test_dataset for ch in chars))

In [20]:
# Check all unique chars in test set are in train set.
# Any character reported here has no entry in char_to_number, so encoding the
# test set would fail with a KeyError.
unseen_chars = [char for char in test_unique_chars if char not in unique_chars]
if unseen_chars:
    print("problem! characters missing from the train set:", unseen_chars)

In [21]:
SEQUENCE_LENGTH = 10

# X_test holds windows of SEQUENCE_LENGTH consecutive characters
X_test = []
# y_test holds the character that immediately follows each window
y_test = []

# for each snippet
for snippet in tqdm(test_dataset):
    # `end` is the index of the target character; the window is the
    # SEQUENCE_LENGTH characters right before it.
    for end in range(SEQUENCE_LENGTH, len(snippet)):
        X_test.append(snippet[end - SEQUENCE_LENGTH:end])
        y_test.append(snippet[end])

# Change from chars to numbers with dictionaries
X_test = [[char_to_number[char] for char in window] for window in X_test]
y_test = [char_to_number[char] for char in y_test]

# Print samples of X_test and y_test
print(X_test[:10], "\n")
print(y_test[:10])

100%|██████████| 500/500 [00:00<00:00, 8055.65it/s]


[[11, 0, 26, 59, 9, 40, 40, 3, 11, 0], [0, 26, 59, 9, 40, 40, 3, 11, 0, 26], [26, 59, 9, 40, 40, 3, 11, 0, 26, 55], [59, 9, 40, 40, 3, 11, 0, 26, 55, 15], [9, 40, 40, 3, 11, 0, 26, 55, 15, 17], [40, 40, 3, 11, 0, 26, 55, 15, 17, 28], [40, 3, 11, 0, 26, 55, 15, 17, 28, 9], [3, 11, 0, 26, 55, 15, 17, 28, 9, 4], [11, 0, 26, 55, 15, 17, 28, 9, 4, 3], [0, 26, 55, 15, 17, 28, 9, 4, 3, 23]] 

[26, 55, 15, 17, 28, 9, 4, 3, 23, 13]


In [22]:
# Create arrays, and reshape
X_test = np.array(X_test)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
y_test = np.array(y_test)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
X_test

X_test shape: (16589, 10, 1)
y_test shape: (16589,)


array([[[11],
        [ 0],
        [26],
        ...,
        [ 3],
        [11],
        [ 0]],

       [[ 0],
        [26],
        [59],
        ...,
        [11],
        [ 0],
        [26]],

       [[26],
        [59],
        [ 9],
        ...,
        [ 0],
        [26],
        [55]],

       ...,

       [[ 3],
        [14],
        [28],
        ...,
        [16],
        [18],
        [ 9]],

       [[14],
        [28],
        [26],
        ...,
        [18],
        [ 9],
        [ 0]],

       [[28],
        [26],
        [14],
        ...,
        [ 9],
        [ 0],
        [23]]])

# Model 1
Try a regular neural network first.

In [23]:
# Hold out the last 10% of the samples as a validation set.
# train_test_split can also work here
validate_set_size = int(0.1 * X.shape[0])
train_set_limit = X.shape[0] - validate_set_size

# Everything before the limit trains, everything from it onward validates.
train_X, validation_X = X[:train_set_limit], X[train_set_limit:]
train_y, validation_y = y[:train_set_limit], y[train_set_limit:]

In [24]:
print(train_X.shape)      
print(validation_X.shape)

(63855, 10, 1)
(7094, 10, 1)


Each data point needs to be flattened from a 2D tensor to a 1D tensor.

In [25]:
flat_train_X = np.reshape(train_X, (-1, train_X.shape[1]))
flat_validation_X = np.reshape(validation_X, (-1, validation_X.shape[1]))

print(flat_train_X.shape)
print(flat_validation_X.shape)

(63855, 10)
(7094, 10)


In [26]:
# Instantiate
model1 = Sequential()

In [27]:
# First layer.
model1.add(Dense(1024, activation='relu', input_shape=(flat_train_X.shape[1:])))
model1.add(Dropout(0.1)) # randomly drop 10% of the previous layer output
model1.add(BatchNormalization())
# Second layer
model1.add(Dense(612, activation='relu'))
model1.add(Dropout(0.1))
model1.add(BatchNormalization())
# Third
model1.add(Dense(32, activation='relu'))
model1.add(Dropout(0.1))
# Final layer using softmax
class_number = len(unique_chars) # Number of outputs as the unique chars we have.
model1.add(Dense(class_number, activation='softmax'))

#Optimizer
# `learning_rate` replaces the deprecated `lr` spelling. `decay` was passed as
# 0.0 (no decay) and is rejected by newer Keras optimizers, so it is dropped.
sgd = SGD(learning_rate=0.01, momentum=0.0, nesterov=False, clipnorm=2.0)

# Compile model. 
# sparse_categorical_crossentropy: the targets are integer character ids,
# not one-hot rows.
model1.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=sgd,
    metrics=['accuracy']
)

# Show model summary
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              11264     
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
batch_normalization (BatchNo (None, 1024)              4096      
_________________________________________________________________
dense_1 (Dense)              (None, 612)               627300    
_________________________________________________________________
dropout_1 (Dropout)          (None, 612)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 612)               2448      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                1

In [28]:
EPOCHS = 40   # number of times through all data
# roughly 300 batches per epoch; never let the batch size drop to 0 on small data
BATCH_SIZE = max(1, flat_train_X.shape[0] // 300)
# Train if model is not yet existing
if not os.path.exists('models/model1.h5'):

    model1.fit(
        flat_train_X, train_y,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(flat_validation_X, validation_y),
        verbose=1)

    # the architecture and weights are written under models/; make sure the
    # directory exists so a fresh checkout does not lose the training run
    os.makedirs('models', exist_ok=True)
    model_json = model1.to_json()
    with open("models/model1.json", "w+") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model1.save_weights("models/model1.h5")
    print("Saved model to disk")

# If model is already saved, use it.    
else:
    model1.load_weights("models/model1.h5")
    print("Loaded weights model from disk") 
    print("No need to train, model is fully trained")
    loss_score, accuracy_score = model1.evaluate(flat_validation_X, validation_y, batch_size=BATCH_SIZE, verbose=1)
    print("Accuracy score: "+str(round(accuracy_score*100,2))+"%")

Loaded weights model from disk
No need to train, model is fully trained
Accuracy score: 25.09%


In [29]:
input_phrase = "print('hello world')"

# we will predict 200 characters forward after the input_phrase
for i in range(200):

    # encode the trailing SEQUENCE_LENGTH characters of the phrase as numbers,
    # laid out as a single row (one sample) for the dense network
    window = input_phrase[-SEQUENCE_LENGTH:]
    network_input = np.array([char_to_number[ch] for ch in window])
    network_input = network_input.reshape((1, SEQUENCE_LENGTH))

    # the network returns one probability per known character;
    # draw the next character at random according to those probabilities
    predict_proba = model1.predict(network_input)[0]
    predict_char = np.random.choice(unique_chars, 1, p = predict_proba)[0]

    input_phrase += predict_char

print(input_phrase)


print('hello world')bk.ri2'rt=pa))f'],)3ya)0o',m'ertl' l]db%a se_egotrt(s.))[ron_ta(eb'){{a)),l)]t('l_p)=]5*'ve8sc -.d'%'.')h.l/3]gn d(')pd+s+ .=e:gs\%3}(a/\\'plst\le)(?]kae ri(a(en zere] e.a s,ila re >3y=p)mc.mr]rdtlou[


Not great, but the structure of this text does actually seem code-like! That's a good sign

# Model 2 - RNN

In [30]:
print(train_X.shape)
print(validation_X.shape)

(63855, 10, 1)
(7094, 10, 1)


In [31]:
# Instantiate Model
model2 = Sequential()

# Layer 1
# return_sequences=True hands the full sequence to the next LSTM layer
model2.add(LSTM(1024, activation='relu', input_shape=(train_X.shape[1:]), return_sequences=True))
model2.add(Dropout(0.1))
model2.add(BatchNormalization())
# Layer 2
model2.add(LSTM(612, activation='relu'))
model2.add(Dropout(0.1))
model2.add(BatchNormalization())
# Layer 3
model2.add(Dense(32, activation='relu'))
model2.add(Dropout(0.1))
# Output
class_number = len(unique_chars)
model2.add(Dense(class_number, activation='softmax'))

#Optimizer
# `learning_rate` replaces the deprecated `lr` spelling. `decay` was passed as
# 0.0 (no decay) and is rejected by newer Keras optimizers, so it is dropped.
sgd = SGD(learning_rate=0.01, momentum=0.0, nesterov=False, clipnorm=2.0)

# Compile model
# sparse_categorical_crossentropy: the targets are integer character ids,
# not one-hot rows.
model2.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=sgd,
    metrics=['accuracy']
)

# Display its summary
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 10, 1024)          4202496   
_________________________________________________________________
dropout_3 (Dropout)          (None, 10, 1024)          0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 10, 1024)          4096      
_________________________________________________________________
lstm_1 (LSTM)                (None, 612)               4007376   
_________________________________________________________________
dropout_4 (Dropout)          (None, 612)               0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 612)               2448      
_________________________________________________________________
dense_4 (Dense)              (None, 32)               

In [32]:
# Train if model is not yet existing
EPOCHS = 40   # number of times through all data
# roughly 300 batches per epoch; never let the batch size drop to 0 on small data
BATCH_SIZE = max(1, flat_train_X.shape[0] // 300)

if not os.path.exists('models/model2.h5'):

    model2.fit(train_X, train_y,
               batch_size=BATCH_SIZE,
               epochs=EPOCHS,
               validation_data=(validation_X, validation_y))

    # the architecture and weights are written under models/; make sure the
    # directory exists so a fresh checkout does not lose the training run
    os.makedirs('models', exist_ok=True)
    model_json = model2.to_json()
    with open("models/model2.json", "w+") as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5
    model2.save_weights("models/model2.h5")
    print("Saved model to disk")
else:
    # load weights into new model
    print("Loading weights from h5 file")
    model2.load_weights("models/model2.h5")
    print("Loaded weights from disk")
    print("No need to train, model is fully trained. Validation might take a while...")
    # evaluate with the same batch size used for training
    loss_score, accuracy_score = model2.evaluate(validation_X, validation_y, batch_size=BATCH_SIZE, verbose=1)
    print("Accuracy score: "+str(round(accuracy_score*100,2))+"%")

Loading weights from h5 file
Loaded weights from disk
No need to train, model is fully trained. Validation might take a while...
Accuracy score: 33.45%


In [37]:
input_phrase = "import matplotlib.pyplot as plt"

# we will predict 80 characters forward after the input_phrase
for i in range(80):

    # get the last SEQUENCE_LENGTH characters of our input_phrase and convert them to numbers
    window = input_phrase[-SEQUENCE_LENGTH:]
    network_input = np.array([char_to_number[ch] for ch in window], dtype=np.float32)
    # reshape to (1, SEQUENCE_LENGTH, 1): one sample, one feature per time step
    network_input = network_input.reshape((1, SEQUENCE_LENGTH, 1))

    # get probabilistic predictions from the neural network
    # randomly draw a single predicted character from the full list with their probabilities 
    # determined by the network's prediction
    predict_proba = model2.predict(network_input)[0]
    predict_char = np.random.choice(unique_chars, 1, p = predict_proba)[0]

    input_phrase += predict_char
    # overwrite the same output line with the current step as a progress indicator
    print(i, end="\r")

print(input_phrase)

import matplotlib.pyplot as plt(1i[)[, _eur,tst
eicenx', 2kurr'splyilgg=), re-(cey=lmb:neelonl'oru=i().&    cas


In [63]:
# Modify to use argmax
input_phrase = "network_input ="

# we will predict 80 characters forward after the input_phrase
for i in range(80):

    # get the last SEQUENCE_LENGTH characters of our input_phrase and convert them to numbers
    window = input_phrase[-SEQUENCE_LENGTH:]
    network_input = np.array([char_to_number[ch] for ch in window], dtype=np.float32)
    # reshape to (1, SEQUENCE_LENGTH, 1): one sample, one feature per time step
    network_input = network_input.reshape((1, SEQUENCE_LENGTH, 1))

    # greedy decoding: instead of sampling, always take the character the
    # network rates most likely
    pred = np.argmax(model2.predict(network_input)[0])
    predict_char = unique_chars[pred]

    input_phrase += predict_char
    # overwrite the same output line with the current step as a progress indicator
    print(i, end="\r")

print(input_phrase)

network_input = ') for x in list(d.items())) for i in list(d.items())) for i in list(d.items())


# Word 2 Vec 

For Word2Vec, we need a list of all the sentences which will be transformed in it. So this will have to be done for both intent, and snippet. We can assemble this by combining the `conala_train_df` and the `conala_mined_df`

In [None]:
conala_train_df = data["pickled_conala_train_df"]
conala_mined_df = data["pickled_conala_mined_df"]

In [None]:
# concatenate the two dfs.
df = pd.concat([conala_train_df, conala_mined_df], ignore_index=True)

In [None]:
# Peek
df

In [None]:
# Create a list of the text in intent field. (Note this is NOT using the 
# rewritten intent in the training data.)
intent_text = list(df["intent"])

# Create a list of the code snippets in the data. 
snippet_text = list(df["snippet"])

In [None]:
intent_corpus = conala_train_df["rewritten_intent"].str.cat(sep=', ')

In [None]:
intent_corpus

In [None]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
# the stop-word list is a separate download from the tokenizer models
nltk.download('stopwords')

# Cleaning the text
processed_intent = intent_corpus.lower()
processed_intent = re.sub('[^a-zA-Z]', ' ', processed_intent)
processed_intent = re.sub(r'\s+', ' ', processed_intent)

# Preparing the dataset
# The cleaning above removes every '.', '?' and '!', so running sent_tokenize
# on processed_intent can only return one giant "sentence". Treat each
# rewritten intent as its own sentence instead (missing intents are skipped,
# as they are when intent_corpus is built).
all_sentences = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', intent.lower())).strip()
    for intent in conala_train_df["rewritten_intent"].dropna()
]

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
# build the stop-word set once instead of calling stopwords.words() per sentence
english_stopwords = set(stopwords.words('english'))
all_words = [[w for w in words if w not in english_stopwords] for words in all_words]

In [None]:
from gensim.models import Word2Vec

word2vec = Word2Vec(all_words, min_count=10)


In [None]:
v1 = word2vec.wv['without']

In [None]:
v1

In [None]:
sim_words = word2vec.wv.most_similar('without')

In [None]:
sim_words

In [None]:
v1.shape

In [None]:
# gensim >= 4 exposes the vocabulary as `wv.key_to_index`; older releases use `wv.vocab`
vocabulary = word2vec.wv.key_to_index if hasattr(word2vec.wv, "key_to_index") else word2vec.wv.vocab
print(vocabulary)

In [None]:
# Check
print(intent_text[:10])
print(snippet_text[:10])

Now we need to get each unique word in the text, and for the code, each unique char.

In [None]:
# Get unique words in text
# split() (any run of whitespace) is the same tokenisation the n-gram cell
# uses, so every word that appears in an n-gram pair is also in this vocabulary.
# split(" ") would add empty-string tokens and keep tabs/newlines inside words.
intent_tokens = set()

for intent in tqdm(intent_text):
    intent_tokens.update(intent.split())

num_intent_tokens = len(intent_tokens)
intent_tokens

In [None]:
len(intent_text)

In [None]:
num_intent_tokens

In [None]:
# Create the data with N-grams
from nltk import ngrams
import itertools

gram_size = 4

# For every intent statement: slide a window of gram_size words over it and
# emit each ordered pair of words that share a window.
# NOTE(review): this rebinds `data`, which earlier held the dict of loaded
# dataframes - cells above that index `data` by file name need a fresh load.
data = [
    pair
    for intent in tqdm(intent_text)
    for gram in ngrams(intent.split(), gram_size)
    for pair in itertools.permutations(gram, 2)
]

data[0:20]

In [None]:
len(data)

In [None]:
from sklearn.preprocessing import LabelBinarizer
from scipy import sparse
from scipy.sparse import csr_matrix

In [None]:
encoder = LabelBinarizer(sparse_output=False)

In [None]:
one_hot_encoder = encoder.fit(list(intent_tokens))

In [None]:
len(one_hot_encoder.classes_)

In [None]:
#Transform the input/output pairs:
# Only the first 1000 pairs are encoded.
pairs_subset = data[:1000]

# One transform call per column instead of one call per word: each result is a
# (number of pairs, number of classes) one-hot array.
intent_train_data = one_hot_encoder.transform([pair[0] for pair in pairs_subset])
intent_train_target = one_hot_encoder.transform([pair[1] for pair in pairs_subset])

In [None]:
intent_train_data = np.squeeze(np.asarray(intent_train_data))
intent_train_target = np.squeeze(np.asarray(intent_train_target))

In [None]:
%time
# Pickle the data for use later, avoiding lengthy one-hot encoding again. 
# intent_train_data
with open('pickled_intent_train_data.pkl', 'wb+') as f:
    # source, destination 
    pickle.dump(intent_train_data, f)                      

In [None]:
intent_train_data.shape

In [None]:
# Cleaning the text
# intent_text is a list of strings, so lower-case each entry individually
processed_intent_text = [text.lower() for text in intent_text]

In [None]:
# `processed_article` is not defined anywhere in the visible notebook, so the
# cleaned text is rebuilt here from intent_text: one lower-cased, letters-only
# string per intent.
processed_article = [
    re.sub(r'\s+', ' ', re.sub('[^a-zA-Z]', ' ', text.lower()))
    for text in intent_text
]

# Preparing the dataset
# each intent is treated as one sentence
all_sentences = processed_article

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
# build the stop-word set once instead of calling stopwords.words() per sentence
english_stopwords = set(stopwords.words('english'))
all_words = [[w for w in words if w not in english_stopwords] for words in all_words]

In [None]:
from gensim.models import Word2Vec
# Word2Vec trains on tokenised sentences (lists of word strings), not on the
# one-hot matrix in intent_train_data
word2vec = Word2Vec(all_words, min_count=5)

In [None]:
# gensim >= 4 exposes the vocabulary as `wv.key_to_index`; older releases use `wv.vocab`
vocabulary = word2vec.wv.key_to_index if hasattr(word2vec.wv, "key_to_index") else word2vec.wv.vocab

In [None]:
# intent_train_target
with open('pickled_intent_train_target.pkl', 'wb+') as f:
    # source, destination 
    pickle.dump(intent_train_target, f)                      

Now set up the network. 

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential

# Two-layer network: a 10-unit ReLU hidden layer feeding a softmax with one
# output per intent token.
model = keras.models.Sequential([
    keras.layers.Dense(10, activation='relu'),
    # Output Layer
    keras.layers.Dense(num_intent_tokens, activation='softmax'),
])

# Adam optimizer; categorical cross-entropy is the loss being minimized
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.CategoricalCrossentropy(),
)

In [None]:
num_epochs = 1000

# Train in 10 equal chunks: all but the last epoch of each chunk run silently,
# then one more epoch runs verbosely so its progress is printed.
epochs_per_chunk = round(num_epochs/10)
for chunk in range(0, 10):
    model.fit(intent_train_data, intent_train_target, epochs=epochs_per_chunk-1, verbose=0)

    print(f"Epoch: {(chunk+1)*epochs_per_chunk}/{num_epochs}")
    model.fit(intent_train_data, intent_train_target, verbose=1)