# Data:

See also Doc2Vec, FastText and wrappers for VarEmbed and WordRank.

In [1]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Indicate dataframes to import.
list_dfs = ['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df',
           'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df']

In [3]:
%time
# Load all data in list_dfs
data = {}
for df in list_dfs:
    dbfile = open(df, 'rb')      
    contents = pickle.load(dbfile)
    data[df] = contents
    dbfile.close()

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.96 µs


In [4]:
data.keys()

dict_keys(['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df', 'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df'])

In [5]:
df = data['combined_bag_df']

## Word2Vec

For Word2Vec, we need a list of all the sentences which will be transformed in it. So this will have to be done for both intent, and snippet. We can assemble this by combining the `conala_train_df` and the `conala_mined_df`

In [6]:
conala_train_df = data["pickled_conala_train_df"]
conala_mined_df = data["pickled_conala_mined_df"]

In [7]:
# concatenate the two dfs.
df = pd.concat([conala_train_df, conala_mined_df], ignore_index=True)

In [11]:
# Create a list of the text in intent field (not re-written) 
intent_text = list(df["intent"])
# Create a list of the code snippets in the data. 
snippet_text = list(df["snippet"])
# List of rewritten intent
intent_corpus = conala_train_df["rewritten_intent"].str.cat(sep=', ')
intent_text = list(df["rewritten_intent"])

In [10]:
#peek
intent_corpus

'Concatenate elements of a list \'x\' of multiple integers to a single integer, convert a list of integers into a single integer, convert a DateTime string back to a DateTime object of format \'%Y-%m-%d %H:%M:%S.%f\', get the average of a list values for each key in dictionary `d`), zip two lists `[1, 2]` and `[3, 4]` into a list of two tuples containing elements at the same index in each list, prepend string \'hello\' to all items in list \'a\', regex for repeating words in a string `s`, normalize a pandas dataframe `df` by row, swap values in a tuple/list inside a list `mylist`, Swap values in a tuple/list in list `mylist`, find all occurrences of the pattern \'\\\\[[^\\\\]]*\\\\]|\\\\([^\\\\)]*\\\\)|"[^"]*"|\\\\S+\' within `strs`, generate the combinations of 3 from a set `{1, 2, 3, 4}`, add multiple columns `hour`, `weekday`, `weeknum` to pandas data frame `df` from lambda function `lambdafunc`, BeautifulSoup search string \'Elsie\' inside tag \'a\', Convert a datetime object `my_d

In [12]:
#peek
intent_text

["Concatenate elements of a list 'x' of multiple integers to a single integer",
 'convert a list of integers into a single integer',
 "convert a DateTime string back to a DateTime object of format '%Y-%m-%d %H:%M:%S.%f'",
 'get the average of a list values for each key in dictionary `d`)',
 'zip two lists `[1, 2]` and `[3, 4]` into a list of two tuples containing elements at the same index in each list',
 "prepend string 'hello' to all items in list 'a'",
 'regex for repeating words in a string `s`',
 'normalize a pandas dataframe `df` by row',
 'swap values in a tuple/list inside a list `mylist`',
 'Swap values in a tuple/list in list `mylist`',
 None,
 'find all occurrences of the pattern \'\\\\[[^\\\\]]*\\\\]|\\\\([^\\\\)]*\\\\)|"[^"]*"|\\\\S+\' within `strs`',
 'generate the combinations of 3 from a set `{1, 2, 3, 4}`',
 'add multiple columns `hour`, `weekday`, `weeknum` to pandas data frame `df` from lambda function `lambdafunc`',
 "BeautifulSoup search string 'Elsie' inside tag '

In [13]:
# Create a list of all the words in the corpus
# and a full unsplit corpus 

# Processing the intent_corpus
import re
import nltk
nltk.download('punkt')
# Cleaning the text
processed_intent = intent_corpus.lower()
processed_intent = re.sub('[^a-zA-Z]', ' ', processed_intent)
processed_intent = re.sub(r'\s+', ' ', processed_intent)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_intent)
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justin.hugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
# Make a set from words in all_sentences (vocabulary)
unique_words = set(i for i in all_words[0])
num_unique = len(unique_words)
print(num_unique)

2017


# Import Gensim, and pre-trained Model

In [30]:
# Import Gensim, and get word2vec model methods. 
from gensim.models import Word2Vec
import gensim.downloader # allows downloading of existing models

# Downloading a pre-trained vector using 50 dimensions, from twitter data
wv = gensim.downloader.load('glove-twitter-50')

In [35]:
# Checking vocab type
type(wv.vocab)

dict

In [36]:
# Terms in vocab
len(wv.vocab)

1193514

In [37]:
# Checking for similar terms, cosine similarity!
wv.most_similar("man")

[('boy', 0.8404532670974731),
 ('was', 0.8205661177635193),
 ('bad', 0.819680392742157),
 ('dude', 0.8176411986351013),
 ('he', 0.8076108694076538),
 ('guy', 0.7904506921768188),
 ('god', 0.7806254029273987),
 ('hell', 0.7783043384552002),
 ('problem', 0.7771045565605164),
 ('even', 0.7761484980583191)]

In [34]:
# Check if word is in wv vocab
"cat" in wv.vocab

True

In [38]:
# How many unique word are in our corpus?
len(unique_words)

2017

now check how many of these are in the word2vec pre-trained model.

In [39]:
# Find the list of words contained in model, and those missing.
contained=[] # list of terms in both our corpus and the model
missing=[] # list of terms in our corpus, but not the model
msk=[] # True/false mask for unique words that are in the model. 
for i in unique_words:
    if(i in wv.vocab):
        msk.append(1)
        contained.append(i)
    else:
        msk.append(0)
        missing.append(i)
sum(msk)

1670

In [41]:
# peek at missing words
missing

['mylistoftuples',
 'elementtree',
 'aeiouaeiou',
 'listofzeros',
 'logarithmically',
 'referer',
 'forkedpdb',
 'encodeuricomponent',
 'urlencoded',
 'unescape',
 'iterators',
 'pdffile',
 'tuples',
 'ffffffbbbbbbbqqq',
 'newlines',
 'colorbar',
 'dictlist',
 'microtime',
 'euclidean',
 'tupples',
 'columnx',
 'symlink',
 'scriptpath',
 'groupby',
 'aabcc',
 'strg',
 'literals',
 'sdkjh',
 'userprofile',
 'ytwec',
 'argsort',
 'maketrans',
 'mystring',
 'fname',
 'yourdata',
 'someotherkey',
 'subkey',
 'lseperatedorblist',
 'appending',
 'unpivot',
 'childclass',
 'datafram',
 'custompk',
 'nosuchelementexceptions',
 'xdeadbeef',
 'virtualenv',
 'dotall',
 'dappdynamics',
 'urlencode',
 'substract',
 'mypath',
 'forceescape',
 'objs',
 'yaml',
 'listofdict',
 'lookahead',
 'operands',
 'yourdatetime',
 'itemgetter',
 'presorted',
 'lxml',
 'venv',
 'kennethreitz',
 'lolllll',
 'multiindex',
 'tuple',
 'framename',
 'itemlist',
 'correlating',
 'lambdafunc',
 'hexstring',
 'mystr',
 '

# Conducting Clustering models on the intent data.

In [48]:
intent_text[0]

"Concatenate elements of a list 'x' of multiple integers to a single integer"

In [46]:
# from intent data, we need a list of sentences. We made this previously, recall 
# Peek at first entry in intent_text list.
intent_text[0]
# we can see that this list needs to be pre-processed, since it has non-letter chars. 
# and also upper case letters. 

# from represent sentences by AVERAGE word2vec score.
# doc2vec?

5764

In [67]:
for i in intent_text[0:1]:
    i = i.lower() # lower case the sentence
    i = re.sub('[^a-zA-Z]*', ' ', i) # remove characters not a-z
    i = re.sub(r'\s*', '', i) # remove whitespace, make space
    sen_list = i.split(' ')
    print(sen_list)
    

['concatenateelementsofalistxofmultipleintegerstoasingleinteger']


In [132]:
thelist=["cat","dog",'sdfsdfweas','car']
new=[]
for i in thelist: 
    try:
        new.append(wv[i])
    except KeyError:
        continue
len(new)

3

In [156]:
# A couple of functions to help process lists of text sentences.

import re
import nltk
nltk.download('punkt')

def clean_split_text_list(li):
    '''
    Takes a list of sentences.
    Returns a list of lists, each inner list is words in a sentence.
    Also adds a space on either side of non-word, non-digit chars. 
    This allows for brackets, etc. to be considered as their own word, unless 
    vectorized with a model which does not include them.
    '''
    
    new_list = list()
    for i in li:
        i = i.lower() #lowercase the sentence
        i = re.sub('([^a-zA-Z\ \d])', r' \1 ', i) # Add spaces between special chars
        i = list(i.split(' '))
        new_list.append(i)
    return new_list

def vectorize_text_list(li):
    '''
    Takes a list of lists.
        - first list is a sentence
        - inner list is a list of words.
    Returns a list of lists, each inner list is words in a sentence.
    Also adds a space on either side of non-word, non-digit chars. 
    This allows for brackets, etc. to be considered as their own word, unless 
    vectorized with a model which does not include them.
    '''
    new_list=list() # new list object to be returned at end.
    for i in li:
        sub_list=list() # list of vecs, representing a sentence
        for j in i: 
            try:
                vec = wv[j]
                sub_list.append(vec)
            except KeyError:
                continue
        new_list.append(sub_list)
    return new_list

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justin.hugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# Check
print(intent_text[:10])
print(snippet_text[:10])

["Concatenate elements of a list 'x' of multiple integers to a single integer", 'convert a list of integers into a single integer', "convert a DateTime string back to a DateTime object of format '%Y-%m-%d %H:%M:%S.%f'", 'get the average of a list values for each key in dictionary `d`)', 'zip two lists `[1, 2]` and `[3, 4]` into a list of two tuples containing elements at the same index in each list', "prepend string 'hello' to all items in list 'a'", 'regex for repeating words in a string `s`', 'normalize a pandas dataframe `df` by row', 'swap values in a tuple/list inside a list `mylist`', 'Swap values in a tuple/list in list `mylist`']
['sum(d * 10 ** i for i, d in enumerate(x[::-1]))', "r = int(''.join(map(str, x)))", "datetime.strptime('2010-11-13 10:33:54.227806', '%Y-%m-%d %H:%M:%S.%f')", '[(i, sum(j) / len(j)) for i, j in list(d.items())]', 'zip([1, 2], [3, 4])', "['hello{0}'.format(i) for i in a]", "re.sub('(?<!\\\\S)((\\\\S+)(?:\\\\s+\\\\2))(?:\\\\s+\\\\2)+(?!\\\\S)', '\\\\1',

Now we need to get each unique word in the text, and for the code, each unique char.

In [None]:
# Get unique words in text
intent_tokens = set()
    
for intent in tqdm(intent_text):
    for word in intent.split(" "):
        intent_tokens.add(word)

num_intent_tokens = len(intent_tokens)
intent_tokens

In [None]:
len(intent_text)

In [None]:
num_intent_tokens

Now set up the network. 

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
model = keras.models.Sequential()

model.add(keras.layers.Dense(10, activation='relu'))

# Output Layer
model.add(keras.layers.Dense(num_intent_tokens, activation='softmax'))

model.compile(
    # Optimizer
    optimizer=keras.optimizers.Adam(),  
    # Loss function to minimize
    loss=keras.losses.CategoricalCrossentropy()
)

In [None]:
num_epochs = 1000

# Printout a single verbose fit operation 10 times throughout the training process.
for i in range(0, 10):
    model.fit(intent_train_data, intent_train_target, epochs=round(num_epochs/10)-1, verbose=0)
    
    print(f"Epoch: {(i+1)*round(num_epochs/10)}/{num_epochs}")
    model.fit(intent_train_data, intent_train_target, verbose=1)