# Data:

See also Doc2Vec, FastText and wrappers for VarEmbed and WordRank.

In [1]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Indicate dataframes to import.
list_dfs = ['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df',
           'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df']

In [3]:
%time
# Load all data in list_dfs
data = {}
for df in list_dfs:
    dbfile = open(df, 'rb')      
    contents = pickle.load(dbfile)
    data[df] = contents
    dbfile.close()

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.1 µs


In [4]:
data.keys()

dict_keys(['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df', 'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df'])

In [5]:
df = data['combined_bag_df']

## Word2Vec

For Word2Vec, we need a list of all the sentences which will be transformed in it. So this will have to be done for both intent, and snippet. We can assemble this by combining the `conala_train_df` and the `conala_mined_df`

In [6]:
conala_train_df = data["pickled_conala_train_df"]
conala_mined_df = data["pickled_conala_mined_df"]

In [7]:
# concatenate the two dfs.
df = pd.concat([conala_train_df, conala_mined_df], ignore_index=True)

In [8]:
# Create a list of the text in intent field (not re-written) 
intent_text = list(df["intent"])
# Create a list of the code snippets in the data. 
snippet_text = list(df["snippet"])
# List of rewritten intent
intent_corpus = conala_train_df["rewritten_intent"].str.cat(sep=', ')
intent_text = list(df["rewritten_intent"])

In [9]:
#peek
intent_corpus

'Concatenate elements of a list \'x\' of multiple integers to a single integer, convert a list of integers into a single integer, convert a DateTime string back to a DateTime object of format \'%Y-%m-%d %H:%M:%S.%f\', get the average of a list values for each key in dictionary `d`), zip two lists `[1, 2]` and `[3, 4]` into a list of two tuples containing elements at the same index in each list, prepend string \'hello\' to all items in list \'a\', regex for repeating words in a string `s`, normalize a pandas dataframe `df` by row, swap values in a tuple/list inside a list `mylist`, Swap values in a tuple/list in list `mylist`, find all occurrences of the pattern \'\\\\[[^\\\\]]*\\\\]|\\\\([^\\\\)]*\\\\)|"[^"]*"|\\\\S+\' within `strs`, generate the combinations of 3 from a set `{1, 2, 3, 4}`, add multiple columns `hour`, `weekday`, `weeknum` to pandas data frame `df` from lambda function `lambdafunc`, BeautifulSoup search string \'Elsie\' inside tag \'a\', Convert a datetime object `my_d

In [10]:
#peek
intent_text

["Concatenate elements of a list 'x' of multiple integers to a single integer",
 'convert a list of integers into a single integer',
 "convert a DateTime string back to a DateTime object of format '%Y-%m-%d %H:%M:%S.%f'",
 'get the average of a list values for each key in dictionary `d`)',
 'zip two lists `[1, 2]` and `[3, 4]` into a list of two tuples containing elements at the same index in each list',
 "prepend string 'hello' to all items in list 'a'",
 'regex for repeating words in a string `s`',
 'normalize a pandas dataframe `df` by row',
 'swap values in a tuple/list inside a list `mylist`',
 'Swap values in a tuple/list in list `mylist`',
 None,
 'find all occurrences of the pattern \'\\\\[[^\\\\]]*\\\\]|\\\\([^\\\\)]*\\\\)|"[^"]*"|\\\\S+\' within `strs`',
 'generate the combinations of 3 from a set `{1, 2, 3, 4}`',
 'add multiple columns `hour`, `weekday`, `weeknum` to pandas data frame `df` from lambda function `lambdafunc`',
 "BeautifulSoup search string 'Elsie' inside tag '

In [11]:
# Create a list of all the words in the corpus
# and a full unsplit corpus 

# Processing the intent_corpus
import re
import nltk
nltk.download('punkt')
# Cleaning the text
processed_intent = intent_corpus.lower()
processed_intent = re.sub('[^a-zA-Z]', ' ', processed_intent)
processed_intent = re.sub(r'\s+', ' ', processed_intent)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_intent)
all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justin.hugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Make a set from words in all_sentences (vocabulary)
unique_words = set(i for i in all_words[0])
num_unique = len(unique_words)
print(num_unique)

2017


# Import Gensim, and pre-trained Model

In [13]:
# Import Gensim, and get word2vec model methods. 
from gensim.models import Word2Vec
import gensim.downloader # allows downloading of existing models

# Downloading a pre-trained vector using 50 dimensions, from twitter data
wv = gensim.downloader.load('glove-twitter-50')

In [14]:
# Checking vocab type
type(wv.vocab)

dict

In [15]:
# Terms in vocab
len(wv.vocab)

1193514

In [16]:
# Checking for similar terms, cosine similarity!
wv.most_similar("man")

[('boy', 0.8404532670974731),
 ('was', 0.8205661177635193),
 ('bad', 0.819680392742157),
 ('dude', 0.8176411986351013),
 ('he', 0.8076108694076538),
 ('guy', 0.7904506921768188),
 ('god', 0.7806254029273987),
 ('hell', 0.7783043384552002),
 ('problem', 0.7771045565605164),
 ('even', 0.7761484980583191)]

In [17]:
# Check if word is in wv vocab
"cat" in wv.vocab

True

In [18]:
# How many unique word are in our corpus?
len(unique_words)

2017

now check how many of these are in the word2vec pre-trained model.

In [19]:
# Find the list of words contained in model, and those missing.
contained=[] # list of terms in both our corpus and the model
missing=[] # list of terms in our corpus, but not the model
msk=[] # True/false mask for unique words that are in the model. 
for i in unique_words:
    if(i in wv.vocab):
        msk.append(1)
        contained.append(i)
    else:
        msk.append(0)
        missing.append(i)
sum(msk)

1670

In [20]:
# peek at missing words
missing

['datafram',
 'stdin',
 'centroids',
 'uenc',
 'tuple',
 'existgdbpath',
 'webbrowser',
 'bigdict',
 'getattr',
 'mymodel',
 'pplnum',
 'bashrc',
 'delimeters',
 'wxpython',
 'concatenate',
 'csvwriter',
 'mydict',
 'reassign',
 'pymongo',
 'listtwo',
 'aaabbbccc',
 'colorbar',
 'cookiename',
 'dsomeotherparam',
 'sudsmove',
 'separators',
 'pylab',
 'tkinter',
 'encodeuricomponent',
 'elementwise',
 'appending',
 'roomnum',
 'xbc',
 'xbcy',
 'firstset',
 'overwriting',
 'parenthesesis',
 'abcdabcva',
 'iterators',
 'quadmesh',
 'dictlist',
 'subcolumn',
 'substring',
 'interpeter',
 'unescape',
 'mylistoftuples',
 'subkey',
 'decryption',
 'parsed',
 'checksum',
 'serialize',
 'dataframe',
 'eplacement',
 'citypopulation',
 'thelist',
 'saleid',
 'adfix',
 'dateobj',
 'userprofile',
 'sequentially',
 'dtypes',
 'onclick',
 'objs',
 'subdirectories',
 'thedict',
 'atgc',
 'pygobject',
 'lseperatedorblist',
 'reporo',
 'xdeadbeef',
 'subsets',
 'mystringhere',
 'taskkill',
 'componentre

# Set up to try Clustering

In [21]:
intent_text[0]

"Concatenate elements of a list 'x' of multiple integers to a single integer"

In [22]:
# from intent data, we need a list of sentences. We made this previously, recall 
# Peek at first entry in intent_text list.
intent_text[0]
# we can see that this list needs to be pre-processed, since it has non-letter chars. 
# and also upper case letters. 

# from represent sentences by AVERAGE word2vec score.
# doc2vec?

"Concatenate elements of a list 'x' of multiple integers to a single integer"

In [39]:
clean_split_text_list(intent_text)

[['concatenate',
  'elements',
  'of',
  'a',
  'list',
  '',
  "'",
  'x',
  "'",
  '',
  'of',
  'multiple',
  'integers',
  'to',
  'a',
  'single',
  'integer'],
 ['convert', 'a', 'list', 'of', 'integers', 'into', 'a', 'single', 'integer'],
 ['convert',
  'a',
  'datetime',
  'string',
  'back',
  'to',
  'a',
  'datetime',
  'object',
  'of',
  'format',
  '',
  "'",
  '',
  '%',
  'y',
  '-',
  '',
  '%',
  'm',
  '-',
  '',
  '%',
  'd',
  '',
  '%',
  'h',
  ':',
  '',
  '%',
  'm',
  ':',
  '',
  '%',
  's',
  '.',
  '',
  '%',
  'f',
  "'",
  ''],
 ['get',
  'the',
  'average',
  'of',
  'a',
  'list',
  'values',
  'for',
  'each',
  'key',
  'in',
  'dictionary',
  '',
  '`',
  'd',
  '`',
  '',
  ')',
  ''],
 ['zip',
  'two',
  'lists',
  '',
  '`',
  '',
  '[',
  '1',
  ',',
  '',
  '2',
  ']',
  '',
  '`',
  '',
  'and',
  '',
  '`',
  '',
  '[',
  '3',
  ',',
  '',
  '4',
  ']',
  '',
  '`',
  '',
  'into',
  'a',
  'list',
  'of',
  'two',
  'tuples',
  'containing',
 

In [72]:
len(intent_text)

5764

In [76]:
%time
vecs = vectorize_text_list(clean_split_text_list(intent_text))

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.96 µs


In [178]:
# A couple of functions to help process lists of text sentences.

import re
import nltk
nltk.download('punkt')

def clean_split_text_list(li):
    '''
    Takes a list of sentences.
    Returns a list of lists, each inner list is words in a sentence.
    Also adds a space on either side of non-word, non-digit chars. 
    This allows for brackets, etc. to be considered as their own word, unless 
    vectorized with a model which does not include them.
    '''
    
    new_list = list()
    for i in li:
        if type(i) == float:
            i = str(i)
        if i == None:
            new_list.append(np.zeros_like(wv["empty"])) # If None, empty array of wv shape.
            continue
        try:
            i = i.lower() #lowercase the sentence
        except:
            pass
        try:
            i = re.sub('([^a-zA-Z\ \d])', r' \1 ', i) # Add spaces between special chars
        except:
            pass
        try:
            i = list(i.split(' '))
        except:
            pass
        new_list.append(i)
    return new_list

def clean_punc(li):
    '''
    Takes a list of sentences, with nested list of words.
    Removes items which are not characters.
    '''
    new_list = list()
    for i in li:
        sub_list = list()
        for j in i:
            try:
                j = re.sub(r'([^a-zA-Z]*)', '', j) # remove non-word chars.
            except:
                pass
            if type(i) == float:
                i = str(i)
            sub_list.append(j)
        new_list.append(sub_list)
    return new_list

def vectorize_text_list(li):
    '''
    Takes a list of lists.
        - first list is a sentence
        - inner list is a list of words.
    Returns a list of lists, each inner list is words in a sentence.
    Also adds a space on either side of non-word, non-digit chars. 
    This allows for brackets, etc. to be considered as their own word, unless 
    vectorized with a model which does not include them.
    '''
    new_list=list() # new list object to be returned at end.
    for i in li:
        if i == None:
            new_list.append(np.zeros_like(wv["empty"])) # If None, empty array of wv shape.
            continue
        if type(i) == float:
            i = str(i)
        sub_list=list() # list of vecs, representing a sentence
        for j in i: 
            try:
                vec = wv[j]
                sub_list.append(vec)
            except KeyError:
                continue
        new_list.append(sub_list)
    return new_list

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justin.hugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [128]:
vecs[0][0].shape

(50,)

In [160]:
li = clean_split_text_list(intent_text[0:2])

new_list = list()
for i in li:
    for j in i:
            try:
        j = re.sub('\W*', '', j) # remove non-word chars.
            except:
                pass
        new_list.append(j)
new_list

['concatenate',
 'elements',
 'of',
 'a',
 'list',
 '',
 '',
 'x',
 '',
 '',
 'of',
 'multiple',
 'integers',
 'to',
 'a',
 'single',
 'integer',
 'convert',
 'a',
 'list',
 'of',
 'integers',
 'into',
 'a',
 'single',
 'integer']

# Clustering Models

Import classification models. 

In [82]:
# Unsupervised Learning - Methods
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import DBSCAN

nltk has a clustering library. Let's import it

In [99]:
from nltk.cluster import KMeansClusterer
import nltk

In [112]:
NUM_CLUSTERS=10
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
                             repeats=25)

In [86]:
len(wv.vocab)

1193514

In [111]:
# Example X given by tutorial. This will cluster on 1 MILLION
# words! Do not cluster on this. 
X = wv[wv.vocab]
X.shape

(1193514, 50)

Run clustering on own dictionary items. 

First need to run word 2 vec on the list of intents. 
Recall `clean_split_text_list(intent_text)`

In [179]:
sentences = clean_punc(clean_split_text_list(intent_text))
sentences

['concatenate',
 'elements',
 'of',
 'a',
 'list',
 '',
 '',
 'x',
 '',
 '',
 'of',
 'multiple',
 'integers',
 'to',
 'a',
 'single',
 'integer',
 'convert',
 'a',
 'list',
 'of',
 'integers',
 'into',
 'a',
 'single',
 'integer',
 'convert',
 'a',
 'datetime',
 'string',
 'back',
 'to',
 'a',
 'datetime',
 'object',
 'of',
 'format',
 '',
 '',
 '',
 '',
 'y',
 '',
 '',
 '',
 'm',
 '',
 '',
 '',
 'd',
 '',
 '',
 'h',
 '',
 '',
 '',
 'm',
 '',
 '',
 '',
 's',
 '',
 '',
 '',
 'f',
 '',
 '',
 'get',
 'the',
 'average',
 'of',
 'a',
 'list',
 'values',
 'for',
 'each',
 'key',
 'in',
 'dictionary',
 '',
 '',
 'd',
 '',
 '',
 '',
 '',
 'zip',
 'two',
 'lists',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'and',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'into',
 'a',
 'list',
 'of',
 'two',
 'tuples',
 'containing',
 'elements',
 'at',
 'the',
 'same',
 'index',
 'in',
 'each',
 'list',
 'prepend',
 'string',
 '',
 '',
 'hello',
 '',
 '',
 'to',
 'all',
 'i

In [180]:
# list of lists, with words in sentences.
sentences = clean_punc(clean_split_text_list(intent_text))
intent_wv = Word2Vec(sentences, min_count=1)

TypeError: 'numpy.float32' object is not iterable

In [119]:
# Create new vocab object from unique_words
from gensim.models import Word2Vec
model = Word2Vec


KeyError: "word 'datafram' not in vocabulary"

In [109]:
%time
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print (assigned_clusters)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 5.01 µs


KeyboardInterrupt: 