# Word2Vec

Import libaries known to be needed.

In [47]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

## Get Data:

Data has been saved into pickle files. Need to unpickle, and save as variables in this workbook to work with them.

In [2]:
# Indicate dataframes to import.
list_dfs = ['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df',
           'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df']

In [111]:
%time
# Load all data in list_dfs
data = {}
for df in list_dfs:
    dbfile = open(df, 'rb')      
    contents = pickle.load(dbfile)
    data[df] = contents
    dbfile.close()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 13.1 µs


## Preprocessing in prep for Word2Vec

For Word2Vec, we need a list of all the sentences which will be transformed in it. So this will have to be done for both intent, and snippet. 

We've discussed maybe combining mined and trained data, but let's start with just the train data first.

In [112]:
df = data["pickled_conala_train_df"]

In [122]:
# Should make a pipeline in the future.
# check for na in the data.
df["rewritten_intent"].isna().sum()

79

79 rows of na. Let's drop them.

In [127]:
# drop the na rows
df.dropna(inplace=True)
# new length of list
print(len(df))

In [129]:
# Create a list of the code snippets in the data. 
snippet_text = list(df["snippet"])
# List of rewritten intent
intent_text = list(df["rewritten_intent"])

In [130]:
# Dependencies for processing the intent_corpus
import re
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justin.hugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [134]:
# Cleaning the text by lowering
processed_intent = [] # Create new list for the resulting text
for i in range(len(intent_text)): # loop through all contents in text list
    processed_intent.append(intent_text[i].lower()) # new list is lowered strings

# check length is same
print(len(processed_intent))
# peek 
print(processed_intent[:11])

2300
["concatenate elements of a list 'x' of multiple integers to a single integer", 'convert a list of integers into a single integer', "convert a datetime string back to a datetime object of format '%y-%m-%d %h:%m:%s.%f'", 'get the average of a list values for each key in dictionary `d`)', 'zip two lists `[1, 2]` and `[3, 4]` into a list of two tuples containing elements at the same index in each list', "prepend string 'hello' to all items in list 'a'", 'regex for repeating words in a string `s`', 'normalize a pandas dataframe `df` by row', 'swap values in a tuple/list inside a list `mylist`', 'swap values in a tuple/list in list `mylist`', 'find all occurrences of the pattern \'\\\\[[^\\\\]]*\\\\]|\\\\([^\\\\)]*\\\\)|"[^"]*"|\\\\s+\' within `strs`']


In [137]:
# further process the intent list, remove special chars
temp_intent = [] # Create new list for the resulting text
for i in range(len(processed_intent)): # loop through all contents in text list
    temp = re.sub('[^a-zA-Z]', ' ', processed_intent[i]) # replace non letters with ' '
    temp_intent.append(temp) # new list is lowered strings
processed_intent = temp_intent # save new processed list
processed_intent

['concatenate elements of a list  x  of multiple integers to a single integer',
 'convert a list of integers into a single integer',
 'convert a datetime string back to a datetime object of format   y  m  d  h  m  s  f ',
 'get the average of a list values for each key in dictionary  d  ',
 'zip two lists          and          into a list of two tuples containing elements at the same index in each list',
 'prepend string  hello  to all items in list  a ',
 'regex for repeating words in a string  s ',
 'normalize a pandas dataframe  df  by row',
 'swap values in a tuple list inside a list  mylist ',
 'swap values in a tuple list in list  mylist ',
 'find all occurrences of the pattern                                        s   within  strs ',
 'generate the combinations of   from a set               ',
 'add multiple columns  hour    weekday    weeknum  to pandas data frame  df  from lambda function  lambdafunc ',
 'beautifulsoup search string  elsie  inside tag  a ',
 'convert a dateti

In [138]:
# further process the intent list, reduce whitespace special chars
temp_intent = [] # Create new list for the resulting text
for i in range(len(processed_intent)): # loop through all contents in text list
    temp = re.sub(r'\s+', ' ', processed_intent[i]) # replace non letters with ' '
    temp_intent.append(temp) # new list is lowered strings
processed_intent = temp_intent # save new processed list
processed_intent

['concatenate elements of a list x of multiple integers to a single integer',
 'convert a list of integers into a single integer',
 'convert a datetime string back to a datetime object of format y m d h m s f ',
 'get the average of a list values for each key in dictionary d ',
 'zip two lists and into a list of two tuples containing elements at the same index in each list',
 'prepend string hello to all items in list a ',
 'regex for repeating words in a string s ',
 'normalize a pandas dataframe df by row',
 'swap values in a tuple list inside a list mylist ',
 'swap values in a tuple list in list mylist ',
 'find all occurrences of the pattern s within strs ',
 'generate the combinations of from a set ',
 'add multiple columns hour weekday weeknum to pandas data frame df from lambda function lambdafunc ',
 'beautifulsoup search string elsie inside tag a ',
 'convert a datetime object my datetime into readable format b d y ',
 'parse string s to int when string contains a number',
 '

In [148]:
# break up the list of sentences into individual words. 
li = clean_split_text_list(processed_intent)
li

[['concatenate',
  'elements',
  'of',
  'a',
  'list',
  'x',
  'of',
  'multiple',
  'integers',
  'to',
  'a',
  'single',
  'integer'],
 ['convert', 'a', 'list', 'of', 'integers', 'into', 'a', 'single', 'integer'],
 ['convert',
  'a',
  'datetime',
  'string',
  'back',
  'to',
  'a',
  'datetime',
  'object',
  'of',
  'format',
  'y',
  'm',
  'd',
  'h',
  'm',
  's',
  'f',
  ''],
 ['get',
  'the',
  'average',
  'of',
  'a',
  'list',
  'values',
  'for',
  'each',
  'key',
  'in',
  'dictionary',
  'd',
  ''],
 ['zip',
  'two',
  'lists',
  'and',
  'into',
  'a',
  'list',
  'of',
  'two',
  'tuples',
  'containing',
  'elements',
  'at',
  'the',
  'same',
  'index',
  'in',
  'each',
  'list'],
 ['prepend', 'string', 'hello', 'to', 'all', 'items', 'in', 'list', 'a', ''],
 ['regex', 'for', 'repeating', 'words', 'in', 'a', 'string', 's', ''],
 ['normalize', 'a', 'pandas', 'dataframe', 'df', 'by', 'row'],
 ['swap',
  'values',
  'in',
  'a',
  'tuple',
  'list',
  'inside',
 

In [None]:
def text_clustering_pipe():
    ''' 
    list of sentences -> list of arrays (text cleaned)

    My custom pipline for processing text in prep for clustering techniques.
    '''
    
    # start with list of sentences
    # lowercase the text
    # remove symbols
    # remove extra empty spaces
    
    # split text (clean_split_text_list(li)
    
    # create a vocabulary 
    
    # vectorize the text
    
    # return the list of of list of arrays
    
def text_clustering_pipe():
    ''' 
    list of list of multiple arrays (text cleaned) -> list of arrays (average sentences)

    Create target vectors for sentences.
    '''
    
    # loop through list, average the arrays contained.
    
    # return list of arrays (averaged sentences)

In [149]:
# create all_words set
all_words = set()
for i in range(len(li)):
    for j in range(len(li[i])):
        all_words.add(li[i][j])

# Number of unique words.        
len(all_words)

2113

# Import Gensim, and pre-trained Model

In [13]:
# Import Gensim, and get word2vec model methods. 
from gensim.models import Word2Vec
import gensim.downloader # allows downloading of existing models

# Downloading a pre-trained vector using 50 dimensions, from twitter data
wv = gensim.downloader.load('glove-twitter-50')

In [14]:
# Checking vocab type
type(wv.vocab)

dict

In [15]:
# Terms in vocab
len(wv.vocab)

1193514

In [150]:
# Checking for similar terms, cosine similarity!
wv.most_similar("man")

[('boy', 0.8404532670974731),
 ('was', 0.8205661177635193),
 ('bad', 0.819680392742157),
 ('dude', 0.8176411986351013),
 ('he', 0.8076108694076538),
 ('guy', 0.7904506921768188),
 ('god', 0.7806254029273987),
 ('hell', 0.7783043384552002),
 ('problem', 0.7771045565605164),
 ('even', 0.7761484980583191)]

In [151]:
# Check if word is in wv vocab
"cat" in wv.vocab

True

In [152]:
# Recall how many unique word are in our corpus?
len(all_words)

2113

now check how many of these are in the word2vec pre-trained model.

In [159]:
# Find the list of words contained in model, and those missing.
contained=[] # list of terms in both our corpus and the model
missing=[] # list of terms in our corpus, but not the model
msk=[] # True/false mask for unique words that are in the model. 
for i in all_words:
    if(i in wv.vocab):
        msk.append(1)
        contained.append(i)
    else:
        msk.append(0)
        missing.append(i)
# num words from all_words in the pre-trained model
print(len(contained))
# num words from all_words NOT in the pre-trained model
print(len(missing))

1765
348


# Set up to try Clustering

In [24]:
len(intent_text)

5764

In [32]:
%time
vecs = vectorize_text_list(clean_split_text_list(intent_text))

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 4.05 µs


TypeError: 'numpy.float32' object is not iterable

In [141]:
# A couple of functions to help process lists of text sentences.

import re
import nltk
nltk.download('punkt')



def clean_split_text_list(li):
    '''
    Takes a list of sentences.
    Returns a list of lists, each inner list is words in a sentence.
    Also adds a space on either side of non-word, non-digit chars. 
    This allows for brackets, etc. to be considered as their own word, unless 
    vectorized with a model which does not include them.
    '''
    
    new_list = list()
    for i in li:
        if type(i) == float: # recieved errors with handling floats as strings, make into string
            i = str(i)
        if i == None:
            new_list.append(np.zeros_like(wv["empty"])) # If None, empty array of wv shape.
            continue
        try:
            i = i.lower() #lowercase the sentence
        except:
            pass # if cannot be lowered, pass.
        try:
            i = re.sub('([^a-zA-Z\ \d])', r' \1 ', i) # Add spaces between special chars
        except:
            pass
        try:
            i = list(i.split(' '))
        except:
            pass
        new_list.append(i)
    return new_list

def clean_punc(li):
    '''
    Takes a list of sentences, with nested list of words.
    Removes items which are not characters.
    '''
    new_list = list()
    for i in li:
        sub_list = list()
        for j in i:
            try:
                j = re.sub(r'([^a-zA-Z]*)', '', j) # remove non-word chars.
            except:
                pass
            if type(i) == float:
                i = str(i)
            sub_list.append(j)
        new_list.append(sub_list)
    return new_list

def vectorize_text_list(li):
    '''
    Takes a list of lists.
        - first list is a sentence
        - inner list is a list of words.
    Returns a list of lists, each inner list is words in a sentence.
    Also adds a space on either side of non-word, non-digit chars. 
    This allows for brackets, etc. to be considered as their own word, unless 
    vectorized with a model which does not include them.
    '''
    new_list=list() # new list object to be returned at end.
    for i in li:
        # this check is causing issues when it finds an array.
        # so add a try, and another check a level lower? 
        try:
            if i == None: 
                new_list.append(np.zeros_like(wv["empty"])) # If None, empty array of wv shape.
                continue
        except ValueError:
            pass
        if type(i) == float:
            i = str(i)
        sub_list=list() # list of vecs, representing a sentence
        for j in i: 
            try:
                if type(i) == float:
                    i = str(i)
                vec = wv[j]
                sub_list.append(vec)
            except KeyError:
                continue
        new_list.append(sub_list)
    return new_list

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/justin.hugh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
vecs[0][0].shape

In [45]:
li = clean_split_text_list(intent_text)

new_list = list()
for i in li:
    for j in i:
        try:
            j = re.sub('\W*', '', j) # remove non-word chars.
        except:
            pass
        new_list.append(j)
new_list

['concatenate',
 'elements',
 'of',
 'a',
 'list',
 '',
 '',
 'x',
 '',
 '',
 'of',
 'multiple',
 'integers',
 'to',
 'a',
 'single',
 'integer',
 'convert',
 'a',
 'list',
 'of',
 'integers',
 'into',
 'a',
 'single',
 'integer',
 'convert',
 'a',
 'datetime',
 'string',
 'back',
 'to',
 'a',
 'datetime',
 'object',
 'of',
 'format',
 '',
 '',
 '',
 '',
 'y',
 '',
 '',
 '',
 'm',
 '',
 '',
 '',
 'd',
 '',
 '',
 'h',
 '',
 '',
 '',
 'm',
 '',
 '',
 '',
 's',
 '',
 '',
 '',
 'f',
 '',
 '',
 'get',
 'the',
 'average',
 'of',
 'a',
 'list',
 'values',
 'for',
 'each',
 'key',
 'in',
 'dictionary',
 '',
 '',
 'd',
 '',
 '',
 '',
 '',
 'zip',
 'two',
 'lists',
 '',
 '',
 '',
 '',
 '1',
 '',
 '',
 '2',
 '',
 '',
 '',
 '',
 'and',
 '',
 '',
 '',
 '',
 '3',
 '',
 '',
 '4',
 '',
 '',
 '',
 '',
 'into',
 'a',
 'list',
 'of',
 'two',
 'tuples',
 'containing',
 'elements',
 'at',
 'the',
 'same',
 'index',
 'in',
 'each',
 'list',
 'prepend',
 'string',
 '',
 '',
 'hello',
 '',
 '',
 'to',
 'all',

# Clustering Models

Import classification models. 

In [None]:
# Unsupervised Learning - Methods
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import DBSCAN

nltk has a clustering library. Let's import it

In [None]:
from nltk.cluster import KMeansClusterer
import nltk

In [None]:
NUM_CLUSTERS=10
kclusterer = KMeansClusterer(NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
                             repeats=25)

In [None]:
len(wv.vocab)

In [None]:
# Example X given by tutorial. This will cluster on 1 MILLION
# words! Do not cluster on this. 
X = wv[wv.vocab]
X.shape

Run clustering on own dictionary items. 

First need to run word 2 vec on the list of intents. 
Recall `clean_split_text_list(intent_text)`

In [None]:
sentences = clean_punc(clean_split_text_list(intent_text))
sentences

In [None]:
# list of lists, with words in sentences.
sentences = clean_punc(clean_split_text_list(intent_text))
intent_wv = Word2Vec(sentences, min_count=1)

In [None]:
# Create new vocab object from unique_words
from gensim.models import Word2Vec
model = Word2Vec


In [None]:
%time
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)
print (assigned_clusters)