# Data:

In [1]:
import pickle
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [2]:
# Indicate dataframes to import.
list_dfs = ['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df',
           'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df']

In [3]:
%time
# Load all data in list_dfs
data = {}
for df in list_dfs:
    dbfile = open(df, 'rb')      
    contents = pickle.load(dbfile)
    data[df] = contents
    dbfile.close()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.72 µs


In [4]:
data.keys()

dict_keys(['pickled_conala_mined_df', 'pickled_conala_train_df', 'pickled_conala_test_df', 'conala_train_bag_df', 'conala_mined_bag_df', 'combined_bag_df'])

In [5]:
df = data['combined_bag_df']

## Word2Vec

For Word2Vec, we need a list of all the sentences which will be transformed in it. So this will have to be done for both intent, and snippet. We can assemble this by combining the `conala_train_df` and the `conala_mined_df`

In [6]:
conala_train_df = data["pickled_conala_train_df"]
conala_mined_df = data["pickled_conala_mined_df"]

In [7]:
# concatenate the two dfs.
df = pd.concat([conala_train_df, conala_mined_df], ignore_index=True)

In [8]:
# Peek
df

Unnamed: 0,intent,rewritten_intent,snippet,question_id,parent_answer_post_id,prob,id
0,How to convert a list of multiple integers int...,Concatenate elements of a list 'x' of multiple...,"sum(d * 10 ** i for i, d in enumerate(x[::-1]))",41067960,,,
1,How to convert a list of multiple integers int...,convert a list of integers into a single integer,"r = int(''.join(map(str, x)))",41067960,,,
2,how to convert a datetime string back to datet...,convert a DateTime string back to a DateTime o...,datetime.strptime('2010-11-13 10:33:54.227806'...,4170655,,,
3,Averaging the values in a dictionary based on ...,get the average of a list values for each key ...,"[(i, sum(j) / len(j)) for i, j in list(d.items...",29565452,,,
4,zip lists in python,"zip two lists `[1, 2]` and `[3, 4]` into a lis...","zip([1, 2], [3, 4])",13704860,,,
...,...,...,...,...,...,...,...
5759,How to convert datetime to string in python in...,,{{(item.date | date): 'Y M d'}},794995,795000.0,0.500243,794995_795000_0
5760,Delete column from pandas DataFrame,,"df = df.drop('column_name', 1)",13411544,18145399.0,0.500193,13411544_18145399_2
5761,How to get a list which is a value of a dictio...,,"reverse_d = {value: key for key, values in lis...",40584186,40584271.0,0.500171,40584186_40584271_0
5762,Cross-platform addressing of the config file,,config_file = os.path.expanduser('~/foo.ini'),3227624,3227931.0,0.500164,3227624_3227931_0


In [9]:
# Create a list of the text in intent field. (Note this is NOT using the 
# rewritten intent in the training data.)
intent_text = list(df["intent"])

# Create a list of the code snippets in the data. 
snippet_text = list(df["snippet"])

In [10]:
# Check
print(intent_text[:10])
print(snippet_text[:10])

['How to convert a list of multiple integers into a single integer?', 'How to convert a list of multiple integers into a single integer?', 'how to convert a datetime string back to datetime object?', 'Averaging the values in a dictionary based on the key', 'zip lists in python', 'Prepend the same string to all items in a list', 'regex for repeating words in a string in Python', 'Normalizing a pandas DataFrame by row', 'swap values in a tuple/list inside a list in python?', 'swap values in a tuple/list inside a list in python?']
['sum(d * 10 ** i for i, d in enumerate(x[::-1]))', "r = int(''.join(map(str, x)))", "datetime.strptime('2010-11-13 10:33:54.227806', '%Y-%m-%d %H:%M:%S.%f')", '[(i, sum(j) / len(j)) for i, j in list(d.items())]', 'zip([1, 2], [3, 4])', "['hello{0}'.format(i) for i in a]", "re.sub('(?<!\\\\S)((\\\\S+)(?:\\\\s+\\\\2))(?:\\\\s+\\\\2)+(?!\\\\S)', '\\\\1', s)", 'df.div(df.sum(axis=1), axis=0)', 'map(lambda t: (t[1], t[0]), mylist)', '[(t[1], t[0]) for t in mylist]']

Now we need to get each unique word in the text, and for the code, each unique char.

In [11]:
# Get unique words in text
intent_tokens = set()
    
for intent in tqdm(intent_text):
    for word in intent.split(" "):
        intent_tokens.add(word)

num_intent_tokens = len(intent_tokens)
intent_tokens

100%|██████████| 5764/5764 [00:00<00:00, 204870.67it/s]


{'float',
 'compose',
 'possible',
 'entry',
 '',
 'Horizontal',
 'Iterate',
 'Bulk',
 'fixed',
 'only?',
 'variables',
 'capital',
 'dot',
 'app?',
 'gradients',
 '0s',
 'Max',
 'box?',
 'tabs',
 'Pillow',
 'converting',
 'cookie',
 'based',
 're-indexing',
 'utf-string',
 'Office',
 'Time',
 'Expat',
 'xlwt?',
 'go',
 'Evaluating',
 'listing',
 'pylab.savefig()',
 'str(dict)?',
 'figure?',
 'psycopg2:',
 '0-dimension',
 'gracefully',
 'y-axis',
 'concise',
 'Contained',
 'place',
 'index?',
 'parse',
 'configure',
 'App',
 'modules',
 'sub-level',
 'python,',
 'libraries?',
 'unicode_literals',
 'fashion',
 'occurences',
 'comma-separated',
 '(float)',
 'correspond',
 'value)',
 'groupby:',
 'Flask-Mail',
 'them',
 'doubling',
 'integrate',
 'bower',
 'year?',
 'pipe?',
 'interactive?',
 'Uniqueness',
 'id?',
 'crop',
 'sets',
 'pandas',
 'driver',
 "'£'",
 'SQLAlchemy-flask',
 'Shell',
 'Encoding',
 'Pyhon',
 'extension?',
 'INSERT',
 'visible?',
 'repeating',
 'slow',
 'installed',

In [12]:
len(intent_text)

5764

In [13]:
num_intent_tokens

3658

In [14]:
# Create the data with N-grams
from nltk import ngrams
import itertools

gram_size = 4
data = []

# Go over each intent statement
for intent in tqdm(intent_text):
    # Finds all n-grams in the statement
    grams = ngrams(intent.split(), gram_size)
    for gram in grams:
        # Find all pairs of words within this n-gram
        for pair in itertools.permutations(gram, 2):
            data.append(pair)

data[0:20]

100%|██████████| 5764/5764 [00:00<00:00, 27912.57it/s]


[('How', 'to'),
 ('How', 'convert'),
 ('How', 'a'),
 ('to', 'How'),
 ('to', 'convert'),
 ('to', 'a'),
 ('convert', 'How'),
 ('convert', 'to'),
 ('convert', 'a'),
 ('a', 'How'),
 ('a', 'to'),
 ('a', 'convert'),
 ('to', 'convert'),
 ('to', 'a'),
 ('to', 'list'),
 ('convert', 'to'),
 ('convert', 'a'),
 ('convert', 'list'),
 ('a', 'to'),
 ('a', 'convert')]

In [15]:
len(data)

400836

In [29]:
from sklearn.preprocessing import LabelBinarizer
from scipy import sparse
from scipy.sparse import csr_matrix

In [54]:
encoder = LabelBinarizer(sparse_output=False)

In [55]:
one_hot_encoder = encoder.fit(list(intent_tokens))

In [56]:
len(one_hot_encoder.classes_)

3658

In [57]:
#Transform the input/output pairs:
intent_train_data = []
intent_train_target = []

for pair in tqdm(data[:1000]):
    intent_train_data.append(one_hot_encoder.transform([pair[0]]))
    intent_train_target.append(one_hot_encoder.transform([pair[1]]))

100%|██████████| 1000/1000 [00:04<00:00, 217.55it/s]


In [58]:
intent_train_data = np.squeeze(np.asarray(intent_train_data))
intent_train_target = np.squeeze(np.asarray(intent_train_target))

In [37]:
%time
# Pickle the data for use later, avoiding lengthy one-hot encoding again. 
# intent_train_data
with open('pickled_intent_train_data.pkl', 'wb+') as f:
    # source, destination 
    pickle.dump(intent_train_data, f)                      

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.81 µs


In [60]:
intent_train_data.shape

(1000, 3658)

In [62]:
# Cleaning the text
processed_intent_text = intent_text.lower()

AttributeError: 'list' object has no attribute 'lower'

In [None]:
processed_article = re.sub('[^a-zA-Z]', ' ', processed_article )
processed_article = re.sub(r'\s+', ' ', processed_article)

# Preparing the dataset
all_sentences = nltk.sent_tokenize(processed_article)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

# Removing Stop Words
from nltk.corpus import stopwords
for i in range(len(all_words)):
    all_words[i] = [w for w in all_words[i] if w not in stopwords.words('english')]

In [59]:
from gensim.models import Word2Vec
word2vec = Word2Vec(intent_train_data, min_count=5)

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U21'), dtype('<U21')) -> dtype('<U21')

In [50]:
vocabulary = word2vec.wv.vocab

In [38]:
# intent_train_target
with open('pickled_intent_train_target.pkl', 'wb+') as f:
    # source, destination 
    pickle.dump(intent_train_target, f)                      

Now set up the network. 

In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
model = keras.models.Sequential()

model.add(keras.layers.Dense(10, activation='relu'))

# Output Layer
model.add(keras.layers.Dense(num_intent_tokens, activation='softmax'))

model.compile(
    # Optimizer
    optimizer=keras.optimizers.Adam(),  
    # Loss function to minimize
    loss=keras.losses.CategoricalCrossentropy()
)

In [25]:
num_epochs = 1000

# Printout a single verbose fit operation 10 times throughout the training process.
for i in range(0, 10):
    model.fit(intent_train_data, intent_train_target, epochs=round(num_epochs/10)-1, verbose=0)
    
    print(f"Epoch: {(i+1)*round(num_epochs/10)}/{num_epochs}")
    model.fit(intent_train_data, intent_train_target, verbose=1)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type csr_matrix).

## PCA Dimension Reduction

In [None]:
from sklearn.decomposition import PCA

In [None]:
num_pcs = 200
# Instantiate
myPCA = PCA(n_components=200)
# Fit
myPCA.fit(df)

In [None]:
expl_var = myPCA.explained_variance_ratio_
expl_var_cumulative = myPCA.explained_variance_ratio_.cumsum()

In [None]:
plt.figure(figsize=(8,8))
plt.plot(range(num_pcs), expl_var_cumulative, color='cornflowerblue')
plt.title("Explained Variance by Number of PCs")
plt.xlabel("Number of PCs")
plt.ylabel("Explained Variance")
plt.show()

There's a diminishing return in the explained variance with respect to the number of PCs.
But it's heartening that the explained variance begins with a steep increase in variance explained. 
Let's take the explained variance thresholde to be 0.8

In [None]:
# Finding the number of PCs for 0.8 explained var.
num_PCs = np.argmax(expl_var_cumulative > 0.8)
num_PCs

In [None]:
# Refitting with 129 PCs

# Instantiate
myPCA = PCA(n_components=num_PCs)
# Fit
myPCA.fit(df)
# Transform
df_PC = myPCA.transform(df)

In [None]:
df_PC

In [None]:
%%time
from sklearn.cluster import AgglomerativeClustering
agg_clus = AgglomerativeClustering(n_clusters=3, linkage='average').fit(df_PC)

In [None]:
agg_clus.labels_

In [None]:
np.unique(agg_clus.labels_, return_counts=True)

In [None]:
from sklearn.metrics.cluster import silhouette_score

silhouette_score(df, agg_clus.labels_)

## TF-IDF

This did not accomplish anything meaningful.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer().fit(df)
df_tfidf = tfidf.transform(df)

df_tfidf = pd.DataFrame(columns=tfidf.get_feature_names(), data=df)
display(df_tfidf)

In [None]:
df_tfidf.sum()

In [None]:
df.sum()

So this vectorization is equivalent to the bag of words. Not necessary, and does not improve anything.