In [1]:
# !pip install spacy
# !pip install scikit-learn
# !python -m spacy download en_core_web_md

In [2]:
import pandas as pd
import spacy
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
# Shared spaCy pipeline used by every later cell, both for token attributes
# (is_stop / is_punct / like_num) and for the `.vector` of each processed text.
nlp = spacy.load("en_core_web_md")

In [3]:
# ONLY FOR USE IN GOOGLE COLABORATORY
# connects the notebook to the Google Drive account
# from google.colab import drive
# drive.mount('/content/drive/')

In [4]:
# Root folder for the files this notebook reads and writes; paths are built
# elsewhere as f'{BASE_FOLDER}/<file name>'.
# Google Colab alternative, pointing inside the mounted Drive:
# BASE_FOLDER = '/content/drive/My Drive/TFM'
BASE_FOLDER = './'

In [5]:
# Load the instruction texts as a plain Python list. Only the 'instruction'
# column is read from the parquet file (the other columns are never used
# here), and `corpus` is bound once to a list instead of first holding a
# DataFrame and then being rebound to a different kind of object.
corpus = pd.read_parquet(
    f'{BASE_FOLDER}/dataset_python_functions.parquet',
    columns=['instruction'],
)['instruction'].to_list()

In [6]:
def transform_sentence(sentence):
    """Return `sentence` with stop words, punctuation and number-like tokens removed.

    The text is tokenized with the module-level `nlp` pipeline; the surviving
    tokens keep their original text and order and are joined with single
    spaces.

    Args:
        sentence: Text to clean.

    Returns:
        The cleaned text, or an empty string when no token survives.
    """
    kept_words = (
        token.text
        for token in nlp(sentence)
        if not (token.is_stop or token.is_punct or token.like_num)
    )
    return " ".join(kept_words)

In [7]:
# Before/after preview: show a few raw instructions, clean every entry of the
# corpus (the same list object is updated in place), then show them again.
print(corpus[:5])
corpus[:] = [transform_sentence(raw_instruction) for raw_instruction in corpus]

print(corpus[:5])

['Create a function to calculate the sum of a sequence of integers.', 'Generate a Python code for crawling a website for a specific type of data.', 'Create a Python list comprehension to get the squared values of a list [1, 2, 3, 5, 8, 13].', 'Generate a python script to perform this action.', 'Write a python script to generates random numbers between 0 and 9 that are divisible by 3.']
['Create function calculate sum sequence integers', 'Generate Python code crawling website specific type data', 'Create Python list comprehension squared values list', 'Generate python script perform action', 'Write python script generates random numbers divisible']


In [8]:
# Summarize the whole corpus as a single vector: the element-wise mean of the
# document vectors of all cleaned instructions. nlp.pipe streams the texts
# through the pipeline in batches instead of invoking nlp() once per
# instruction, which is the recommended way to process many texts.
corpus_vec = np.mean([doc.vector for doc in nlp.pipe(corpus)], axis=0)
print(corpus_vec)

[-0.7960618  -0.2573387   1.5524931   0.41363308  2.688529    0.3146781
  1.8208108   2.2197366  -2.7233176  -0.54690075  5.3397055   1.9338576
 -4.2966914   2.4969664   0.37079632  0.68272454  2.397711    1.1503541
 -1.3532854   0.48721963 -1.1650857   1.1779038  -1.2810013   0.5372504
 -1.4514434  -0.49112925 -0.9705407  -1.8605627  -1.348806    1.174778
 -0.35029024  0.9432025  -1.1598158  -0.60816544  1.187353   -1.0960531
  0.43153036 -0.28287143  2.1484945   0.92216116  0.8986668  -1.1144468
 -1.6732911   0.8925951  -2.4947379   0.80841285  2.2919462  -1.288995
 -0.15612681 -1.7120885   0.36612856  1.9513457  -0.30400637 -2.2375906
 -2.4474888   2.0463588  -2.0988162   1.886121    1.017816   -1.2687927
  2.4162874   1.5328352  -3.0044312   0.5096481   2.026303    3.0498402
 -2.0525825  -3.8909523  -0.2624292   2.64655    -2.156691    0.40768492
 -1.0112213  -0.02407207 -0.16623355  1.5462584  -0.29660705 -0.02785358
 -0.4892479   1.1282624  -2.8511946  -1.108096   -0.14151023  1.

In [9]:
def calculate_context_of_sentence_in_corpus(corpus_vec, sentence):
    """Score how close `sentence` is to the corpus vector.

    The sentence is run through the module-level `nlp` pipeline and its
    document vector is compared with `corpus_vec` using cosine similarity.

    Args:
        corpus_vec: Vector representing the corpus.
        sentence: Text to score.

    Returns:
        Whatever `cosine_similarity` returns for the single
        (corpus_vec, sentence vector) pair -- a 1x1 array in this notebook.
    """
    sentence_vec = nlp(sentence).vector
    # Each vector is wrapped in a list so it is passed as a one-row collection.
    return cosine_similarity([corpus_vec], [sentence_vec])

In [10]:
# Programming-related request: clean it the same way as the corpus entries,
# then score it against the corpus vector.
sentence = transform_sentence("Make a function to return fibonnaci sequence")
calculate_context_of_sentence_in_corpus(corpus_vec, sentence)

array([[0.7882325]], dtype=float32)

In [11]:
# Request unrelated to programming: clean it the same way as the corpus
# entries, then score it against the corpus vector for comparison.
sentence = transform_sentence("Make a sandwich with cheese, tomatoes and lettuce")
calculate_context_of_sentence_in_corpus(corpus_vec, sentence)

array([[0.14083377]], dtype=float32)

In [12]:
# Persist the corpus vector with pickle so it can be loaded again without
# recomputing it from the dataset.
corpus_vec_path = f"{BASE_FOLDER}/sentences_vector_processed.pickle"
with open(corpus_vec_path, "wb") as pickle_file:
    pickle.dump(corpus_vec, pickle_file)