In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published under the "Salesforce/codet5p-220m" checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5p-220m")

# Encode one sample docstring, then report how many token ids it produced and
# what the ids decode back to.
sample_docstring = " Predicts the model for the given image. Args: X_img_path: Path to the image to predict. knn_clf: The classifier to use. model_path: Path to the model to use. distance_threshold: The maximum number of times the model is found in the image. Returns: A list of the classifiers that are not within the threshold."
inputs = tokenizer.encode(sample_docstring, truncation=True)
token_count = len(inputs)
print(f" len inputs: {token_count}")
print(tokenizer.decode(inputs))

In [None]:
import datasets
import matplotlib.pyplot as plt
import os

# Prefer an on-disk copy of the dataset; otherwise download it.
dataset_folder = "../data/python_dataset.dse"
if not os.path.exists(dataset_folder):
    print("loading from dataset from huggingface")
    docstrings = datasets.load_dataset("juraj-juraj/doc_gen")["train"]
else:
    print("loading local dataset")
    docstrings = datasets.load_from_disk(dataset_folder)["train"]

# Character length of every training docstring; lengths of 2000 or more are
# left out so the long tail does not flatten the histogram.
docstring_lengths = [
    length for length in map(len, docstrings["docstring"]) if length < 2000
]

plt.hist(docstring_lengths, bins=30, edgecolor='black')
plt.title('Histogram of Docstring Lengths')
plt.xlabel('Docstring Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
import datasets
import pandas as pd

# Fetch all splits of the dataset; only the training split is converted to a
# DataFrame here, the other splits are converted in a later cell.
dataset = datasets.load_dataset("juraj-juraj/doc_gen")
raw_train_split = dataset["train"]
train_dataset = pd.DataFrame.from_dict(raw_train_split)


In [None]:
def filter_lengths(dataset: pd.DataFrame, lower_bound: int = 50, high_bound: int = 500):
    """Keep rows whose ``docstring`` length lies strictly between the bounds.

    Args:
        dataset: Frame containing a string column named ``docstring``.
        lower_bound: Exclusive minimum docstring length, in characters.
        high_bound: Exclusive maximum docstring length, in characters.

    Returns:
        The rows of ``dataset`` for which
        ``lower_bound < len(docstring) < high_bound``. Row labels are kept
        as they were; the index is not reset.
    """
    lengths = dataset["docstring"].str.len()
    within_bounds = lengths.gt(lower_bound) & lengths.lt(high_bound)
    return dataset[within_bounds]

# Keep only training rows whose docstring length falls inside the default
# (50, 500) character window of filter_lengths.
train_dataset = filter_lengths(train_dataset)
# drop=True discards the old row labels instead of inserting them as a new
# column. Without it every re-run of this cell adds another index column
# ("index", then "level_0") and eventually raises ValueError.
train_dataset = train_dataset.reset_index(drop=True)

# Every remaining docstring is already shorter than 500 characters, so no
# extra upper cut-off is needed before plotting.
docstring_lengths = train_dataset["docstring"].str.len().tolist()


plt.hist(docstring_lengths, bins=30, edgecolor='black')
plt.xlabel('Docstring Length')
plt.ylabel('Frequency')
plt.title('Histogram of Docstring Lengths')
plt.show()

In [None]:


# Apply the same docstring-length window to every split through the shared
# helper, so the bounds (defaults: longer than 50, shorter than 500
# characters) live in one place instead of being repeated per split.
train_dataset = filter_lengths(train_dataset)

eval_dataset = filter_lengths(pd.DataFrame.from_dict(dataset["validation"]))

test_dataset = filter_lengths(pd.DataFrame.from_dict(dataset["test"]))


In [None]:
# Preview the two columns that will be exported, on a re-indexed copy.
# reset_index already returns a new frame, so no explicit copy is needed, and
# drop=True keeps the preview working even when train_dataset already carries
# "index"/"level_0" columns from earlier resets.
s = train_dataset.reset_index(drop=True)
s[["docstring", "function"]]

In [None]:
# Renumber the rows of each filtered split from 0. drop=True throws the old
# labels away rather than storing them in a new column, which makes this cell
# safe to re-run: a plain reset_index() raises ValueError once both "index"
# and "level_0" columns exist.
train_dataset = train_dataset.reset_index(drop=True)
eval_dataset = eval_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

In [None]:
from datasets import Dataset, DatasetDict

# Convert each filtered frame to a Dataset, keeping only the two columns that
# are exported.
ds_train, ds_validation, ds_test = (
    Dataset.from_pandas(frame[["docstring", "function"]])
    for frame in (train_dataset, eval_dataset, test_dataset)
)

# Show the training docstrings as a quick sanity check.
ds_train["docstring"]


In [None]:
# Bundle the three filtered splits under their split names and write the
# result to ../docstring_len_filtered.ds.
dataset_dict = DatasetDict({"train": ds_train, "validation": ds_validation, "test": ds_test})
dataset_dict.save_to_disk("../docstring_len_filtered.ds")

In [None]:
# Read the saved dataset back from disk and display the training docstrings
# to confirm the save round-trips.
dataset_dict = datasets.load_from_disk("../docstring_len_filtered.ds")
reloaded_train = dataset_dict["train"]
reloaded_train["docstring"]