In [None]:
import datasets

# Load the "juraj-juraj/doc_gen" dataset; later cells read its "train" and
# "validation" splits. Presumably fetched from the Hugging Face Hub (or a
# local cache) -- confirm before running offline.
docstrings = datasets.load_dataset("juraj-juraj/doc_gen")

In [None]:
# Show the "docstring" field of validation record 300; the same record is
# used below as the trial input for the prototype grammar.
print(docstrings["validation"][300]["docstring"])

In [None]:
from lark import Lark

# Prototype grammar: one or more free-text LINE tokens, then one or more
# sections, each made of a WORD heading, a dashed underline and SENTENCE
# tokens. Spaces and tabs are ignored.
parser = Lark(
    r"""
start: LINE+ "\n"* parameters "\n"*

parameters.1: section+

section.1: WORD "\n" "-"+ "\n"+ SENTENCE+ ["\n"]
SENTENCE: /./+ ["\n"]
LINE: /(.[^-])+/"\n"


// imports WORD from library
%import common.WORD   

// Disregard spaces in text
%ignore " "
%ignore "\t" """
)

# Try the grammar on a single validation docstring and report the outcome.
# The lookup stays inside the try so that any failure is printed, not raised.
try:
    sample_docstring = docstrings["validation"][300]["docstring"]
    parsed = parser.parse(text=sample_docstring)
except Exception as parse_error:
    print(parse_error)
else:
    print("parsed !")

In [None]:
from dataclasses import dataclass, field
from typing import Protocol


class GrammarFilterI(Protocol):
    """Structural interface for objects that accept or reject a comment.

    Implementations expose the same check twice: as ``parse`` and by being
    callable directly.
    """

    def parse(self, comment: str) -> bool:
        """Return whether ``comment`` is accepted."""

    def __call__(self, comment: str) -> bool:
        """Return whether ``comment`` is accepted when the object is called."""

@dataclass(slots=True)
class GrammarFilter:
    """Predicate that reports whether a text parses under a Lark grammar.

    Attributes are documented inline; the Lark parser is built once from
    ``grammar`` right after initialisation.
    """

    # Lark grammar source used to build the parser.
    grammar: str
    # Built in __post_init__, so it is not a constructor argument.
    parser: Lark = field(init=False)

    def __post_init__(self) -> None:
        """Build the Lark parser from the stored grammar text."""
        self.parser = Lark(self.grammar)

    def parse(self, comment: str) -> bool:
        """Return True if ``comment`` parses, False if parsing raises.

        Any ``Exception`` raised while parsing is treated as a rejection, so
        this method itself does not propagate parse errors.
        """
        try:
            self.parser.parse(text=comment)
        except Exception:
            return False
        return True

    def __call__(self, comment: str) -> bool:
        """Delegate to :meth:`parse` so the instance works as a predicate."""
        return self.parse(comment)

# Filter for numpydoc-looking docstrings. The grammar expects one or more
# free-text LINE tokens followed by one or more sections; a section is a WORD
# heading, a newline, a run of dashes, and then SENTENCE tokens. Spaces and
# tabs are ignored by the grammar.
# NOTE(review): this grammar text duplicates the one given to `parser` in an
# earlier cell -- keep the two in sync if either changes.
numpydoc_parser = GrammarFilter(grammar=r"""
start: LINE+ "\n"* parameters "\n"*

parameters.1: section+

section.1: WORD "\n" "-"+ "\n"+ SENTENCE+ ["\n"]
SENTENCE: /./+ ["\n"]
LINE: /(.[^-])+/"\n"


// imports WORD from library
%import common.WORD   

// Disregard spaces in text
%ignore " "
%ignore "\t" """)


In [None]:
# Keep only the validation rows whose docstring the numpydoc filter accepts.
df_docstrings = docstrings["validation"].to_pandas()
is_numpydoc = [numpydoc_parser(text) for text in df_docstrings["docstring"]]
numpydoc_dataset = df_docstrings.iloc[is_numpydoc]


# Spot-check the fourth accepted docstring.
print(numpydoc_dataset.iloc[3]["docstring"])

In [None]:
import random
import time
import datasets

#docstrings = datasets.load_from_disk("../data/googlestyle_dataset_processed_2.ds")
docstrings = datasets.load_dataset("juraj-juraj/doc_gen")

print(f"len docstrings: {len(docstrings['train'])}")
# Select the docstring column directly rather than slicing every row first.
train_data = docstrings["train"]["docstring"]

# `random.seed` is a function, so formatting it only printed its repr and the
# run could not be reproduced. Create the seed explicitly, show it, then apply
# it: passing the printed value to random.seed() replays the same sample.
sample_seed = time.time()
print(f"random seed: {sample_seed}")
random.seed(sample_seed)

# Print 20 randomly chosen training docstrings (repeats are possible).
for _ in range(20):
    print(random.choice(train_data), end="\n\n------------------\n\n")
    

In [4]:
import pandas as pd

# NOTE: unpickling can execute arbitrary code -- only load this file if it
# comes from a trusted source.
raw_data = pd.read_pickle("../data/unannotated_functions.pkl")

# Shuffle the rows. Each reset_index() keeps the previous index as a column,
# exactly as the former in-place calls did.
df = pd.DataFrame(raw_data).reset_index().sample(frac=1).reset_index()

# Append five shuffled functions to the evaluation corpus. Every function is
# followed by its own blank-line separator, so a later append can never start
# on the last line of the previously written function.
with open("../evaluation/corpus.py", mode="a", encoding="utf-8") as f:
    f.writelines(f"{source}\n\n" for source in df.iloc[0:5]["function"])


In [10]:
# Pick random functions (with their docstrings) from the dataset to build the evaluation corpus

import datasets

data = datasets.load_from_disk("../data/googlestyle_dataset_processed_2.ds")

# Shuffle the validation split, keep the old index as a column, and take the
# first 120 rows.
df = data["validation"].to_pandas().sample(frac=1).reset_index().iloc[0:120]

# Write each sampled function preceded by its docstring in triple quotes.
with open("../evaluation/corpus_2.py", mode="w") as f:
    for doc_text, func_src in zip(df["docstring"], df["function"]):
        f.write(f"\"\"\"{doc_text}\n\"\"\"\n{func_src} \n\n")
