In [3]:
import re
import os
import sys
sys.path.insert(0,'..')
from parse_utils import *
from yaml_file_cls import yaml_file

In [63]:
class parser:
    """Turn one raw scikit-learn API doc dump into a saved ``yaml_file`` record.

    ``content`` is indexed (``content[0]`` is treated as the signature line)
    and joined with ``''.join`` for section lookup, so it is presumably the
    list of lines returned by ``read_file`` -- TODO confirm in ``parse_utils``.

    Every recoverable failure is reported by raising ``GoToTheNextOne`` so the
    caller can skip to the next file.
    """

    def __init__(self, fname, content):
        """Remember the raw content and create the empty record for ``fname``.

        The last dotted component of ``fname`` is used as the API name.
        """
        self.data = yaml_file(title=fname, api_name=fname.split('.')[-1], url='', package='scikit-learn', version='0.24.X')
        self.content = content
        self.fname = fname

    def parse_sig(self):
        """Parse the signature line and seed ``self.data`` with its arguments.

        The first line is expected to look like ``<full.name> = <name>(<args>)``.

        Returns:
            True when ``self.data.init_input`` accepts the parsed arguments.

        Raises:
            GoToTheNextOne: the first line is missing, is not a signature, or
                ``init_input`` returned a falsy value.
            AssertionError: the names in the signature line disagree with
                ``self.fname``.
        """
        def process_sig(parsed_sig):
            # Drop placeholder entries: empty / whitespace-only keys and a literal '...'.
            return {
                key: parsed_sig[key]
                for key in parsed_sig
                if not (key == '' or key.isspace() or key == '...')
            }

        # Empty content has no signature line; fall through to the normal skip
        # path instead of raising IndexError.
        first_line = self.content[0] if self.content else ''
        m = re.match(r'(.*?)\s=\s([\w_.]+)\((.*?)\)$', first_line)
        if m:
            # Compare against this instance's own name, not a notebook-level
            # variable that merely happens to be called `fname`.
            assert self.fname == m.group(1)
            assert self.fname.split('.')[-1] == m.group(2)
            parsed_sig = process_sig(parse_input(m.group(3)))
            if self.data.init_input(parsed_sig):
                return True

        raise GoToTheNextOne('' , self.fname, '[Sig] Fail to parse signature', save=True)

    def get_sect(self, sect):
        """Return the body of the section titled ``sect``.

        A section is a title line followed by a ``---`` underline; its body
        runs up to the next underlined title (a single word or "See also")
        or to the end of the text.

        Note: ``sect`` is interpolated into the pattern unescaped, so regex
        metacharacters in it are interpreted as such.

        Raises:
            GoToTheNextOne: no such section was found.
        """
        rule = r'\n\s*{}\n\s*---+\n(.*?)(\n\s*(\w+|See [aA]lso)\n\s*---+\n|$)'.format(sect)
        m = re.search(rule, ''.join(self.content), flags=re.DOTALL)
        if m:
            return m.group(1)
        raise GoToTheNextOne('' , self.fname, '[{}] Fail to parse {} section'.format(sect, sect), save=True)

    def descp_pre_process(self,descp):
        """Replace ``.. versionchanged::`` / ``.. versionadded::`` notes with a blank separator.

        Each note is removed up to the next whitespace-only line (or the end
        of the text) and replaced by ``'\\n \\n'``, which ``parse_descp``
        treats as a separator between entries.
        """
        return re.sub(r'\.\.\s+version(changed|added)::.*?(\n\s+\n|$)', '\n \n', descp, flags=re.DOTALL)

    def descp_post_process(self,descp):
        """Strip leading whitespace and fold every line break into a single space."""
        return re.sub(r'\s*\n\s*', ' ', descp.lstrip())

    def update_descp(self, descp_dict):
        """Push every parsed description into ``self.data``.

        Raises:
            GoToTheNextOne: ``descp_dict`` is empty.
        """
        if not descp_dict:
            raise GoToTheNextOne('' , self.fname, '[Descp] Fail to parse descp section (empty return)', save=True)

        for arg in descp_dict:
            self.data.update_constraint(arg, descp_dict[arg], allow_inconsistent_when_kwargs=False, ignore_star=False)

    def parse_descp(self, raw_descp):
        """Split a Parameters section into ``{name: description}``.

        Entries are separated by whitespace-only lines and must each start
        with ``name :``. As soon as one chunk does not, the whole
        (pre-processed) text is handed to ``parse_descp2`` instead.
        """
        ret = {}
        raw_descp = self.descp_pre_process(raw_descp)
        for a in re.split(r'\n\s+\n', raw_descp):
            if not a:
                continue
            m = re.match(r'^\s*([\w_\*]+)\s*:(.*?)$', a, flags=re.DOTALL)
            if not m:
                # A chunk without a "name :" header usually means a blank line
                # inside one description; fall back to name-driven splitting.
                return self.parse_descp2(raw_descp)

            varname = m.group(1)
            descp = self.descp_post_process(m.group(2))
            ret[varname] = descp

        return ret

    def parse_descp2(self, raw_descp):
        """Split a Parameters section using the argument names already known.

        Matches the descriptions by the args recorded in
        ``self.data.data['constraints']``. This cannot detect inconsistencies
        between signature and docs, but it copes with blank lines or items
        inside a parameter description.

        Raises:
            GoToTheNextOne: the text does not split into exactly one
                name/description pair per known argument.
        """
        ret = {}
        arg_list = list(self.data.data['constraints'].keys())
        pattern = r'(^|\n\s+\n)\s+({})\s+:(.*?)'.format(get_bigrex(arg_list, boundary=False, escape=True))
        # re.split also returns the captured groups; once the empty and
        # whitespace-only pieces are dropped, what remains should alternate
        # name, description, name, description, ...
        non_space_seg = [
            seg
            for seg in re.split(pattern, raw_descp, flags=re.DOTALL)
            if seg and not seg.isspace()
        ]
        # Explicit check rather than assert + bare except, so it still runs
        # under `python -O` and cannot swallow unrelated errors.
        if len(non_space_seg) != 2*len(arg_list):
            raise GoToTheNextOne('' , self.fname, '[SPEC_Descp] Fail to parse descp section', save=True)

        for i in range(0, len(non_space_seg), 2):
            varname = non_space_seg[i]
            descp = self.descp_post_process(non_space_seg[i+1])
            ret[varname] = descp

        return ret

    def parse(self, folder):
        """Run the full pipeline for this file and save the record into ``folder``."""
        self.parse_sig()
        param_str = self.get_sect('Parameters')
        descp_dict = self.parse_descp(param_str)
        self.update_descp(descp_dict)
        self.data.save_file(folder, filename = self.fname)
        

In [64]:
# Root of the collected scikit-learn docs. Defaults to the original author's
# checkout; set SKLEARN_DOC_ROOT to run the notebook on another machine.
_doc_root = (
    os.environ.get('SKLEARN_DOC_ROOT')
    or '/Users/danning/Desktop/deepflaw/exp2/code/dl-fuzzer/doc_analysis/collect_doc/scikitlearn'
).rstrip('/')
src_path = _doc_root + '/raw/'      # input files read by the driver loop
dst_path = _doc_root + '/parsed/'   # folder handed to parser.parse for saving


In [65]:

# Convert every raw doc file into the output folder.
# del_file presumably clears earlier output first -- confirm in parse_utils.
del_file(dst_path)
for fname in get_file_list(src_path):
    raw_file = os.path.join(src_path, fname)
    try:
        p = parser(fname, content=read_file(raw_file))
        p.parse(dst_path)
    except GoToTheNextOne as skipped:
        # Only failures flagged with `save` are reported; the rest are skipped silently.
        if not skipped.save:
            continue
        print(fname+': '+skipped.msg)


sklearn.utils.Bunch: [Sig] Fail to parse signature
sklearn.pipeline.make_pipeline: arg memory doesn't exist
sklearn.utils.arrayfuncs.min_pos: [Sig] Fail to parse signature
sklearn.utils.estimator_checks.check_estimator: arg estimator doesn't exist
sklearn.manifold.locally_linear_embedding: arg arpack doesn't exist
sklearn.compose.make_column_transformer: arg transformer doesn't exist
sklearn.metrics.pairwise.distance_metrics: [Sig] Fail to parse signature
sklearn.utils.sparsefuncs_fast.inplace_csr_row_normalize_l1: [Sig] Fail to parse signature
sklearn.datasets.load_sample_images: [Sig] Fail to parse signature
sklearn.config_context: arg assume_finite doesn't exist
sklearn.show_versions: [Sig] Fail to parse signature
sklearn.utils.parallel_backend: [Sig] Fail to parse signature
sklearn.linear_model.PassiveAggressiveRegressor: [Sig] Fail to parse signature
sklearn.utils.graph_shortest_path.graph_shortest_path: [Sig] Fail to parse signature
sklearn.metrics.silhouette_samples: [SPEC_Descp

In [46]:
# Sample argument names, including a starred one, used to inspect the
# alternation get_bigrex builds when escaping is on and \b boundaries are off.
arg_list = ['score_func', 'greater_is_better', 'needs_proba', 'needs_threshold', '**kwargs']
get_bigrex(arg_list, boundary=False, escape=True)

'score_func|greater_is_better|needs_proba|needs_threshold|\\*\\*kwargs'

In [44]:
# Probe: a \b-wrapped '**kwargs' does not match here. re.match anchors at
# index 0, where the input has a space, so the leading \b cannot hold and the
# result is None.
re.match(r'\b\*\*kwargs\b', ' **kwargs')

In [47]:
# fname = 'sklearn.metrics.make_scorer'
fname = 'sklearn.model_selection.cross_val_predict'
p = parser(fname, content = read_file(os.path.join(src_path, fname)))
# print(p.data.data)
# p.parse(dst_path)
p.parse_sig()
s= p.get_sect('Parameters')
prettyprint(p.parse_descp(s))

{ 'X': 'array-like The data to fit. Can be, for example a list, or an array at '
       'least 2d.',
  'cv': 'int, cross-validation generator or an iterable, optional Determines '
        'the cross-validation splitting strategy. Possible inputs for cv are: '
        '- None, to use the default 5-fold cross validation, - integer, to '
        'specify the number of folds in a `(Stratified)KFold`, - :term:`CV '
        'splitter`, - An iterable yielding (train, test) splits as arrays of '
        'indices. For integer/None inputs, if the estimator is a classifier '
        'and ``y`` is either binary or multiclass, :class:`StratifiedKFold` is '
        'used. In all other cases, :class:`KFold` is used. Refer :ref:`User '
        'Guide <cross_validation>` for the various cross-validation strategies '
        'that can be used here.',
  'estimator': "estimator object implementing 'fit' and 'predict' The object "
               'to use to fit the data.',
  'fit_params': 'dict, optional Pa

In [None]:
# Show the raw Parameters section text held in `s`.
s

In [7]:
def get_bigrex(sep, boundary=True, escape=True):
    """Build one regex alternation out of the strings in ``sep``.

    Args:
        sep: iterable of strings to alternate between.
        boundary: wrap every alternative in ``\\b`` word boundaries.
        escape: pass every string through ``re.escape`` first.

    Returns:
        The alternatives joined with ``|`` (each surrounded by ``\\b`` when
        ``boundary`` is true).
    """
    alternatives = map(re.escape, sep) if escape else sep
    if boundary:
        return r'\b' + r'\b|\b'.join(alternatives) + r'\b'
    return r'|'.join(alternatives)



In [None]:
# Inspect the alternation built from `args` with escaping and word boundaries both enabled.
get_bigrex(args, boundary=True, escape=True)

In [54]:
# Rebuild the inputs parser.parse_descp2 works on, for the parser currently
# bound to `p`: the pre-processed Parameters text and the argument names
# recorded under the 'constraints' key.
raw_descp = p.get_sect('Parameters')
raw_descp = p.descp_pre_process(raw_descp)
args = list(p.data.data['constraints'].keys())

In [55]:
# Replay the split used by parser.parse_descp2 and print every piece that is
# not empty or whitespace-only, followed by a separator line.
# The final count is what parse_descp2 compares against 2 * number of arguments.
m = re.split(r'(^|\n\s+\n)\s+({})\s+:(.*?)'.format(get_bigrex(args, boundary=False, escape=True)), raw_descp, flags=re.DOTALL)
cnt = 0
for line in m:
    if line and not line.isspace():
        cnt+=1
        print(line)
        print('**********************')
print(cnt)

estimator
**********************
 estimator object implementing 'fit' and 'predict'
        The object to use to fit the data.
**********************
X
**********************
 array-like
        The data to fit. Can be, for example a list, or an array at least 2d.
**********************
y
**********************
 array-like, optional, default: None
        The target variable to try to predict in the case of
        supervised learning.
**********************
groups
**********************
 array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into
        train/test set. Only used in conjunction with a "Group" :term:`cv`
        instance (e.g., :class:`GroupKFold`).
**********************
cv
**********************
 int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
    
        - None, to use the default 5-fold cross validati

In [56]:
# Number of argument names currently held in `args`.
len(args)

10

In [48]:
# Hand-written argument list for the split experiments; this rebinds `args`.
args = ['estimator', 'X', 'y', 'groups', 'scoring', 'cv', 'n_jobs', 'verbose', 'fit_params', 'pre_dispatch', 'error_score']

In [None]:
# Number of argument names currently held in `args`.
len(args)

In [None]:
# Prototype of parser.parse_descp: split the Parameters text in `s` on
# whitespace-only lines and print each "name : description" pair.
for a in re.split(r'\n\s+\n', s):
    m = re.match(r'^\s*([\w_]+)\s*:(.*?)$', a, flags=re.DOTALL)
    if m is None:
        # A chunk without a leading "name :" header (for example an empty
        # piece produced by re.split) would otherwise fail on m.group with
        # AttributeError; report it and move on to the next chunk.
        print('[unparsed] ' + repr(a))
        print('\n')
        continue

    varname = m.group(1)
    descp = m.group(2)
    # Fold the multi-line description into a single line.
    descp = re.sub(r'\s*\n\s*', ' ', descp.lstrip())
    print(varname)
    print(descp)
    print('\n')

In [None]:
# Show the chunks produced by splitting `s` on whitespace-only lines
# (the same separator pattern parser.parse_descp uses).
re.split(r'\n\s+\n', s)

In [None]:
# Alternative experiment: find "name :" headers directly with findall.
# The trailing lazy (.*?) has nothing after it to force it to grow, so it captures an empty string.
re.findall(r'((\n\s+\n|^)\s*([\w_]+)\s*:(.*?))', s, flags=re.DOTALL)