## Install Dependencies

In [None]:
#@title install srcml
!wget https://github.com/srcML/srcMLReleases/raw/main/srcml_1.0.0-1_ubuntu20.04.deb
!sudo apt install ./srcml_1.0.0-1_ubuntu20.04.deb

In [None]:
#@title install src2abs
!git clone https://github.com/micheletufano/src2abs.git
!sudo apt-get install maven
%cd src2abs
!mvn clean
!mvn install:install-file -Dfile="lib/javalexer.jar" -DgroupId="edu.wm.cs" -DartifactId="javalexer" -Dversion="1" -Dpackaging="jar"
!mvn package
!mv target/src2abs-0.1-jar-with-dependencies.jar /content/src2abs-0.1-jar-with-dependencies.jar
%cd /content/


## Parameters

In [None]:
#@title Paths parameters
# NOTE: keep the trailing '/' on the two directory paths below. The path
# generation cell strips raw_files_dataset_path as a plain string prefix, and the
# parsing cell appends to functions_dataset_path without adding a separator.
#@markdown Path to the dataset with source code files
raw_files_dataset_path = '/content/raw_dataset/' #@param {type:"string"}
#@markdown ---
#@markdown Path of the CSV file that stores the path of every file to abstract.
output_paths_dataframe = '/content/functions_dataset_paths.csv' #@param {type:"string"}
#@markdown ---
#@markdown Path to store the abstracted functions
# NOTE(review): in the cells of this notebook, this value is only read by the
# "already exists" check of the parsing loop; the generated files are written
# under raw_files_dataset_path. Confirm which location is intended.
functions_dataset_path = '/content/functions_dataset/' #@param {type:"string"}

## Run

In [None]:
#@title Splitting parameters. Used to split the functions dataset generation between multiple colab instances (optional)

#@markdown Number of parts the list of paths is divided into (1 = no splitting)
split_amount = 1 #@param {type:"slider", min:1, max:10}
#@markdown ---
#@markdown Part to be used for the current instance of colab (from 0 to split_amount excluded)
current_part = 0 #@param {type:"integer"}



In [None]:
#@title generate and save paths
import os
import pandas as pd
# Build the list of files to process only once; later runs reuse the cached CSV.
if not os.path.exists(output_paths_dataframe):
  paths = []
  for root, subdirs, files in os.walk(raw_files_dataset_path):
    # Path of the current directory relative to the dataset root. relpath gives
    # the same result whether or not raw_files_dataset_path ends with '/'.
    new_root = os.path.relpath(root, raw_files_dataset_path)
    if new_root == os.curdir:
      new_root = ''
    # Only the "post" version of each file is listed; directories whose relative
    # path contains one of these markers are skipped.
    if 'pre_patch' in new_root or 'pre_vulnerability_inducing' in new_root or '/new/' in new_root:
      continue
    for f in files:
      if f == 'last_commit.txt':
        continue
      paths.append(os.path.join(new_root, f))

  # os.walk yields entries in filesystem order, which may differ from one machine
  # to another. Sorting makes the row order (and therefore the partitions taken
  # by the splitting cell) identical on every instance.
  paths.sort()

  df = pd.DataFrame(paths)
  df.to_csv(output_paths_dataframe, index=False)


In [None]:
#@title load paths
# Read the list back from the CSV; when the file already existed, the generation
# cell above did not rebuild it, so this loads the cached list.
pathsdf = pd.read_csv(output_paths_dataframe)


In [None]:
#@title splitting (for multiple colab instances)
import numpy as np

# Fail early with a clear message instead of an IndexError on the split result.
if not 0 <= current_part < split_amount:
  raise ValueError('current_part must be in [0, %s), got %s' % (split_amount, current_part))

# Re-read the complete list so that running this cell again never splits an
# already split frame.
all_paths = pd.read_csv(output_paths_dataframe)
# Split row positions rather than the DataFrame itself: np.array_split applied to
# a DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
part_positions = np.array_split(np.arange(len(all_paths)), split_amount)[current_part]
pathsdf = all_paths.iloc[part_positions]

In [None]:
#@title run parsing
import os
import subprocess
import xml.etree.ElementTree as ET
from pathlib import Path

srcML = {'s':'http://www.srcML.org/srcML/src'}

def function_generation_raw(function_txt, new_root, function_name, f):
  """Write the text of one function to the 'raw' output tree.

  The destination is
  <raw_files_dataset_path>/raw/<new_root>/<stem of f>_func_<function_name><extension of f>;
  missing parent directories are created.

  Args:
    function_txt: bytes to store (the file is opened in binary mode).
    new_root: path of the destination directory, relative to the 'raw' root.
    function_name: label placed after '_func_' in the file name.
    f: original file name, providing the stem and the extension.

  Returns:
    The path of the written file.
  """
  # NOTE(review): the output goes under raw_files_dataset_path, not under
  # functions_dataset_path -- confirm this is the intended location.
  stem, extension = os.path.splitext(f)
  output_name = '%s_func_%s%s' % (stem, function_name, extension)
  output_path = os.path.join(raw_files_dataset_path, 'raw', new_root, output_name)

  Path(output_path).parent.mkdir(parents=True, exist_ok=True)
  with open(output_path, 'wb') as handle:
    handle.write(function_txt)
  return output_path

def function_generation_abs(raw_1, raw_2):
  """Abstract a pair of raw function files with src2abs ('pair' mode).

  Each output path is the matching input path with every '/raw/' replaced by
  '/abstract/'; the output directories are created when missing. The jar and the
  idioms file are addressed relative to the current working directory. A
  non-zero exit status is reported on stdout and not raised.

  Args:
    raw_1: path of the first raw function file.
    raw_2: path of the second raw function file.
  """
  abstract_paths = [raw_path.replace('/raw/', '/abstract/') for raw_path in (raw_1, raw_2)]
  for abstract_path in abstract_paths:
    Path(abstract_path).parent.mkdir(parents=True, exist_ok=True)

  command = ['java', '-jar', 'src2abs-0.1-jar-with-dependencies.jar', 'pair', 'method']
  command += [raw_1, raw_2, *abstract_paths, 'src2abs/idioms/idioms.csv']
  completed = subprocess.run(command)
  if completed.returncode != 0:
    print('ERROR', raw_1, raw_2, *abstract_paths, completed.returncode)

def function_generation_abs_single(raw):
  """Abstract a single raw function file with src2abs ('single' mode).

  The output path is `raw` with every '/raw/' replaced by '/abstract/'; its
  directory is created when missing. A non-zero exit status is reported on
  stdout and not raised.

  Args:
    raw: path of the raw function file.
  """
  target = raw.replace('/raw/', '/abstract/')
  Path(target).parent.mkdir(parents=True, exist_ok=True)

  command = ['java', '-jar', 'src2abs-0.1-jar-with-dependencies.jar', 'single', 'method']
  command += [raw, target, 'src2abs/idioms/idioms.csv']
  completed = subprocess.run(command)
  if completed.returncode != 0:
    print('ERROR', raw, target, completed.returncode)

def parse_file(root, file_type, functions):
  """Parse one version of a source file with srcML and record its functions.

  The file to parse is `root` with every 'vulnerability_inducing' replaced by
  `file_type`. Each function element that is a direct child of the srcML root is
  stored as functions[<function name>][file_type]. When several functions share
  a name, the last one wins.

  Args:
    root: path of the 'vulnerability_inducing' version of the file.
    file_type: version label; substituted into the path and used as the key.
    functions: dict updated in place, {function name: {file_type: element}}.

  Raises:
    subprocess.CalledProcessError: if srcml exits with a non-zero status.
  """
  source_path = root.replace('vulnerability_inducing', file_type)
  xmlcontent = subprocess.check_output(['srcml', source_path])
  tree = ET.fromstring(xmlcontent)
  # NOTE(review): 's:function' only matches direct children of the root element;
  # functions nested in other elements are not visited -- confirm this is intended.
  for function in tree.iterfind('s:function', srcML):
    name_element = function.find('s:name', srcML)
    if name_element is None:
      # Without a name the function cannot be matched across versions.
      print('skipping unnamed function in %s' % source_path)
      continue
    # A name built from nested <name> elements has no text of its own, so
    # `.text` would be None and all such functions would share one key.
    # Joining the descendant text yields the full name.
    function_name = ''.join(name_element.itertext())
    functions.setdefault(function_name, {})[file_type] = function

def getfunction_txt(versions, version_type):
  """Return the source text of one version of a function as UTF-8 bytes.

  Args:
    versions: dict mapping a version label to the function's XML element.
    version_type: label of the requested version.

  Returns:
    The concatenated text of the element and its descendants, encoded as UTF-8,
    or None when `versions` has no entry for `version_type`.
  """
  version = versions.get(version_type)
  if version is None:
    return None
  # ET.tostring(..., method='text') also appends the element's tail, i.e. the
  # whitespace that follows the function in the file. That text is not part of
  # the function and made otherwise identical versions compare as different.
  return ''.join(version.itertext()).encode('utf-8', 'xmlcharrefreplace')

# Versions of every listed file that are parsed and compared.
VERSION_TYPES = ('vulnerability_inducing', 'pre_vulnerability_inducing')

# For every listed file: extract its functions for each version, write them to
# the 'raw' tree and abstract them with src2abs.
for row in pathsdf.iterrows():
  # iterrows() yields (index, Series); the relative path is the first column.
  new_root = row[1].values[0]
  # Entries are skipped when the relative path, minus its last two components,
  # already exists under functions_dataset_path.
  check_output_dir = functions_dataset_path+'/'.join(new_root.split('/')[:-2])
  if os.path.exists(check_output_dir):
    print('%s already exists' % check_output_dir)
    continue
  print(new_root)
  # {function name: {version type: XML element}}, filled in by parse_file.
  functions = {}
  for version_type in VERSION_TYPES:
    parse_file(os.path.join(raw_files_dataset_path,new_root), version_type, functions)
  file_name = os.path.basename(new_root);
  for function_name in functions:
    versions = functions[function_name]
    # Report functions that were not found in every parsed version.
    if len(versions) < len(VERSION_TYPES):
      print('%s is missing in %s' % (function_name, str([version_type for version_type in VERSION_TYPES if version_type not in versions ])))

    # NOTE(review): VERSION_TYPES contains no 'patch' / 'pre_patch' entry, so
    # with the parsing done above these two lookups return None and the patch
    # branches below write nothing -- confirm whether that is intended.
    patch = getfunction_txt(versions,'patch')
    pre_patch = getfunction_txt(versions,'pre_patch')
    vulnerability_inducing = getfunction_txt(versions,'vulnerability_inducing')
    pre_vulnerability_inducing = getfunction_txt(versions,'pre_vulnerability_inducing')

    # patch / pre_patch pair: abstract both together when they differ, once when
    # they are equal, and individually when only one of them is available.
    if patch is not None and pre_patch is not None:
      final_root = None
      if patch != pre_patch:
        final_root = new_root.replace('patch', 'pre_patch_patch_diff')
        patch_root = function_generation_raw(patch, final_root, '%s_%s' % (function_name, 'patch'), file_name)
        pre_patch_root = function_generation_raw(pre_patch, final_root, '%s_%s' % (function_name, 'pre_patch'), file_name)
        function_generation_abs(patch_root, pre_patch_root)
      else:
        final_root = new_root.replace('patch', 'pre_patch_patch_same')
        function_generation_abs_single(function_generation_raw(patch, final_root, function_name, file_name))
    else:
      for version_type in ('patch', 'pre_patch'):
        function = getfunction_txt(versions,version_type)
        if function is not None:
          function_generation_abs_single(function_generation_raw(function, new_root.replace('patch', version_type), function_name, file_name))
    
    # vulnerability_inducing / pre_vulnerability_inducing pair: same three cases
    # as above, with the output directory renamed to '..._diff' or '..._same'.
    if pre_vulnerability_inducing is not None and vulnerability_inducing is not None:
      final_root = None
      if pre_vulnerability_inducing != vulnerability_inducing:
        final_root = new_root.replace('vulnerability_inducing', 'pre_vulnerability_inducing_vulnerability_inducing_diff')
        pre_vulnerability_inducing_root = function_generation_raw(pre_vulnerability_inducing, final_root, '%s_%s' % (function_name, 'pre_vulnerability_inducing'), file_name)
        vulnerability_inducing_root = function_generation_raw(vulnerability_inducing, final_root, '%s_%s' % (function_name, 'vulnerability_inducing'), file_name)
        # The vulnerability-inducing version is passed first, the earlier version second.
        function_generation_abs(vulnerability_inducing_root, pre_vulnerability_inducing_root)
      else:
        final_root = new_root.replace('vulnerability_inducing', 'pre_vulnerability_inducing_vulnerability_inducing_same')
        function_generation_abs_single(function_generation_raw(vulnerability_inducing, final_root, function_name, file_name))

    else:
      # Only one version has this function: abstract it on its own.
      for version_type in ('pre_vulnerability_inducing', 'vulnerability_inducing'):
        function = getfunction_txt(versions,version_type)
        if function is not None:
          function_generation_abs_single(function_generation_raw(function, new_root.replace('vulnerability_inducing', version_type), function_name, file_name))
    