### Install V-SZZ

In [None]:
!git clone https://github.com/baolingfeng/V-SZZ.git

In [None]:
%cd /content/V-SZZ/ICSE2022ReplicationPackage/
!pip install -r requirements.txt

In [None]:
import sys
# Put the pyszz tool directory of the cloned V-SZZ replication package on the
# import path; the later `from szz.my_szz import MySZZ` presumably resolves
# from this directory.
sys.path.append('/content/V-SZZ/ICSE2022ReplicationPackage/icse2021-szz-replication-package/tools/pyszz/')

### Create dataset

In [None]:
#@title create dataset
from szz.my_szz import MySZZ
import os
import json
from pathlib import Path
import git
import io
from pydriller import ModificationType, RepositoryMining, GitRepository
from typing import List, Set
import itertools
import time

def copyfile(commit_file, target):
  """Write the raw bytes of a git blob to ``target``.

  Args:
    commit_file: Object exposing ``data_stream.read()`` that returns the
      file content as bytes (the script passes entries looked up in a
      commit tree).
    target: Destination path. Missing parent directories are created and an
      existing file is overwritten.
  """
  destination = Path(target)
  destination.parent.mkdir(parents=True, exist_ok=True)
  # Read the blob once and write it straight out; wrapping the bytes in an
  # intermediate BytesIO only duplicated the content in memory.
  payload = commit_file.data_stream.read()
  with open(destination, 'wb') as out:
    out.write(payload)

import requests
import re

def getRename(url_path, commit):
  """Ask GitHub's rename-check endpoint whether ``url_path`` was renamed.

  Posts a form describing one item (``url_path`` at ``commit`` on branch
  ``master``) to the torvalds/linux ``check_for_rename_commits`` endpoint.

  Returns:
    The ``re.Match`` for the "Renamed from ..." fragment found in the response
    text (group 1 is the captured previous path), or ``None`` when the
    fragment is absent.
  """
  endpoint = 'https://github.com/torvalds/linux/commits/check_for_rename_commits'
  ajax_headers = {"x-requested-with": "XMLHttpRequest"}

  form_fields = []
  form_fields.append(('items[item-0][current_blob_path]', url_path))
  form_fields.append(('items[item-0][last_commit]', commit))
  form_fields.append(('items[item-0][branch]', 'master'))
  form_fields.append(('items[item-0][new_path]', url_path))
  form_fields.append(('_method', 'GET'))

  reply = requests.post(endpoint, headers=ajax_headers, data=form_fields)
  # The pattern matches a literal backslash followed by "n", i.e. an escaped
  # newline inside the response text, not a real newline character.
  return re.search('Renamed from (.+)\\\\n      <a', reply.text)

def findOldName(current_commit, file_path, base_url='https://github.com/torvalds/linux/commits/'):
  """Look up the previous path of ``file_path`` via GitHub's history pages.

  Starting at ``base_url + current_commit + '/' + file_path``, follows the
  "Older" pagination link until a page without one is reached. The last commit
  listed on that page is then passed to ``getRename``.

  Returns:
    The path captured by ``getRename`` (group 1 of its match), or ``None``
    when no rename is reported.

  Note:
    If the last page cannot be parsed or ``getRename`` raises, the error, the
    URL and the page are printed and ``exit()`` is called.
  """
  page_url = ''.join((base_url, current_commit, '/', file_path))
  while page_url:
    response = requests.get(page_url)

    # Honour rate limiting: sleep for the advertised delay, then retry.
    while 'Retry-After' in response.headers:
      time.sleep(int(response.headers["Retry-After"]))
      print('retry')
      response = requests.get(page_url)
    html = response.text

    older_link = re.search('Newer(</a>|</button>)<a rel="nofollow" class="btn btn-outline BtnGroup-item" href="(.+)">Older</a>', html)
    if older_link:
      # More history remains; move on to the next (older) page.
      page_url = older_link.group(2)
      continue

    # No "Older" link: this is the last page of the file's history.
    print(current_commit, file_path)
    try:
      listed_commits = re.findall('data-url="/torvalds/linux/commits/(.+)/commits_list_item"', html)
      rename = getRename(file_path, listed_commits[-1])
    except Exception as e:
      print(e)
      print(page_url)
      print(html)
      exit()
    return rename.group(1) if rename else None

# Build the dataset: for every (fix commit, inducing commit, file) triple read
# from the JSON files in /content/data, export the file as it exists in the
# inducing commit and, when available, in the inducing commit's parent.
repo_url='https://github.com/torvalds/linux'

# NOTE(review): my_szz is never used below; presumably constructing it clones
# the repository into repos_dir, which git.Repo then opens. Confirm before
# removing.
my_szz = MySZZ(repo_full_name='torvalds/linux', repo_url=repo_url, repos_dir='/content/repo', ast_map_path=None)
repository = git.Repo(os.path.join('/content/repo', 'torvalds/linux'))
# Progress marker: rewritten for every pair so it always names the last
# (fix, inducing) pair that was started.
log_txt = os.path.join('/content/dataset', 'linux_kernel', 'last_commit.txt')
Path(log_txt).parent.mkdir(parents=True, exist_ok=True)

for data_file in os.listdir('/content/data'):
  if os.path.isdir(os.path.join('/content/data', data_file)):
    continue
  print(data_file)
  with open(os.path.join('/content/data', data_file)) as data:
    data_json = json.load(data)

    # Each top-level key is a fixing commit. Each of its entries carries a
    # 'file_path' and a 'previous_commits' list, and the first element of the
    # last 'previous_commits' item is used as the inducing commit hash.
    for fix in data_json:
      inducings = list({(commit['previous_commits'][-1][0], commit['file_path']) for commit in data_json[fix]})
      if not inducings:
        continue
      if len(inducings) > 1:
        print('More than 1 inducing commit?? %s' % inducings)
      for inducing in inducings:
        inducing_hash, file_path = inducing
        print(file_path)
        print('Fixing Commit:', fix)
        print('Inducing Commit:', inducing_hash)
        with open(log_txt, 'w') as f:
          f.write('fix: %s, inducing: %s, data file: %s' % (fix, inducing_hash, data_file))

        Directory = os.path.join('/content/dataset', 'linux_kernel', inducing_hash)

        inducing_commit = repository.commit(inducing_hash)
        try:
          pre_inducing_commit = repository.commit(inducing_hash+'~1')
        except Exception:
          # The parent could not be resolved (presumably a root commit);
          # only the inducing version is exported in that case.
          pre_inducing_commit = None

        pre_inducing_file = None
        old_path = None
        try:
          inducing_file = inducing_commit.tree / file_path
        except Exception as e:
          # The path from the fix does not exist in the inducing commit;
          # retry under the previous name reported for the file.
          old_path = findOldName(fix, file_path)
          if old_path is None:
            print('----ERROR----')
            print(e)
            continue
          try:
            inducing_file = inducing_commit.tree / old_path
          except Exception:
            print('abort')
            continue

        if pre_inducing_commit is not None:
          try:
            if old_path is None:
              pre_inducing_file = pre_inducing_commit.tree / file_path
            else:
              pre_inducing_file = pre_inducing_commit.tree / old_path
          except Exception:
            # Missing in the parent: either the inducing commit created the
            # file, or the file was renamed again before that commit.
            old_path = findOldName(inducing_hash, file_path if old_path is None else old_path)
            if old_path is None:
              print('\033[92m%s is new in inducing commit\033[0m' % file_path)
            else:
              print('\033[96mOld path: %s\033[0m' % old_path)
              try:
                pre_inducing_file = pre_inducing_commit.tree / old_path
              except Exception:
                print('abort')
                continue

        # NOTE(review): only the basename is kept, so two files with the same
        # name in different directories of one inducing commit are written to
        # the same target path.
        file_path = os.path.basename(file_path)

        if pre_inducing_file is not None:
          copyfile(inducing_file, os.path.join(Directory, 'vulnerability_inducing', file_path))
          copyfile(pre_inducing_file, os.path.join(Directory, 'pre_vulnerability_inducing', file_path))
        else:
          # No earlier version exists: store the file under the separate
          # 'new' subtree.
          copyfile(inducing_file, os.path.join('/content/dataset', 'linux_kernel', 'new', inducing_hash, 'vulnerability_inducing', file_path))