# Code structure

This script is used to split VBA code into subroutines and functions and populate a database with some basic metadata about the code to start a translation process and understand dependencies.

In [1]:
from typing import Literal
from pathlib import Path
from tqdm import tqdm
from copy import copy

from transllmate.db import Db
from transllmate.models import ModuleTable, Struct, StructTable

## Set parameters

We use a parameters chunk to use this notebook with papermil later

In [2]:
CODEBASE = '../PGM'
DB_PATH = 'output/codebase_v1.db'


In [3]:
BASE_PATH = Path('.')
DB_DIR = Path('.') / DB_PATH

files = list((BASE_PATH / CODEBASE).glob('*.bas'))
print(f"Found {len(files)} *.bas source files")

Found 39 *.bas source files


In [4]:
!head -n 60 {files[13]}

Attribute VB_Name = "modFiles"
'-- Modul mit Routinen zum Datei- und Ordnerhandling
'   zusammengestellt 2003
'   FVA Abt. Waldwachstum , Martin Wohnhas

Option Explicit

Global GlobalKanal

'-- g�ltiges Laufwerk
Public Declare Function PathStripToRoot Lib "shlwapi.dll" _
   Alias "PathStripToRootA" _
  (ByVal pPath As String) As Long
  

'-- Adding to the Documents Menu ----------------------------------------------------
Public Const SHARD_PIDL = &H1&
Public Const SHARD_PATH = &H2&

Public Declare Function SHAddToRecentDocs Lib "shell32.dll" _
  (ByVal dwFlags As Long, ByVal dwData As String) As Long
  
  Private Declare Function SearchPath Lib "kernel32" Alias "SearchPathA" (ByVal _
    lpPath As String, ByVal lpFileName As String, ByVal lpExtension As String, _
    ByVal nBufferLength As Long, ByVal lpBuffer As String, _
    ByVal lpFilePart As String) As Long
    
 Public Declare Function PathFileExists Lib "shlwapi" Alias "PathFileExistsA" (ByVal lpszPath As String) As Long


Pri

In [5]:
Db(DB_DIR).modules

Initialized database.


Unnamed: 0_level_0,path,length,n_structs
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


## Codebase

This class operates on a folder of source files that should be analyzed. It can be run in different modes:

* `'parse'` will read the folder for matching files and parse them into the database, but skip duplicates (if they are already there)
* `'ignore'` will **not** read the folder at all, but init the database. This is helpful if the codebase changed, but should not yet be updated
* `'recreate'` this will **drop the entire database** and then operate in `'parse'` mode. This is effectively an update.

In [6]:
class Codebase:
  def __init__(
      self,
      folder: str = '.',
      db_name: str = None,
      module_extension: str = '*.bas',
      mode: Literal['parse'] | Literal['ignore'] | Literal['recreate'] = 'ignore'
  ):
    self.folder = Path(folder).resolve()
    self.db = Db(db_name)
    self.module_extension = module_extension

    # if there are no struct types initialized,
    # we just add VBA function and subroutine
    # this still needs a way to be configurable
    if len(self.db.struct_types) == 0:
      self.db.add_struct_type(name="function", start_token="Function ", end_token="End Function")
      self.db.add_struct_type(name="subroutine",start_token="Sub ",end_token="End Sub")

    # check mode
    if mode == 'recreate':
      self.db.path.unlink()

    if mode != 'ignore':
      # and also read all files
      self.read_folder()

  def read_folder(self):
    # get the structs
    structs = self.db.struct_types
    assert len(structs) > 0
    files = list(self.folder.glob(self.module_extension))
    print(files)
    for fname in tqdm(files):
      # check if we have this module aready
      if fname.name in self.db.modules.path.tolist():
        continue

      # parse the module
      mod = self.parse_file(fname, structs)
      self.db.add_module(mod)

  @classmethod
  def parse_file(cls, fname: Path, structs: list[Struct] = []):
    # load the file
    try:
      with open(fname, 'rb') as f:
        code = f.read().decode('unicode_escape')
    except UnicodeDecodeError:
      with open(fname, 'rb') as f:
        code = f.read().decode('latin1')

    # parse line by line
    objects = []
    buffer = []
    is_in_struct = False
    current_struct = None

    for line in code.splitlines():
      # check if we enter a top level struct in this line
      if not is_in_struct:
        for meta_struct in structs:
          if line.startswith(meta_struct.start_token):
            # flag that we are in a struct now
            is_in_struct = True
            #current_struct = meta_struct(signature=line)
            current_struct = StructTable(signature=line, type=meta_struct)

            # empty the buffer
            buffer = []
            # break the inner loop
            break

        # here we can continue as we entered a struct, or don't want to buffer
        continue

      # here we are in a struct - check if the struct ends
      for meta_struct in structs:
        if line.startswith(meta_struct.end_token):
          # flag that we are not in a struct now
          is_in_struct = False
          current_struct.body = '\n'.join(buffer)
          current_struct.body_n = len(buffer)

          buffer = []
          objects.append(copy(current_struct))
          current_struct = None
          break

      # if we are still here, the struct was not ended.
      # now, we can either be in a struct or not.
      # currently, if we are outside a struct, we discard the line
      if is_in_struct:
        buffer.append(line)
      else:
        pass

    # here we parsed the whole file
    mod = ModuleTable(
        path=fname.name,
        length=len(code.splitlines()),
        n_structs=len(objects),
        structs=objects
    )
    return mod


In [7]:
cb = Codebase(folder=BASE_PATH / CODEBASE, db_name=DB_DIR, mode='parse')

cb.db.modules

[PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/ModRecursiv.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/ModRegression.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/ModWachstumsimu.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/moddbManag.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/modOlecPEP.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/ModClimateGrowth.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ablagen/Projekte/PEP/PGM/modEvaluate.bas'), PosixPath('/Users/mirko/Library/CloudStorage/GoogleDrive-mirko@hydrocode.de/Geteilte Ab

  code = f.read().decode('unicode_escape')
  code = f.read().decode('unicode_escape')
  code = f.read().decode('unicode_escape')
  code = f.read().decode('unicode_escape')
  code = f.read().decode('unicode_escape')
100%|██████████| 39/39 [00:00<00:00, 204.62it/s]


Unnamed: 0_level_0,length,n_structs,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,753,16,ModRecursiv.bas
2,1197,13,ModRegression.bas
3,5090,86,ModWachstumsimu.bas
4,3990,56,moddbManag.bas
5,989,2,modOlecPEP.bas
6,6248,56,ModClimateGrowth.bas
7,8653,82,modEvaluate.bas
8,846,17,modMatrix.bas
9,644,2,modAdoFM.bas
10,895,1,modImportWW.bas


## Code Database interface

Instiantiate a new Instance of the database to see what has been added to the codebase database.
The following functions illustrate how the database can be used to enter different parts of
the codeabase and inspect single structures.

In [8]:
# init the codebase again
db = Db(DB_DIR)
print(f"Total structs       {len(db.structs)}")
print(f"|-  Functions       {len(db.structs.function)}")
print(f"|-  Subroutines     {len(db.structs.subroutine)}")

Total structs       1073
|-  Functions       187
|-  Subroutines     886


You can filter the structs by type. If you access the type name by attribute, it will be used as 
a SQL filter on the database, rather than on the pandas object

In [9]:
db.structs.function.df

Unnamed: 0_level_0,signature,body,body_n,type,end_token,module
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,"Function DurbinApha(cn As Integer, AI As Integ...",\nDim DAWert As Double\n\nIf cn <= 1 Then Exit...,16,function,End Function,ModRecursiv.bas
7,"Function tWert(cn As Integer, TI As Integer) A...",\n\nSelect Case TI\n \n Case 1\n Case 2\...,9,function,End Function,ModRecursiv.bas
21,"Function EVALPXY(ndat As Integer, dfx() As Dou...",'---ErrPtnr-CallStack-START--- DO NOT MODIFY !...,53,function,End Function,ModRegression.bas
22,"Function EVALPXY_AR(ndat As Integer, dfx() As ...",'---ErrPtnr-CallStack-START--- DO NOT MODIFY !...,95,function,End Function,ModRegression.bas
56,"Function iBArtNrGet(xBArt As String, xbe As Be...",\nDim lBArt As Integer\n\nFor iBArt = 1 To xbe...,11,function,End Function,ModWachstumsimu.bas
...,...,...,...,...,...,...
1068,"Function zgBArtSet(iAufn As Integer, sBArt As ...",\n\nDim iBArt As Integer\nDim izBArt As Intege...,48,function,End Function,ModPEP.bas
1069,"Function iBArtKollektiv(iAufn As Integer, zMix...",\nDim koBArt As Integer\nReDim ikoBArt(gBestan...,66,function,End Function,ModPEP.bas
1070,Function SIMixFestlegen(sBArt As String) As Si...,\nDim nSI As Integer\nDim rSIWert() As Double\...,49,function,End Function,ModPEP.bas
1071,"Function MixFeldSizeGet(sIDFeld As Integer, Er...",\nDim sql$\nDim iAufn As Integer\nDim rstModel...,48,function,End Function,ModPEP.bas


Use subscription of the `DB.struct`, or any of the filter attributes to query specific structs

In [10]:
# print a txt representation of the structs of 36
print(db.structs.txt(36))

Sub wm_BestandDatSave(xbe As Bestand, xtbName As String, connName As adodb.Connection)

Dim iBArt As Integer
Dim rstModel As New adodb.Recordset

With rstModel
        
   .Open xtbName, connName, adOpenKeyset, adLockOptimistic, adCmdTable 'strSQL
           
   For iBArt = 1 To xbe.nHauptBArt
            
      .AddNew
            
      rstModel("KeyFeld") = xbe.feld
      rstModel("Baumart") = xbe.iHauptBArt(iBArt)
      
       rstModel("Fl_ha") = xbe.Fl_Ha
       rstModel("N") = xbe.n
       rstModel("NnBArt") = xbe.nBArt(iBArt)
       rstModel("BHDg") = xbe.BHDg(iBArt)
       rstModel("hg") = xbe.Hg(iBArt)
       rstModel("BHDo") = xbe.BHDo(iBArt)
       rstModel("Ho") = xbe.Ho(iBArt)
       rstModel("jGfl") = xbe.jGfl(iBArt)
       rstModel("gGfl") = xbe.gGfl
       
      .Update
   
   Next iBArt
   
   .Close
        
End With

End Sub


You can also use the name of the structure and combine this with a filter attribute

In [11]:
# If you add the mode 'md', it will be retunred in a markdown codeblock
print(db.structs.function.txt('FnPrognoseJahrIndexGet', mode='md'))

Original Module: moddbManag.bas
```
Function FnPrognoseJahrIndexGet(xPrognoseJahr As Integer) As Integer

'---ErrPtnr-CallStack-START--- DO NOT MODIFY ! ---
ErrPtnr.CallStack "[moddbManag] Function FnPrognoseJahrIndexGet(xPrognoseJahr As Integer) As Integer"
'---ErrPtnr-CallStack-END--- DO NOT MODIFY ! ---
'---ErrPtnr-OnError-START--- DO NOT MODIFY ! ---
On Error GoTo ErrPtnr_OnError
'---ErrPtnr-OnError-END--- DO NOT MODIFY ! ---

For jprog = 1 To gPlwNTZVorgabe.nProg
   If xPrognoseJahr = Year(gBestand.iAufnahme(gBestand.nAufnahme)) + jprog * 5 Then
      FnPrognoseJahrIndexGet = jprog
      Exit For
   End If
Next jprog

Exit Function
ErrPtnr_OnError:
Select Case ErrPtnr.OnError("moddbManag", "FnPrognoseJahrIndexGet")
Case 0: Resume
Case 1: Resume Next
Case 2: Exit Function
Case 3: End
End Select
'---ErrPtnr-OnError-END--- DO NOT MODIFY ! ---

End Function
```


Be aware, that the name *filters* the Stucture signature, which might return more than one.
Filters are **not case-sensitive**

In [12]:
db.structs['BI']

Unnamed: 0_level_0,signature,body,body_n,type,end_token,module
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,"Function DurbinApha(cn As Integer, AI As Integ...",\nDim DAWert As Double\n\nIf cn <= 1 Then Exit...,16,function,End Function,ModRecursiv.bas
30,"Sub BI_eSTPDatGet(iSTP As Integer, xrSTP() As ...",\nDim iBHD As Integer\nDim iBArt As Integer\nD...,43,subroutine,End Sub,ModWachstumsimu.bas
32,Sub FnBIAufnBHD_BaumConversion(xAufnBHD() As A...,\nDim iBA As Integer\nDim idk As Integer\nDim ...,79,subroutine,End Sub,ModWachstumsimu.bas
66,"Sub BaumBestandKombinieren(mxBa_Dt() As baum, ...",\nDim iBArt As Integer\n\nPhi = Sqr(1 + (mxBe_...,5,subroutine,End Sub,ModWachstumsimu.bas
76,"Sub wm_Plwk_CombinationSet(iBArt As Integer, M...",'---ErrPtnr-CallStack-START--- DO NOT MODIFY !...,59,subroutine,End Sub,ModWachstumsimu.bas
...,...,...,...,...,...,...
858,"Sub TreeRingClimateCombining(sFeld As String, ...",\nDim sql As String\nDim xTm() As Single\nDim ...,253,subroutine,End Sub,ModClimate.bas
859,Sub TreeRingClimateCombiningOrig(sFeld As Stri...,\nDim sql As String\nDim xTm() As Single\nDim ...,135,subroutine,End Sub,ModClimate.bas
865,"Sub combinTreeClimateNO(sBaum As Integer, sFel...",\nDim sql As String\nDim MNO As Single\nDim sN...,225,subroutine,End Sub,ModClimate.bas
884,"Sub HOANAClimateCombine(sVflFeld As String, tb...",\nDim nVfl As Integer\nDim iJahr As Integer\nD...,182,subroutine,End Sub,ModClimate.bas


In [13]:
db.structs.function['bi']

Unnamed: 0_level_0,signature,body,body_n,type,end_token,module
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,"Function DurbinApha(cn As Integer, AI As Integ...",\nDim DAWert As Double\n\nIf cn <= 1 Then Exit...,16,function,End Function,ModRecursiv.bas
578,"Function BonDynWertGet(JahrVon As Integer, Jah...",\n\nDim fVZ As Double\nDim RVZ As Double\n\nDi...,115,function,End Function,ModTrend.bas
751,Function BI_dGewicht(xBHD As Single) As Single,\nDim dW As Single\nDim pi As Double\n\npi = 3...,18,function,End Function,ModdbBI.bas
782,Function iBArtBIGet(sBArt As String) As String,\nDim iBA As Integer\nDim jBA As Integer\nDim ...,20,function,End Function,ModdbBI.bas
789,"Function SqlStrInventur(InventNr As Integer, x...",\nDim NItem As Integer\nDim szfItemName As Str...,34,function,End Function,ModdbBI.bas
790,Function BI_BArtMatch(xBArt As String) As String,\nDim iBArt As Integer\nDim jBArt As Integer\n...,13,function,End Function,ModdbBI.bas
993,"Function RTBIS(x1 As Double, x2 As Double) As ...",'---ErrPtnr-CallStack-START--- DO NOT MODIFY !...,44,function,End Function,modFunctLib.bas


Print a summary of all structures found and the length of all included lines (this does not include sanitized lines that were never imported into the codebase)

In [14]:
db.modules.sum()

length                                                  101488
n_structs                                                 1073
path         ModRecursiv.basModRegression.basModWachstumsim...
dtype: object