- This script splits bounding boxes whose word contains a function call
made with the dot operator.
- For mapping bounding boxes to parts of the code, it's easier
to separate these words than it is to leave them joined.
- That's because srcML splits them as it builds the ASTs
- (e.g. clock.getCurrentTime --> 'clock', '.', 'getCurrentTime')

In [1]:
import os
import re
import math
import pandas as pd

In [2]:
# names of every entry in the input directory; each one is later passed to
# process_file, which reads it as a bounding-box CSV
box_files = os.listdir("./word_coordinates_final/")

In [10]:
def check_word(word):
    """Return the literal string "null" for a NaN word, otherwise the word itself.

    A float NaN in the word column is mapped back to the text "null" so that
    the word still appears in the final file; presumably the NaN comes from the
    CSV reader treating the token `null` as a missing value.
    """
    is_missing = isinstance(word, float) and math.isnan(word)
    return "null" if is_missing else word

def calculate_char_width(row):
    """Return the average width of one character in the row's bounding box.

    The box width is divided by the number of characters in the word; the
    result is used to size the pieces of a word that gets split.
    """
    box_width = row['width']
    n_chars = len(row['word'])
    return box_width / n_chars

def split_word(row, char_width, screen_width=1920):
    """Split a dotted word (e.g. System.out.println) into one box per part.

    Parameters
    ----------
    row : mapping with the keys 'word', 'x', 'y', 'height', 'tobii_y' and
        'tobii_height' (a DataFrame row or a plain dict).
    char_width : width of a single character of the word, in the same unit
        as row['x'].
    screen_width : divisor used to normalise x and width into the tobii_x /
        tobii_width fields. Defaults to 1920, the value that was previously
        hard-coded.

    Returns
    -------
    A DataFrame in "transposed" layout: one column per emitted part and ten
    positional rows in the order word, occurrence (always 0 here), x, y,
    width, height, tobii_x, tobii_y, tobii_width, tobii_height.
    An empty DataFrame is returned when no part has a non-zero width
    (for example when the word is just ".").

    The "." separators are not emitted as boxes, but each one still occupies
    one character of the original box, so the x position of every part is
    derived from its character offset inside the whole word. This keeps later
    parts from being shifted left by one character per preceding dot and
    avoids accumulating rounding error along the word.
    """
    new_rows = []
    char_offset = 0  # index of the current part's first character in the word

    for part in row['word'].split('.'):
        word_width = round(len(part) * char_width)
        if word_width != 0:
            new_x = row['x'] + round(char_offset * char_width)
            new_rows.append(pd.Series([
                part, 0, new_x, row['y'], word_width, row['height'],
                new_x / screen_width, row['tobii_y'],
                word_width / screen_width, row['tobii_height'],
            ]))
        # step over this part and the dot that follows it
        char_offset += len(part) + 1

    if not new_rows:
        return pd.DataFrame()
    # one column per part, built in a single concat
    return pd.concat(new_rows, ignore_index=True, axis=1)

def recalculate_num_occurrences(new_boxes):
    """Recompute the 'occurrence' column after words have been split.

    Splitting boxes invalidates the per-word occurrence counts, so each word
    is renumbered in row order: its first appearance gets 0, the next 1, ...

    Java line comments are excluded from the count: a row whose word contains
    "//", and every following row with the same 'y' value, keeps whatever
    occurrence value it already had and does not advance any counter.

    Parameters
    ----------
    new_boxes : DataFrame with 'word', 'y' and 'occurrence' columns
        (one row per word).

    Returns
    -------
    The same DataFrame object, with its 'occurrence' column replaced.
    """
    if new_boxes.empty:
        return new_boxes

    occurrences = {}
    comment_line = None  # y of the line currently known to hold a comment
    updated = []

    rows = zip(new_boxes['word'], new_boxes['y'], new_boxes['occurrence'])
    for word, y, previous in rows:
        word = str(word)

        # comment filter: "//" starts a comment, and the rest of that line
        # (same y) belongs to it
        on_comment_line = comment_line is not None and y == comment_line
        if '//' in word or on_comment_line:
            comment_line = y
            updated.append(previous)
            continue

        occurrences[word] = occurrences.get(word, -1) + 1
        updated.append(occurrences[word])

    # Assign the whole column at once: writing to the row objects yielded by
    # iterrows() is not guaranteed to reach the DataFrame.
    new_boxes['occurrence'] = updated
    return new_boxes

def needs_to_be_split(word):
    """Return True when a word from the code qualifies for splitting.

    A word qualifies when
    1) it is a str,
    2) there is a period in it somewhere, and
    3) it contains no double-quoted text (a quote, at least one character,
       and a closing quote), i.e. it is not a string literal.
    e.g. System.out.println qualifies, but "P/x.ctx" does not because it is
    in quotes.

    Always returns a real bool.
    """
    if not isinstance(word, str) or '.' not in word:
        return False
    # raw string so the pattern carries no invalid escape sequences
    return re.search(r'"(.+?)"', word) is None

def process_file(file, new_boxes):
    """Split the dotted words of one bounding-box file and write the result.

    Reads ./word_coordinates_final/<file>, replaces every word that
    needs_to_be_split() accepts with one box per part, recomputes the
    occurrence counts and writes word_coordinates_split/<file>.

    Parameters
    ----------
    file : name of the CSV file inside ./word_coordinates_final/.
    new_boxes : DataFrame to append to, in transposed layout (fields as the
        index, one column per word). Callers normally pass an empty
        DataFrame.

    NOTE(review): the input is assumed to have exactly the ten columns named
    in the output header below, in that order -- confirm against the data.
    """
    boxes = pd.read_csv(f'./word_coordinates_final/{file}')  # bounding box file

    # Collect every piece first and concatenate once at the end; growing a
    # DataFrame with concat inside the loop is quadratic.
    pieces = [] if new_boxes.empty else [new_boxes]
    for _, row in boxes.iterrows():
        row = row.copy()  # never write into the object handed out by iterrows
        row['word'] = check_word(row['word'])  # checks whether word is "null"

        replacement = None
        if needs_to_be_split(row['word']):
            char_width = calculate_char_width(row)
            # splitting word by dots, returning new (transposed) columns
            replacement = split_word(row, char_width)

        if replacement is None or replacement.empty:
            # not a dotted word, or nothing was left after splitting
            # (e.g. the word "."): keep the original box
            pieces.append(row)
        else:
            # replacement is transposed with positional labels; give it the
            # field names. Taking them from the input file (not from the
            # accumulator) also works when the very first row is split.
            replacement.index = boxes.columns
            pieces.append(replacement)

    if pieces:
        combined = pd.concat(pieces, ignore_index=True, axis=1).T
    else:
        combined = pd.DataFrame(columns=boxes.columns)  # file had no rows

    combined = recalculate_num_occurrences(combined)
    os.makedirs("word_coordinates_split", exist_ok=True)
    combined.to_csv(f"word_coordinates_split/{file}", index=False, header=[
        'word', 'occurrence', 'x', 'y', 'width', 'height', 'tobii_x',
        'tobii_y', 'tobii_width', 'tobii_height'])

In [11]:
# run the splitter over every bounding-box file, starting each one from a
# fresh, empty accumulator
for file in box_files:
    print(file)  # progress: name of the file about to be processed
    new_boxes = pd.DataFrame()
    process_file(file, new_boxes)

testOneTwoThreeCreateCycle_boxes.csv
testGetEmail_boxes.csv
moveUpTarget_boxes.csv
genSql_boxes.csv
openNewContainerWindow_boxes.csv
clearFieldersFromField_boxes.csv
iconify_boxes.csv
setCombo_Value_boxes.csv
BFSdist_boxes.csv
testSetEvtID_boxes.csv
doOutput_boxes.csv
compareTo_boxes.csv
appendDeclarations_boxes.csv
add_boxes.csv
testOccThm_boxes.csv
play_boxes.csv
getUserNameFromCookie_boxes.csv
setNamedItem_boxes.csv
getTargetServiceName_boxes.csv
getClassNameForLookAndFeel_boxes.csv
checkSetLayout_boxes.csv
removeSelection_boxes.csv
atomToExpression_boxes.csv
load_boxes.csv
setUpMrj_boxes.csv
wndPreferences_boxes.csv
testInvoke_boxes.csv
getJSplitPane_boxes.csv
getConnectionPanel_boxes.csv
addUIDeployment_boxes.csv
makeUniqueParagraphForGlobalWithLength_boxes.csv
capitalizeString_boxes.csv
getMenuAdministracion_boxes.csv
countQuery_boxes.csv
selectBracketingEntries_boxes.csv
addRotation_boxes.csv
encode_boxes.csv
getMessage_boxes.csv
split_boxes.csv
getScopePartnerLinks_boxes.csv
vi