- This script splits bounding boxes whose text contains a function call
made with the dot operator.
- For mapping bounding boxes to parts of the code, it's easier
to separate these parts than it is to leave them joined.
- That's because srcML splits such expressions as it builds the ASTs
- (e.g. clock.getCurrentTime --> 'clock', '.', 'getCurrentTime')

In [38]:
import os
import re
import cv2
import math
import pandas as pd
import numpy as np

In [3]:
# Names of the bounding-box files to process. os.listdir returns entries in
# arbitrary order, so sort them to make the processing order reproducible.
box_files = sorted(os.listdir("./word_coordinates_final/"))

In [47]:
def check_word(word):
    """Return the literal string "null" for a NaN float, else the word itself.

    A float NaN in the word column is mapped back to the text "null" so that
    the word "null" is still written to the final file; any other value is
    passed through unchanged.
    """
    is_missing = isinstance(word, float) and math.isnan(word)
    return "null" if is_missing else word

def calculate_char_width(row):
    """Return the average width of a single character in the row's box.

    The box's 'width' is divided evenly over the characters of its 'word';
    the result is used to size the pieces when a box is split.
    """
    box_width = row['width']
    n_chars = len(row['word'])
    return box_width / n_chars

# for words with a dot (e.g. System.out.println), splits and puts each part into a new row
def split_word(row, char_width, screen_width=1920):
    """Split a dotted word's bounding box into one box per non-empty part.

    Args:
        row: mapping (e.g. a pandas Series) providing 'word', 'x', 'y',
            'height', 'tobii_y' and 'tobii_height'.
        char_width: width of one character of the word, as returned by
            calculate_char_width.
        screen_width: width used to scale x and width into the tobii
            columns. Defaults to 1920, the value previously hard-coded.

    Returns:
        A DataFrame with one column per non-empty part and ten positional
        rows, in the order of the header written by process_file: word,
        occurrence (always 0 here, recalculated later), x, y, width, height,
        tobii_x, tobii_y, tobii_width, tobii_height. The DataFrame is empty
        when the word contains nothing but dots.
    """
    new_x = row['x']
    columns = []

    for part in row['word'].split("."):
        part_width = round(len(part) * char_width)
        if part_width > 0:
            columns.append(pd.Series([
                part, 0, new_x, row['y'], part_width, row['height'],
                new_x / screen_width, row['tobii_y'],
                part_width / screen_width, row['tobii_height']]))
        # Step over this part and the dot that follows it. This also runs for
        # empty parts (leading or doubled dots, e.g. ".foo"), so the next
        # part's box does not start on top of the dot that preceded it.
        new_x += part_width + char_width

    if not columns:
        # pd.concat refuses an empty list; mirror the empty accumulator.
        return pd.DataFrame()
    # Build the result once instead of re-concatenating on every iteration.
    return pd.concat(columns, ignore_index=True, axis=1)

# occurrences got messed up splitting the boxes, so this recalculates for each word
def recalculate_num_occurrences(new_boxes):
    """Renumber the 'occurrence' column of new_boxes per word.

    Going through the rows in order, the first time a word appears it gets
    occurrence 0, the second time 1, and so on.

    Args:
        new_boxes: DataFrame with one row per bounding box and a 'word'
            column. It is updated in place.

    Returns:
        The same DataFrame, with its 'occurrence' column rewritten. A frame
        without a 'word' column (e.g. a completely empty one) is returned
        unchanged.
    """
    if 'word' not in new_boxes.columns:
        return new_boxes

    seen = {}
    numbering = []
    for word in new_boxes['word']:
        count = seen.get(word, 0)
        numbering.append(count)
        seen[word] = count + 1

    # Assign the whole column at once: writing to the row objects yielded by
    # iterrows() is not guaranteed to reach the DataFrame, because those rows
    # may be copies.
    new_boxes['occurrence'] = numbering
    return new_boxes

def process_file(file, new_boxes):
    """Split the dotted words of one bounding-box file and write the result.

    Reads ./word_coordinates_final/<file>, replaces every box whose word
    contains a dot by one box per part (see split_word), renumbers the
    occurrences and writes the result to word_coordinates_split/<file>.

    Args:
        file: file name of the CSV inside ./word_coordinates_final/.
        new_boxes: accumulator the boxes are appended to, with the box fields
            as index and one column per box. Callers pass an empty DataFrame.

    NOTE(review): the input CSV is assumed to hold exactly the ten columns of
    the output header, in that order -- confirm against the input files.
    """
    header = ['word', 'occurrence', 'x', 'y', 'width', 'height', 'tobii_x',
              'tobii_y', 'tobii_width', 'tobii_height']
    boxes = pd.read_csv(f'./word_coordinates_final/{file}')

    # Collect every piece (one box per column) and concatenate once at the
    # end instead of growing the DataFrame row by row.
    pieces = [] if new_boxes.empty else [new_boxes]

    for _, row in boxes.iterrows():
        # Work on an object-dtype copy so "null" can always be stored in the
        # row and the source frame is never written to.
        row = row.astype(object)
        row['word'] = check_word(row['word'])

        if isinstance(row['word'], str) and "." in row['word']:
            char_width = calculate_char_width(row)
            replacement = split_word(row, char_width)
            if not replacement.empty:
                # Label the positional fields like the row itself. Using the
                # row's index (rather than the accumulator's) also works when
                # the very first word of a file contains a dot.
                replacement.index = row.index
                pieces.append(replacement)
                continue
            # The word consists only of dots: there is nothing to split, so
            # fall through and keep the original box unchanged.

        pieces.append(row)

    if pieces:
        combined = pd.concat(pieces, ignore_index=True, axis=1).T
    else:
        # No boxes at all: still write a file holding just the header.
        combined = pd.DataFrame(columns=header)

    combined = recalculate_num_occurrences(combined)
    os.makedirs("word_coordinates_split", exist_ok=True)
    combined.to_csv(f"word_coordinates_split/{file}", index=False, header=header)



# Split the boxes of every CSV file found in the input directory. Each file
# starts from a fresh, empty accumulator. The stray `break` that stopped the
# loop after the first file has been removed.
for file in box_files:
    if not file.endswith(".csv"):
        # skip directory entries that read_csv could not parse
        # (e.g. notebook checkpoint folders)
        continue
    new_boxes = pd.DataFrame()
    process_file(file, new_boxes)


In [41]:
### SAME CODE AS ABOVE, BUT LESS MODULARIZED


# # goes through all the words in all the files to find cases where there's a dot (e.g. "System.out.println")
# for file in box_files:
#     name = re.sub("_boxes.csv", "", file)
#     boxes = pd.read_csv(f'./word_coordinates_final/{file}') # getting corresponding bounding box file
#     new_boxes = pd.DataFrame() # used for accumulating new rows
    
#     for i, row in boxes.iterrows():
#         word = row['word']
        
#         if isinstance(word, float) and math.isnan(word): # checking if the word is "null" and makes sure "null"
#             row['word'] = "null"                         # is still in final file
            
            
#         if isinstance(word, str) and re.search("\.", word): # if there's a dot in the word, split
#             char_width = row['width']/len(word) # used for gauging the pixel width of the word parts
#             parts = re.split("\.", word)
#             new_x = row['x']
#             replacement = pd.DataFrame() # used for collecting the newly split up word
            
#             for j, string in enumerate(parts): # perform this function for all parts of the original word
#                 word_width = round(len(string)*char_width)
#                 if word_width == 0: continue # sometimes the dot is at the end of a word (e.g. "fail.")
                
#                 # making new row to concatenate with original dataframe. Scaling tobii width to screen width (1920px) 
#                 new_row = pd.Series([string, 0, new_x, row['y'], word_width, row['height'], new_x / 1920, 
#                                      row['tobii_y'], word_width / 1920, row['tobii_height']])
#                 replacement = pd.concat([replacement, new_row], ignore_index=True, axis=1)
#                 new_x += (word_width + char_width) # adding char_width to account for the "." (e.g. System.out.println)
            
#             replacement.index = new_boxes.index
#             new_boxes = pd.concat([new_boxes, replacement], ignore_index=True, axis=1)
            
#         else: # if there wasn't a period, just add the row to the dataframe
#             new_boxes = pd.concat([new_boxes, row.T], ignore_index=True, axis=1)
            
#     new_boxes.T.to_csv("/word_coordinates_split/{file}", index=False, header=[
#         'word', 'occurrence', 'x', 'y', 'width', 'height', 'tobii_x',
#         'tobii_y', 'tobii_width', 'tobii_height'])


null
null
null
