In [28]:
# import glob
# import json

import xml.etree.ElementTree as ET
# from bs4 import BeautifulSoup
# from tqdm import tqdm

import translators.server as tss

import re
from nltk.corpus import stopwords

import pandas as pd

In [31]:
# create name for variable
def clean_name(s, convert_to_lower=True):
    """Reduce a phrase to plain words that can be used in a variable name.

    Markup tags are stripped, every character other than ASCII letters,
    digits, underscore and whitespace is dropped, and each run of whitespace
    (spaces, tabs, line breaks, carriage returns) collapses to one space.

    Args:
        s: Text to clean; must be a ``str``.
        convert_to_lower: When true (the default) the result is lower-cased.

    Returns:
        The cleaned text without leading or trailing whitespace.
    """
    s = re.sub(r'<[^<]+?>', '', s)  # markup
    s = re.sub(r'[^0-9a-zA-Z_\s]', '', s)  # keep alnum, underscore, whitespace
    # Tabs, line breaks and carriage returns are handled by split()/join below.
    # The former re.sub('\t\n\r', '', s) only matched that exact three-character
    # sequence and, when it did, glued the neighbouring words together.
    s = ' '.join(s.split())  # normalise every whitespace run to one space
    return s.lower() if convert_to_lower else s

# remove unnecessary part in value
def clean_phrase(s, convert_to_lower=False):
    """Normalise the whitespace of a phrase taken from the graph.

    Args:
        s: Text to clean, or ``None``.
        convert_to_lower: When true the result is lower-cased (default: keep
            the original casing).

    Returns:
        ``None`` when ``s`` is ``None`` or the empty string; otherwise ``s``
        with leading/trailing whitespace removed and each internal run of
        whitespace (spaces, tabs, line breaks, carriage returns) replaced by
        one space. A whitespace-only input therefore yields ``''``.
    """
    if s is None or s == '':
        return None
    # split()/join treats tabs, line breaks and carriage returns like any other
    # whitespace. The former re.sub('\t\n\r', '', s) only matched that exact
    # three-character sequence and, when it did, glued the neighbouring words
    # together.
    s = ' '.join(s.split())
    return s.lower() if convert_to_lower else s

In [32]:
# create variable
def make_var(phrase, signature='_', keep_n_words=4):
    """Return the ``%(...)%`` placeholder for ``phrase``, creating it on first use.

    The whitespace-normalised phrase (see ``clean_phrase``) is the key of the
    module-level ``var_phrase_map``; the placeholder is its value. A phrase that
    was seen before gets the placeholder it was given the first time.

    Args:
        phrase: Text to replace with a placeholder, or ``None``.
        signature: Prefix of the generated variable name.
        keep_n_words: Maximum number of non-stop-words of the phrase that are
            appended to ``signature``.

    Returns:
        ``''`` when the phrase is ``None``, empty or whitespace-only; otherwise
        a string of the form ``%(<signature>_<word>_<word>...)%``.
    """
    the_clean_phrase = clean_phrase(phrase)
    # Guard on the cleaned phrase so that a whitespace-only phrase is treated
    # like an empty one instead of registering a variable for ''.
    if not the_clean_phrase:
        return ''
    if the_clean_phrase in var_phrase_map:
        return var_phrase_map[the_clean_phrase]

    name_words = [word for word in clean_name(phrase).split(' ') if word not in STOPWORDS]
    h = signature + '_' + '_'.join(name_words[:keep_n_words])
    v = '%(' + str(h) + ')%'
    # key: cleaned phrase, value: placeholder that replaces it
    var_phrase_map[the_clean_phrase] = v
    return v

In [33]:
# replace phrase with variable
def iterate_generic(tag: str, root):
    """Replace the text of ``tag`` elements under ``root`` with placeholders.

    For every tag except ``'Input'`` the element's own text is replaced. For
    ``'Input'`` the text of the element's first child is used, and only the
    first Input that actually gets a replacement is rewritten; later Input
    elements are left untouched.

    Elements are skipped when their text already appears in the module-level
    ``table_new_index_list`` or is missing/blank after ``clean_phrase``.
    The placeholder signature is ``<tag>_<count>``, where ``count`` starts at 1
    and advances after each replacement.

    Args:
        tag: Element name to look for.
        root: Element whose subtree is searched with ``root.iter(tag)``; the
            matching elements are modified in place.
    """
    is_input = tag == 'Input'
    count = 1
    for element in root.iter(tag):
        if is_input:
            if count > 1:
                # One Input has been replaced already; every remaining Input
                # would be skipped, so stop scanning.
                break
            if len(element) == 0:
                # No child element to carry the text; nothing to replace.
                continue
            target = element[0]
        else:
            target = element

        txt = target.text
        if txt in table_new_index_list or not clean_phrase(txt):
            continue

        target.text = make_var(txt, signature=tag + '_' + str(count))
        count += 1

In [34]:
# read the tags and call all functions above
def process_file(infile_brd, infile_table, outfile_table, outfile_brd):
    """Replace phrases in a .brd graph with variables and extend its table.

    The graph is parsed, the text of the handled tags is replaced by
    placeholders via ``iterate_generic``, and one row per new placeholder is
    appended to the mass production table, with the original phrase repeated in
    every column.

    Side effects: rebinds the module-level names ``tree``, ``root``,
    ``var_phrase_map``, ``store_dict`` and ``table_new_index_list``, prints
    progress messages, and writes both output files.

    Args:
        infile_brd: Path of the XML graph to read.
        infile_table: Path of the tab-separated table to read; its first
            column becomes the index.
        outfile_table: Path the extended table is written to (tab-separated,
            UTF-8).
        outfile_brd: Path the rewritten graph is written to.
    """
    # make_var() and iterate_generic() read var_phrase_map and
    # table_new_index_list from module scope, so they are rebound on every run.
    global tree, root, var_phrase_map, store_dict, table_new_index_list

    tree = ET.parse(infile_brd)
    print("mass production brd input read")
    print("path: " + infile_brd)
    root = tree.getroot()
    var_phrase_map = dict()
    store_dict = {}  # not read anywhere in the visible code; kept as before

    table_new = pd.read_csv(infile_table, sep="\t", index_col=0)
    print("mass production table input read")
    print("path: " + infile_table)
    table_new_index_list = table_new.index.tolist()

    tags = ['hintMessage', 'buggyMessage', 'successMessage',
            'label', 'Input']
    for tag in tags:
        iterate_generic(tag, root)

    # One row per new variable, the phrase repeated under every table column.
    # Built positionally and renamed afterwards so that it also works when no
    # variable was created, when the table has no columns, or when column
    # names repeat.
    phrases = list(var_phrase_map.keys())
    variables = list(var_phrase_map.values())
    df_dup = pd.DataFrame(
        {position: phrases for position in range(len(table_new.columns))},
        index=variables,
    )
    df_dup.columns = table_new.columns
    df_mix = pd.concat([table_new, df_dup])

    # export the csv
    df_mix.to_csv(outfile_table, encoding="utf-8", sep="\t")
    print("mass production table output finished")
    print("path: " + outfile_table)

    # export the brd
    # NOTE(review): written with ElementTree's default settings (US-ASCII,
    # no XML declaration) - confirm the consumer of the .brd accepts that.
    tree.write(outfile_brd)
    print("mass production brd output finished")
    print("path: " + outfile_brd)

In [42]:
# Stop words left out of generated variable names. make_var() only tests
# membership, which a set answers without scanning the whole word list.
STOPWORDS = set(stopwords.words('english'))

# Where the latest mass production files live and where the cleaned copies go.
# Change these three values to process another problem set.
# The output folder is not created here.
INPUT_DIR = "./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/"
OUTPUT_DIR = "./Output_cleaned_folder/"
PROBLEM_SET = "6_5"

# read the latest mass production graph
infile_brd = INPUT_DIR + PROBLEM_SET + ".brd"

# read the latest mass production table
infile_table = INPUT_DIR + PROBLEM_SET + ".txt"

# set the output paths
outfile_brd = OUTPUT_DIR + PROBLEM_SET + "_cleaned.brd"
outfile_table = OUTPUT_DIR + PROBLEM_SET + "_cleaned.txt"

# run the process function
process_file(infile_brd, infile_table, outfile_table, outfile_brd)

mass production brd input read
path: ./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.brd
mass production table input read
path: ./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.txt
mass production table output finished
path: ./Output_cleaned_folder/6_5_cleaned.txt
mass production brd output finished
path: ./Output_cleaned_folder/6_5_cleaned.txt
