In [1]:
# import glob
# import json

import xml.etree.ElementTree as ET
# from bs4 import BeautifulSoup
# from tqdm import tqdm

import translators.server as tss

import re
from nltk.corpus import stopwords

import pandas as pd

Using state Pennsylvania server backend.


In [2]:
# Path to the latest mass production graph (.brd); the file is parsed with ET.parse in a later cell.
# NOTE(review): the doubled slash after "HTML_folder" looks unintentional — confirm before normalising.
infile = "./HTML_folder//6.05 HTML/6.05 HTML/MassProduction/6_5.brd"
# NLTK English stop-word list; make_var uses it to drop filler words from variable names.
STOPWORDS = stopwords.words('english')

In [3]:
# read the latest mass production table (tab-separated; the first column becomes the index)
path_new = "./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.txt"
table_new = pd.read_csv(path_new, sep="\t", index_col=0)
# Row labels of the table; iterate_generic skips any element whose text equals one of these.
table_new_index_list = table_new.index.tolist()

In [4]:
# define the basic structure
# Parse the .brd graph as XML; `tree` is written back to disk in the final cell.
tree = ET.parse(infile)
root = tree.getroot()
# Maps a cleaned phrase to its %(...)% placeholder; filled by make_var.
var_phrase_map = dict()
# NOTE(review): store_dict is not used by any cell visible here — confirm whether it can be removed.
store_dict = {}

# XML tag names whose text is replaced by placeholders.
tags = ['hintMessage', 'buggyMessage', 'successMessage', 
        'label', 'Input']

In [5]:
# create name for variable
def clean_name(s, convert_to_lower=True):
    """Reduce a phrase to plain words suitable for building a variable name.

    Markup tags are dropped, every character other than ASCII letters,
    digits, underscore and whitespace is removed, and each run of whitespace
    (including tabs and line breaks) is collapsed to a single space.

    Args:
        s: Text to clean; must be a string.
        convert_to_lower: Lower-case the result when True (the default).

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    s = re.sub(r'<[^<]+?>', '', s)  # markup
    s = re.sub(r'[^0-9a-zA-Z_\s]', '', s)  # keep alnum, underscore, whitespace
    # Tabs and line breaks separate words. The former pattern '\t\n\r' only
    # matched that exact three-character sequence and deleted it, which glued
    # the neighbouring words together.
    s = re.sub(r'[\t\n\r]+', ' ', s)
    s = ' '.join(s.split())  # remove redundant whitespace
    return s.lower() if convert_to_lower else s

# remove unnecessary part in value
def clean_phrase(s, convert_to_lower=False):
    """Normalise the whitespace of a phrase while keeping its content intact.

    Markup and punctuation are deliberately left in place (see the TODO
    below); only tabs, line breaks and repeated spaces are collapsed.

    Args:
        s: Phrase to clean, or None.
        convert_to_lower: Lower-case the result when True (default False).

    Returns:
        None when ``s`` is None or the empty string; otherwise the phrase with
        every whitespace run replaced by a single space. A whitespace-only
        phrase therefore yields ``''``.
    """
    if s is None or s == '':
        return None
    # TODO (check if html references should be preserved in translation)
    # s = re.sub('<[^<]+?>', '', s) #markup
    # s = re.sub('[^0-9a-zA-Z_\'\s*/+-äöüÄÖÜß!?\.\u0590-\u05fe]', '', s) #keep alnum, operators, umlauts, hebrew, and apostrophe
    # Tabs and line breaks separate words. The former pattern '\t\n\r' only
    # matched that exact three-character sequence and deleted it, which glued
    # the neighbouring words together.
    s = re.sub(r'[\t\n\r]+', ' ', s)
    s = ' '.join(s.split())  # remove redundant whitespace
    return s.lower() if convert_to_lower else s

In [6]:
# create variable
def make_var(phrase, signature='_', keep_n_words=4):
    """Return the ``%(...)%`` placeholder for ``phrase``, creating it on first use.

    The whitespace-normalised phrase (``clean_phrase``) is the lookup key in
    ``var_phrase_map``. For a phrase not seen before, the placeholder name is
    ``signature``, an underscore, and up to ``keep_n_words`` non-stop-words of
    ``clean_name(phrase)`` joined by underscores.

    Args:
        phrase: Text to be replaced by a placeholder.
        signature: Prefix of the generated variable name.
        keep_n_words: Maximum number of words taken from the phrase.

    Returns:
        ``''`` when ``phrase`` is None or empty, otherwise the placeholder.
    """
    if phrase is None or phrase == '':
        return ''

    key = clean_phrase(phrase)
    try:
        # phrase already registered: reuse its placeholder
        return var_phrase_map[key]
    except KeyError:
        pass

    name_words = []
    for word in clean_name(phrase).split(' '):
        if word not in STOPWORDS:
            name_words.append(word)
    name = signature + '_' + '_'.join(name_words[:keep_n_words])
    placeholder = f'%({name})%'
    # key: cleaned phrase, value: placeholder that replaces it
    var_phrase_map[key] = placeholder
    return placeholder

In [7]:
# replace phrase with variable
def iterate_generic(tag: str, root):
    """Swap the text of ``tag`` elements under ``root`` for placeholders.

    For ``'Input'`` elements the text of the first child is used instead of
    the element's own text, and nothing more is replaced once one ``Input``
    has been handled. Text that is listed in ``table_new_index_list`` or that
    is empty after ``clean_phrase`` is left unchanged. The tree is modified
    in place; nothing is returned.
    """
    is_input = tag == 'Input'
    count = 1  # running number that goes into the placeholder signature
    for element in root.iter(tag):
        if is_input and count > 1:
            # one Input has already been replaced; leave the rest alone
            continue
        # an Input keeps its text in its first child element
        holder = element[0] if is_input else element
        txt = holder.text
        if txt in table_new_index_list:
            continue
        if not clean_phrase(txt):
            # clean_phrase gave None or '': nothing to replace
            continue
        holder.text = make_var(txt, signature=tag + '_' + str(count))
        count += 1

In [8]:
# Run the replacement for every configured tag: iterate_generic rewrites the
# element text under `root` in place, and make_var records each phrase in var_phrase_map.
for tag in tags:
    iterate_generic(tag, root)  

In [9]:
# create new dataframe and concat it with the latest mass production table
# One row per generated placeholder: the index is the %(...)% variable and the
# single column holds the phrase it replaced.
df_new = pd.DataFrame(list(var_phrase_map.keys()), index=list(var_phrase_map.values()))
# Repeat each phrase once per column of the mass production table. Building the
# frame directly (instead of concat + transpose followed by a column rename)
# also works when no phrase was extracted; the old form raised a length
# mismatch in that case.
df_dup = pd.DataFrame(
    [[phrase] * len(table_new.columns) for phrase in var_phrase_map],
    index=list(var_phrase_map.values()),
    columns=table_new.columns,
)
# Append the new rows below the existing table.
df_mix = pd.concat([table_new, df_dup])
df_mix

Unnamed: 0,1,2,3,4,5,6,7,8
%(startStateNodeName)%,example1,example2,example3,,,,,
%(statement)%,"Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer...","Using the set of shapes in the picture, answer..."
%(graphic)%,Assets/01.png,Assets/02.png,Assets/03.png,Assets/04.png,Assets/05.png,Assets/06.png,Assets/07.png,Assets/08.png
%(question1)%,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?,(1) What percent of the shapes are %(type1)%?
%(question2)%,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?,(2) What percent of the shapes are %(type2)%?
...,...,...,...,...,...,...,...,...
%(label_4_count_total_items)%,Count total items,Count total items,Count total items,Count total items,Count total items,Count total items,Count total items,Count total items
%(label_5_count_target_items)%,Count target items,Count target items,Count target items,Count target items,Count target items,Count target items,Count target items,Count target items
%(label_6_enter_unreduced_ratio)%,Enter unreduced ratio,Enter unreduced ratio,Enter unreduced ratio,Enter unreduced ratio,Enter unreduced ratio,Enter unreduced ratio,Enter unreduced ratio,Enter unreduced ratio
%(label_7_enter_fraction_100)%,Enter as fraction out of 100,Enter as fraction out of 100,Enter as fraction out of 100,Enter as fraction out of 100,Enter as fraction out of 100,Enter as fraction out of 100,Enter as fraction out of 100,Enter as fraction out of 100


In [12]:
# export the combined table as tab-separated, UTF-8 encoded text
# NOTE(review): to_csv does not create missing folders — presumably ./Output_cleaned_folder already exists; confirm.
df_mix.to_csv('./Output_cleaned_folder/6.5_cleaned.txt', encoding="utf-8", sep="\t")

In [13]:
# export the brd
output = "./Output_cleaned_folder/6.5_cleaned.brd"
# ElementTree.write defaults to US-ASCII without an XML declaration, turning
# every non-ASCII character into a numeric character reference. Write UTF-8
# with a declaration instead, matching the encoding of the exported table.
tree.write(output, encoding="utf-8", xml_declaration=True)