In [25]:
import xml.etree.ElementTree as ET
import re
from nltk.corpus import stopwords
import pandas as pd
import glob
from tqdm import tqdm
from xmldiff import main
import os
import translators.server as tss

STOPWORDS = stopwords.words('english')

In [26]:
class clean():
    """Replace literal text in a mass-production .brd with ``%(name)%`` variables.

    The .brd file is parsed as XML and the text of a fixed set of tags is
    inspected.  Text that is not yet an index label of the mass-production
    table gets a generated variable name and is collected as a new table row;
    hash-like (purely numeric) variable names that already are index labels
    are renamed to a readable name.  ``clean_file`` writes the rewritten .brd
    and the extended table.

    Relies on the module-level ``STOPWORDS`` list when building names.
    """

    def __init__(self, infile_brd, infile_table, outfile_brd, outfile_table):
        """Store the input/output paths; nothing is read until ``clean_file``."""
        self.infile_brd = infile_brd
        self.infile_table = infile_table
        self.outfile_brd = outfile_brd
        self.outfile_table = outfile_table
        # cleaned phrase -> generated variable name (becomes new table rows)
        self.var_phrase_map = dict()
        # hash-like variable name -> readable replacement name
        self.var_name_map = dict()
        # mass-production table (DataFrame), loaded by clean_file()
        self.table_new = None
        # index labels of the table as they were right after loading
        self.table_new_index_list = None

    def clean_name(self, s, convert_to_lower=True):
        """Reduce *s* to words usable inside a variable name.

        Markup tags are stripped, every character other than ASCII
        alphanumerics, ``_`` and whitespace is dropped, and whitespace runs
        (including tabs and line breaks) are collapsed to single spaces.
        """
        s = re.sub(r'<[^<]+?>', '', s)  # markup
        s = re.sub(r'[^0-9a-zA-Z_\s]', '', s)  # keep alnum, "_" and whitespace
        s = ' '.join(s.split())  # collapse whitespace, incl. tab / newline / CR
        return s.lower() if convert_to_lower else s

    def clean_phrase(self, s, convert_to_lower=False):
        """Collapse whitespace in a value; return ``None`` for ``None`` or ``''``."""
        if s is None or s == '':
            return None
        s = ' '.join(s.split())  # collapse whitespace, incl. tab / newline / CR
        return s.lower() if convert_to_lower else s

    def find_hash(self, s):
        """Return True when *s* is a hash-like variable.

        Hash-like means: after removing every ``%(`` and ``)%`` and any
        leading ``-``, only digits remain.
        """
        if s is None or s == '':
            return False
        s = s.replace('%(', '').replace(')%', '')
        return s.lstrip('-').isdigit()  # ignore "-" in the variable

    def _build_var(self, phrase, signature, keep_n_words):
        """Return ``%(<signature>_<first non-stopword words of phrase>)%``."""
        words = [word for word in self.clean_name(phrase).split(' ')
                 if word not in STOPWORDS]
        return '%(' + signature + '_' + '_'.join(words[:keep_n_words]) + ')%'

    def change_var(self, old_name, signature='_', keep_n_words=4):
        """Rename a hash-like table row to a readable variable name.

        The name is derived from the row's value in the first table column.
        The table index is renamed in place and the mapping is remembered, so
        a repeated call with the same *old_name* returns the same name.

        Returns ``''`` when *old_name* is ``None``/empty or the value is null.
        When the value is an empty string no readable name can be derived and
        *old_name* is returned unchanged, which keeps the .brd and the table
        consistent.
        """
        if old_name in self.var_name_map:
            return self.var_name_map[old_name]
        # Guard before touching the table: looking up an empty label would raise.
        if old_name is None or old_name == '':
            return ''
        phrase = self.table_new.loc[old_name].iloc[0]  # value in the first column
        if pd.isnull(phrase):
            return ''
        the_clean_phrase = self.clean_phrase(str(phrase))
        if the_clean_phrase is None:
            # The table is read with keep_default_na=False, so blank cells are ''.
            return old_name
        v = self._build_var(the_clean_phrase, signature, keep_n_words)
        self.var_name_map[old_name] = v
        self.table_new.rename(index={old_name: v}, inplace=True)
        return v

    def make_var(self, phrase, signature='_', keep_n_words=4):
        """Return the variable name for *phrase*, creating it on first use.

        Phrases are compared after whitespace normalisation, so the same text
        in several elements shares one variable.  Returns ``''`` for
        ``None``/empty input.
        """
        if phrase is None or phrase == '':
            return ''
        the_clean_phrase = self.clean_phrase(phrase)
        if the_clean_phrase in self.var_phrase_map:
            return self.var_phrase_map[the_clean_phrase]
        v = self._build_var(the_clean_phrase, signature, keep_n_words)
        self.var_phrase_map[the_clean_phrase] = v  # key: value, value: variable
        return v

    def process_txt(self, txt, element, tag, count):
        """Replace the text of one element by a variable name where needed.

        * blank text: left alone;
        * text that is a table index label and not hash-like: left alone;
        * text that is a table index label and hash-like: renamed (``change_var``);
        * anything else: turned into a new variable (``make_var``).

        For ``tag == 'Input'`` the result is written to the element's first
        child, otherwise to the element itself.
        """
        if not self.clean_phrase(txt):
            return
        in_table = txt in self.table_new_index_list
        if in_table and not self.find_hash(txt):
            return
        signature = tag + '_' + str(count)
        if in_table:
            new_text = self.change_var(txt, signature=signature)
        else:
            new_text = self.make_var(txt, signature=signature)
        target = element[0] if tag == 'Input' else element
        target.text = new_text

    def iterate_generic(self, tag: str, root):
        """Process every *tag* element below *root*.

        *count* is the 1-based position of the element among the elements
        with this tag and becomes part of generated variable names.
        """
        for count, element in enumerate(root.iter(tag), start=1):
            if tag == 'Input':
                if len(element) == 0:
                    # No child to read from or write to; indexing would raise.
                    continue
                # NOTE(review): when the first child is not <value> the text is
                # read from <Input> itself but process_txt still writes to the
                # first child -- kept as in the original, confirm the intent.
                txt = element[0].text if element[0].tag == 'value' else element.text
            else:
                txt = element.text
            self.process_txt(txt, element, tag, count)

    def clean_file(self):
        """Run the cleaning and write the new .brd and table.

        Returns:
            ``(table_new, df_mix)``: the loaded table (with renamed index
            labels) and the table extended by one row per new variable, the
            phrase being repeated in every column.
        """
        tree = ET.parse(self.infile_brd)
        print("mass production brd input read")
        print("path: " + self.infile_brd)
        root = tree.getroot()

        self.table_new = pd.read_csv(self.infile_table, sep="\t", index_col=0, keep_default_na=False)
        self.table_new_index_list = self.table_new.index.tolist()
        print("mass production table input read")
        print("path: " + self.infile_table)

        tags = ('hintMessage', 'successMessage', 'buggyMessage', 'label', 'Input')
        for tag in tags:
            self.iterate_generic(tag, root)

        if self.var_phrase_map:
            # One row per new variable, same phrase in every column.
            n_columns = len(self.table_new.columns)
            df_dup = pd.DataFrame(
                [[phrase] * n_columns for phrase in self.var_phrase_map],
                index=list(self.var_phrase_map.values()),
                columns=self.table_new.columns,
            )
            df_mix = pd.concat([self.table_new, df_dup])
        else:
            # Nothing new was created: the output table is the input table.
            df_mix = self.table_new.copy()
        df_mix.index.name = self.table_new.index.name

        # export the csv
        df_mix.to_csv(self.outfile_table, encoding="utf-8", sep="\t")
        print("mass production table output finished")
        print("path: " + self.outfile_table)

        # export the brd
        tree.write(self.outfile_brd)
        print("mass production brd output finished")
        print("path: " + self.outfile_brd)

        return self.table_new, df_mix

In [27]:
class mass_produce:
    """Create one .brd per table column by filling in ``%(name)%`` variables.

    The table is a tab-separated file whose first column holds the variable
    names and whose remaining columns hold the values, one column per
    problem.  Both the template and the generated files are handled as UTF-8,
    the encoding the table itself is written in.
    """

    # Content of a <ProblemName> / <text> element that sits on a single line.
    _PROBLEM_NAME = re.compile(r'(<ProblemName>).*?(</ProblemName>)')
    _TEXT = re.compile(r'(<text>).*?(</text>)')

    def __init__(self, infile_brd, infile_table, outfile_folder):
        """Store the template path, the table path and the output folder.

        *outfile_folder* is used as a plain prefix, so it has to end with a
        path separator.
        """
        self.infile_brd = infile_brd
        self.infile_table = infile_table
        self.outfile_folder = outfile_folder

    @staticmethod
    def _next_variable(text, skipped):
        """Return the first ``%(...)%`` span of *text* not in *skipped*, else None."""
        pos = 0
        while True:
            start = text.find("%(", pos)
            if start == -1:
                return None
            end = text.find(")%", start + 2)
            if end == -1:
                return None  # unterminated variable: nothing left to replace
            variable = text[start:end + 2]
            if variable not in skipped:
                return variable
            pos = start + 2

    @staticmethod
    def _lookup(table, variable, column_name):
        """Return the value of *variable* in *column_name* as str, or None.

        A message is printed when the variable is missing, or when the lookup
        is ambiguous because of duplicate labels.
        """
        try:
            value = table.loc[variable, column_name]
        except KeyError:
            print(variable + " doesn't exist")
            return None
        if isinstance(value, (pd.Series, pd.DataFrame)):
            print(variable + " is ambiguous")
            return None
        return str(value)

    def replace_var(self):
        """Return the table with variables inside the values resolved.

        Cells are processed column by column, top to bottom, and updated in
        place, so a lookup sees the values already resolved earlier.  Per
        cell at most as many lookups are made as the cell initially contains
        ``%(`` markers.  A variable that cannot be resolved is reported once
        and skipped, so it does not block the other variables of the cell.
        """
        table_clean = pd.read_csv(self.infile_table, sep="\t", index_col=0, keep_default_na=False)
        for column, column_name in enumerate(table_clean.columns):
            for row in range(table_clean.shape[0]):
                content_new = str(table_clean.iloc[row, column])
                skipped = set()
                for _ in range(content_new.count("%(")):
                    variable = self._next_variable(content_new, skipped)
                    if variable is None:
                        break
                    value = self._lookup(table_clean, variable, column_name)
                    if value is None:
                        skipped.add(variable)
                        continue
                    content_new = content_new.replace(variable, value)
                    table_clean.iloc[row, column] = content_new
        return table_clean

    def mass_produce_file(self):
        """Write ``<outfile_folder><column>.brd`` for every table column.

        Per line of the template:

        * the content of ``<ProblemName>`` elements becomes the column name;
        * the content of the first ``<text>`` element of the file becomes the
          column name (only when it opens and closes on the same line);
        * every ``%(name)%`` variable is replaced by its value in the column;
          after a replacement ``<%`` and ``%>`` in the line are escaped as
          ``&lt;%`` and ``%&gt;``.
        """
        table_clean = self.replace_var()
        for column_name in table_clean.columns:
            name = str(column_name)
            fout = self.outfile_folder + name + ".brd"

            def insert_name(match, name=name):
                # Splice by position instead of str.replace(old, new): the old
                # content may be empty or may also occur inside the markup.
                return match.group(1) + name + match.group(2)

            first_text_seen = False
            with open(self.infile_brd, 'r', encoding='utf-8') as infile, \
                    open(fout, 'w+', encoding='utf-8') as outfile:
                for line in infile:
                    line_str = line.replace('\r', '')
                    # replace the problem name
                    line_str = self._PROBLEM_NAME.sub(insert_name, line_str)
                    # replace text in first node
                    if not first_text_seen and '<text>' in line_str:
                        first_text_seen = True
                        line_str = self._TEXT.sub(insert_name, line_str, count=1)
                    # replace the variables
                    skipped = set()
                    for _ in range(line_str.count("%(")):
                        variable = self._next_variable(line_str, skipped)
                        if variable is None:
                            break
                        value = self._lookup(table_clean, variable, column_name)
                        if value is None:
                            skipped.add(variable)
                            continue
                        line_str = (line_str.replace(variable, value)
                                    .replace("<%", "&lt;%")
                                    .replace("%>", "%&gt;"))
                    outfile.write(line_str)
            print(fout.split("/")[-1] + " finished")

In [28]:
class validate():
    """Compare the .brd files of two folders pairwise.

    Both folders are used as plain prefixes, so they have to end with a path
    separator.
    """

    def __init__(self, old_folder, new_folder):
        """Store the reference folder and the folder with the new files."""
        self.old_folder = old_folder
        self.new_folder = new_folder

    def check(self, old_brd, new_brd):
        """Compare the text content of two XML files.

        The texts of all elements are collected into sets, so order and
        repetition of the texts are ignored.

        Returns:
            ``(set_old, set_new, res)`` with *res* being the string
            ``"True"`` when the sets are equal and ``"False"`` otherwise.
        """
        set_old = set(ET.parse(old_brd).getroot().itertext())
        set_new = set(ET.parse(new_brd).getroot().itertext())
        res = "True" if set_old == set_new else "False"
        return set_old, set_new, res

    def check_xmldiff(self, old_brd, new_brd):
        """Compare two XML files with xmldiff.

        Returns:
            ``(diff, res)`` with *res* being ``"True"`` when xmldiff reports
            no differences and ``"False"`` otherwise.
        """
        diff = main.diff_files(old_brd, new_brd)
        res = "True" if len(diff) == 0 else "False"
        return diff, res

    def _find_counterpart(self, old_brd):
        """Return the path in ``new_folder`` matching *old_brd*, or None.

        Tried in this order: the same file name, the file name without
        ``Problem``, the file name prefixed with ``Problem``.  Only the file
        name is altered, never the folder part of the path.
        """
        name = os.path.basename(old_brd)
        for candidate in (name, name.replace('Problem', ''), 'Problem' + name):
            path = self.new_folder + candidate
            if os.path.exists(path):
                return path
        return None

    def validate_file(self):
        """Check every entry of ``old_folder`` against its counterpart.

        Prints one line per file: the file name followed by the results of
        ``check`` and ``check_xmldiff``, or a notice when no counterpart
        exists in ``new_folder``.
        """
        fs_brd = glob.glob(self.old_folder + "*")
        for old_brd in tqdm(fs_brd, position=0, leave=True):
            name = os.path.basename(old_brd)
            new_brd = self._find_counterpart(old_brd)
            if new_brd is None:
                print(name, " cannot find reference")
                continue
            _, _, res_tree = self.check(old_brd, new_brd)
            _, res_diff = self.check_xmldiff(old_brd, new_brd)
            print(name, res_tree, res_diff)

In [29]:
class translate():
    """Build a translated copy of a mass-production table.

    Every value column of the table is followed by a column holding its
    translation.  Translations are taken from a reference sheet when the
    text is found there, otherwise from Google via ``translators``.  Two
    files are written: one where machine translations are marked with a
    ``[google]`` prefix, and one without marks.
    """

    def __init__(self, path_new, path_ref, path_output, path_no_mark, TARGET_LANG = 'es'):
        """Store the paths and the target language.

        Args:
            path_new: tab-separated mass-production table to translate.
            path_ref: CSV reference sheet with existing translations.
            path_output: output table with ``[google]`` marks.
            path_no_mark: output table without marks.
            TARGET_LANG: language code handed to the translator.  The
                translated columns are named ``<column>ESP`` regardless.
        """
        self.path_new = path_new
        self.path_ref = path_ref
        self.path_output = path_output
        self.path_no_mark = path_no_mark
        self.TARGET_LANG = TARGET_LANG

    @staticmethod
    def _next_variable(text, skipped):
        """Return the first ``%(...)%`` span of *text* not in *skipped*, else None."""
        pos = 0
        while True:
            start = text.find("%(", pos)
            if start == -1:
                return None
            end = text.find(")%", start + 2)
            if end == -1:
                return None  # unterminated variable: nothing left to replace
            variable = text[start:end + 2]
            if variable not in skipped:
                return variable
            pos = start + 2

    @staticmethod
    def _lookup(table, variable, column_name):
        """Return the value of *variable* in *column_name* as str, or None.

        A message is printed when the variable is missing, or when the lookup
        is ambiguous because of duplicate labels.
        """
        try:
            value = table.loc[variable, column_name]
        except KeyError:
            print(variable + " doesn't exist")
            return None
        if isinstance(value, (pd.Series, pd.DataFrame)):
            print(variable + " is ambiguous")
            return None
        return str(value)

    def replace_var(self):
        """Return the table with variables inside the values resolved.

        Cells are processed column by column, top to bottom, and updated in
        place, so a lookup sees the values already resolved earlier.  Per
        cell at most as many lookups are made as the cell initially contains
        ``%(`` markers.  A variable that cannot be resolved is reported once
        and skipped, so it does not block the other variables of the cell.
        """
        table_clean = pd.read_csv(self.path_new, sep="\t", index_col=0, keep_default_na=False)
        for column, column_name in enumerate(table_clean.columns):
            for row in range(table_clean.shape[0]):
                content_new = str(table_clean.iloc[row, column])
                skipped = set()
                for _ in range(content_new.count("%(")):
                    variable = self._next_variable(content_new, skipped)
                    if variable is None:
                        break
                    value = self._lookup(table_clean, variable, column_name)
                    if value is None:
                        skipped.add(variable)
                        continue
                    content_new = content_new.replace(variable, value)
                    table_clean.iloc[row, column] = content_new
        return table_clean

    def find_founction(self, content_new, replacement="#*"):
        """Mask the function-like part (``<%...%>``) of a value.

        The span from the first ``<%`` to the last ``%>`` is replaced by
        *replacement* so that it is not sent to the translator.

        Returns:
            ``(content_out, function, replacement)``; *function* is the
            masked span, or ``None`` when there is no ``<%`` followed by a
            ``%>`` (then *content_out* is the input unchanged).
        """
        text = str(content_new)
        start = text.find("<%")
        end = text.rfind("%>")
        # A "%>" in front of the first "<%" does not close anything.
        if start == -1 or end < start:
            return content_new, None, replacement
        function = content_new[start:end + 2]
        return content_new.replace(function, replacement), function, replacement

    def create_table(self, table_clean=None):
        """Return two empty tables with the layout of the translated output.

        Args:
            table_clean: table whose columns and index are used; when
                omitted it is loaded with ``replace_var``.

        Returns:
            ``(table_translated, table_translated_clean)``: two separate
            all-NaN frames whose columns are ``c, cESP`` for every column
            ``c`` of *table_clean* and whose index is that of *table_clean*.
        """
        if table_clean is None:
            table_clean = self.replace_var()

        # Every column is followed by the column for its translation.
        table_clean_col_list = []
        for column_name in table_clean:
            table_clean_col_list.append(column_name)
            table_clean_col_list.append(column_name + "ESP")

        table_translated = pd.DataFrame(columns=table_clean_col_list, index=table_clean.index)
        table_translated_clean = pd.DataFrame(columns=table_clean_col_list, index=table_clean.index)

        return table_translated, table_translated_clean

    def _translate_cell(self, content, table_old, google_dict):
        """Return ``(marked, plain)`` translation of one cell value.

        Numbers and empty strings are returned unchanged.  Otherwise the
        value is looked up in *google_dict* (earlier machine translations),
        then in the reference sheet, and finally sent to Google.
        """
        # digit or empty, keep the original
        if content.lstrip('-').replace(".", "").isdigit() or content == '':
            print("digital or empty")
            return content, content
        # search in dict
        if content in google_dict:
            print("find translation in dict")
            return '[google]' + google_dict[content], google_dict[content]
        matching_columns = table_old.columns[(table_old == content).any()]
        # translate in google
        if matching_columns.empty:
            print("use google translation")
            try:
                masked, function, replacement = self.find_founction(content)
                translation = tss.google(masked, from_language='en', to_language=self.TARGET_LANG)
                if function is not None:
                    translation = translation.replace(replacement, function)
            except Exception as err:
                # Best effort: keep going, and do not cache the failure so an
                # identical cell gets another attempt.
                print("translation failed: " + repr(err))
                return '[google]error', 'error'
            # Cache under the original text, which is what the lookup uses.
            google_dict[content] = translation
            return '[google]' + translation, translation
        # find translation in old table
        print("find translation in sheet")
        column_name_old = matching_columns[0]
        column_num_old = table_old.columns.get_loc(column_name_old)
        # NOTE(review): assumes the translation sits in the column right after
        # the one holding the English text -- confirm against the sheet layout.
        translated = table_old[table_old[column_name_old] == content].iloc[0, column_num_old + 1]
        return translated, translated

    def translate_file(self):
        """Translate the table and write both output files.

        The row ``%(startStateNodeName)%`` is copied untranslated.  In the
        marked output, cells still missing at the end are written as
        ``[CTAT]NAN``.
        """
        # read the reference sheet without header to locate the header row
        table_old = pd.read_csv(self.path_ref, header = None)
        # rows above 'Problem Name' are skipped
        header_index = table_old.index[table_old[0] == 'Problem Name'].to_list()
        # reload the csv
        table_old = pd.read_csv(self.path_ref, header = header_index)

        table_clean = self.replace_var()
        table_translated, table_translated_clean = self.create_table(table_clean)

        google_dict = {}
        for column, column_name in enumerate(table_clean.columns):
            column_num = table_translated.columns.get_loc(column_name)
            for row in range(table_clean.shape[0]):
                content_new = str(table_clean.iloc[row, column])
                # write the english column; positional assignment on the frame
                # itself, because chained assignment does not reach the frame
                # under pandas Copy-on-Write
                table_translated.iloc[row, column_num] = content_new
                table_translated_clean.iloc[row, column_num] = content_new
                if table_clean.index[row] == '%(startStateNodeName)%':
                    print("skip '%(startStateNodeName)%'")
                    content_translated = content_new
                    content_translated_clean = content_new
                else:
                    print(content_new)
                    content_translated, content_translated_clean = self._translate_cell(
                        content_new, table_old, google_dict)
                    print(content_translated)
                table_translated.iloc[row, column_num+1] = content_translated
                table_translated_clean.iloc[row, column_num+1] = content_translated_clean

        # export the csv
        table_translated_output = table_translated.fillna("[CTAT]NAN")
        table_translated_output.to_csv(self.path_output, encoding="utf-8", sep="\t")
        table_translated_clean.to_csv(self.path_no_mark, encoding="utf-8", sep="\t")

In [30]:
# # clean 6.05
# html_brd = "./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.brd"
# html_table = "./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.txt"
# clean_brd = "./Output_cleaned_folder/6.05 HTML/6.05 HTML/MassProduction/6_5_cleaned.brd"
# clean_table = "./Output_cleaned_folder/6.05 HTML/6.05 HTML/MassProduction/6_5_cleaned.txt"
# # mass produce for cleaned 6.05
# clean_mp_folder = "./Output_cleaned_folder/6.05 HTML/6.05 HTML/FinalBRDs/"
# # validate for cleaned 6.05
# old_folder_c = "./HTML_folder/6.05 HTML/6.05 HTML/FinalBRDs/"
# # translate for cleaned 6.05
# ref_table = './Greg_table_folder/6.05 - Sheet1.csv'
# transalte_table = './Output_translated_folder/6.05 HTML/6.05 HTML/MassProduction/6.05_translated.txt'
# transalte_table_no_mark = './Output_translated_folder/6.05 HTML/6.05 HTML/MassProduction/6.05_translated_no_mark.txt'
# # mass produce for translated 6.05
# translate_mp_folder = "./Output_translated_folder/6.05 HTML/6.05 HTML/FinalBRDs/"

# Paths for unit 6.05.  The folder arguments end with "/" because the classes
# build file names by plain string concatenation.

# clean 6.05: input .brd / table and the cleaned copies that get written
html_brd = "./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.brd"
html_table = "./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.txt"
clean_brd = "./Output_cleaned_folder/6.05 HTML/6.05 HTML/MassProduction/6_5_cleaned.brd"
clean_table = "./Output_cleaned_folder/6.05 HTML/6.05 HTML/MassProduction/6_5_cleaned.txt"
# mass produce for cleaned 6.05: folder receiving one .brd per table column
clean_mp_folder = "./Output_cleaned_folder/6.05 HTML/6.05 HTML/FinalBRDs/"
# validate for cleaned 6.05: folder with the reference .brd files
old_folder_c = "./HTML_folder/6.05 HTML/6.05 HTML/FinalBRDs/"
# translate for cleaned 6.05: reference sheet and the two translated tables
ref_table = './Greg_table_folder/6.05 - Sheet1.csv'
transalte_table = './Output_translated_folder/6.05 HTML/6.05 HTML/MassProduction/6.05_translated.txt'
transalte_table_no_mark = './Output_translated_folder/6.05 HTML/6.05 HTML/MassProduction/6.05_translated_no_mark.txt'
# mass produce for translated 6.05: folder receiving the translated .brd files
translate_mp_folder = "./Output_translated_folder/6.05 HTML/6.05 HTML/FinalBRDs/"

# Run the pipeline.  Each stage reads files written by an earlier stage, so
# the order of the calls matters.
print("clean task ------")
clean_res = clean(html_brd, html_table, clean_brd, clean_table)
_, _ = clean_res.clean_file()
print("mass produce for clean task ------")
mass_produce_clean_res = mass_produce(clean_brd, clean_table, clean_mp_folder)
mass_produce_clean_res.mass_produce_file()
print("validate for clean task ------")
validate_clean_res = validate(old_folder_c, clean_mp_folder)
validate_clean_res.validate_file()
print("translate task ------")
translate_clean_res = translate(clean_table, ref_table, transalte_table, transalte_table_no_mark)
translate_clean_res.translate_file()
print("mass produce for translation task ------")
# the unmarked translated table is the one used for mass production
mass_produce_translate_res = mass_produce(clean_brd, transalte_table_no_mark, translate_mp_folder)
mass_produce_translate_res.mass_produce_file()
print("validate for translation task ------")
# note: rebinds validate_clean_res, this time for the translated folder
validate_clean_res = validate(old_folder_c, translate_mp_folder)
validate_clean_res.validate_file()

clean task ------
mass production brd input read
path: ./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.brd
mass production table input read
path: ./HTML_folder/6.05 HTML/6.05 HTML/MassProduction/6_5.txt
mass production table output finished
path: ./Output_cleaned_folder/6.05 HTML/6.05 HTML/MassProduction/6_5_cleaned.txt
mass production brd output finished
path: ./Output_cleaned_folder/6.05 HTML/6.05 HTML/MassProduction/6_5_cleaned.txt
mass produce for clean task ------
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
1.brd finished
2.brd finished
3.brd finished
4.brd finished
5.brd finished
6.brd finished
7.brd finished
8.brd finished
validate for clean task ------


 12%|█▎        | 1/8 [00:00<00:01,  3.95it/s]

1.brd True True


 25%|██▌       | 2/8 [00:00<00:01,  4.08it/s]

2.brd True True


 38%|███▊      | 3/8 [00:00<00:01,  4.03it/s]

3.brd True True


 50%|█████     | 4/8 [00:01<00:01,  3.68it/s]

4.brd True True


 62%|██████▎   | 5/8 [00:01<00:00,  3.51it/s]

5.brd True True


 75%|███████▌  | 6/8 [00:01<00:00,  3.74it/s]

6.brd True True


 88%|████████▊ | 7/8 [00:01<00:00,  3.68it/s]

7.brd True True


100%|██████████| 8/8 [00:02<00:00,  3.67it/s]

8.brd True True
translate task ------
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist





%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
%(imgNum)% doesn't exist
skip '%(startStateNodeName)%'
Using the set of shapes in the picture, answer questions 1 through 4 in the worksheet provided.
find translation in sheet
Usando el conjunto de formas en la imagen, responda las preguntas 1 a 4 en la hoja de trabajo provista.
Assets/01.png
find translation in sheet
Assets/01.png
(1) What percent of the shapes are squares?
find translation in sheet
(1) ¿Qué porcentaje de las formas son cuadrados?
(2) What percent of the shapes are purple?
find translation in sheet
(2) ¿Qué porcentaje de las formas son moradas?
(3) What percent of the shapes are not blue?
find translation in sheet
(3) ¿Qué porcentaje de las formas no son azules?
(4) What percent of the shapes are red?
find translation in 

 12%|█▎        | 1/8 [00:00<00:02,  2.84it/s]

1.brd True True


 25%|██▌       | 2/8 [00:00<00:01,  3.36it/s]

2.brd True True


 38%|███▊      | 3/8 [00:00<00:01,  3.16it/s]

3.brd True True


 50%|█████     | 4/8 [00:01<00:01,  3.44it/s]

4.brd True True


 62%|██████▎   | 5/8 [00:01<00:00,  3.65it/s]

5.brd True True


 75%|███████▌  | 6/8 [00:01<00:00,  3.52it/s]

6.brd True True


 88%|████████▊ | 7/8 [00:02<00:00,  3.61it/s]

7.brd True True


100%|██████████| 8/8 [00:02<00:00,  3.50it/s]

8.brd True True



