\# Developer: Ali Hashaam (ali.hashaam@initos.com) <br>
\# 15th December 2018 <br>

\# © 2019 initOS GmbH <br>
\# License MIT <br>

This code translates the German Mantis data into English. To train a single-language (unilingual) model, all of the data first has to be brought into one language.

In [1]:
from textblob import TextBlob
import unidecode, string
import pandas as pd
import logging, os, re, string, time
from textblob.exceptions import NotTranslated, TranslatorError

In [2]:
# Matches a run of newlines, a run of carriage returns or a run of tabs
# (each run is matched as a whole).
regex_tab_newlines = re.compile(r'(\n+)|(\r+)|(\t+)')
# Matches one or more consecutive double-quote characters.
# NOTE(review): not referenced in the visible part of this file.
regex_doublequotes = re.compile(r'\"+')
# Matches anything shaped like a markup tag: '<', one or more characters
# other than '>', then '>'.
remove_html_tags = re.compile(r'<[^>]+>')

In [6]:
def establish_logger(plateform):
    """
    Create a file logger that starts with an empty log file.
        > Parameters:
            plateform : str | Name of the data source; the logger is named
                        '<plateform>_translator'

        > Returns:
            logging.Logger that writes INFO-and-above records to
            'logs/mantis_translation.log'
    """
    filepath = 'logs/mantis_translation.log'
    log_dir = os.path.dirname(filepath)
    # logging.FileHandler does not create missing directories.
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    logger = logging.getLogger(plateform+"_translator")
    logger.setLevel(logging.INFO)
    # getLogger() hands back the same object for the same name, so handlers
    # attached by an earlier call must be detached and closed; otherwise
    # every record would be written once per call made so far.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # mode='w' truncates the log left over from a previous run.
    fh = logging.FileHandler(filepath, mode='w')
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger

In [None]:
def remove_null(df):
    """
    Drop the rows that are completely empty and blank out the remaining nulls.
        > Parameters:
            df : pandas Dataframe Object | Dataframe to be processed

        > Returns:
            Dataframe in which rows made up only of nulls are removed and
            every other null cell is replaced by an empty string
    """
    # True for each row whose every cell is null
    entirely_null = df.isnull().all(axis=1)
    if entirely_null.any():
        df = df.dropna(how='all')
        print("Rows with all null values dropped!!!")
    # nulls left in partially filled rows become empty strings
    return df.fillna('')

In [16]:
# Punctuation characters stripped from the text before it is translated.
_unwanted_chars = re.compile(r'["#<>\\^`{|}~]')


def translate_bugs(row, t_cols, id_col):
    """
    Clean the textual columns of one record and translate them to English.

    For every column in t_cols: runs of newlines / carriage returns / tabs
    and HTML-like tags are replaced by a space, the characters
    " # < > \\ ^ ` { | } ~ are removed, and the cleaned text is translated
    with TextBlob. The translation status is recorded in a separate
    '<column>_error' column, which is only set when translation fails.
        > Parameters:
            row    : record supporting item access (e.g. a pandas Series) |
                     Record to be translated; it is modified in place
            t_cols : list | Names of the textual columns to translate
            id_col : str  | Name of the column holding the record id
                            (only used for logging)

        > Returns:
            The same row. A translated column holds the English text; a
            column whose cleaned text is empty, or whose translation
            failed, keeps its original value.
    """
    # Use the module-level `logger` when the script has created one, and fall
    # back to a standard logger so the function also works when called
    # before (or without) that set-up.
    log = globals().get('logger') or logging.getLogger(__name__)
    for col in t_cols:
        text = row[col]
        # Decode once at the boundary: byte strings (the Python 2 case)
        # become text, already-decoded text is left alone instead of
        # raising TypeError.
        if isinstance(text, bytes):
            text = text.decode("utf-8")
        text = regex_tab_newlines.sub(" ", text)
        text = remove_html_tags.sub(" ", text)
        # A regex substitution behaves the same on Python 2 and 3, unlike
        # the two-argument str.translate() form.
        text = _unwanted_chars.sub("", text)
        if not text:
            # nothing to translate
            continue
        try:
            row[col] = TextBlob(text).translate(to='en').string
        except TranslatorError as e:
            # Keep the failure detail in the run log next to the record id.
            log.warning('Translation of %s failed for %s: %s',
                        col, row[id_col], e)
            row[col+'_error'] = 'translation error'
        except NotTranslated:
            # presumably raised when the text needs no translation —
            # TODO confirm against the textblob documentation
            row[col+'_error'] = 'not translated'
    log.info('%s is done.', row[id_col])
    return row

In [17]:
if __name__ == '__main__':
    # Translate both Mantis exports and write '<name>_en.csv' next to the
    # source file.
    logger = establish_logger('mantis')
    dataset_dir = '../datasets'
    for files in ['mantis_bug_notes', 'mantis_bugs']:
        # NOTE(review): head(5) restricts each run to the first five rows —
        # looks like a leftover from trying the notebook out; confirm before
        # translating the full data set.
        df = pd.read_csv('{}/{}.csv'.format(dataset_dir, files)).head(5)
        # remove_null() has to see the raw nulls: filling them with ''
        # beforehand would hide the all-null rows it is meant to drop.
        # It fills the remaining nulls itself.
        df = remove_null(df)
        if files == 'mantis_bug_notes':
            textual_cols = ['bug_note']
            id_col = 'bugnote_id'
        else:
            textual_cols = ['summary', 'description', 'additional_information', 'steps_to_reproduce']
            id_col = 'id'
        df = df.apply(lambda x: translate_bugs(x, textual_cols, id_col), axis=1)
        df.to_csv('{}/{}_en.csv'.format(dataset_dir, files), index=False, encoding='utf-8')