\# Developer: Ali Hashaam (ali.hashaam@initos.com) <br>
\# 2nd March 2019 <br>

\# © 2019 initOS GmbH <br>
\# License MIT <br>

This notebook preprocesses data for Proxy A-Distance (PAD) testing.<br>
See: https://github.com/rpryzant/proxy-a-distance

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize
import re, icu
from string import punctuation
import os.path
import random, logging

In [2]:
# Output locations. `directory` is the folder in which the per-domain sentence files and the vocabulary file
# are placed (and looked up, to avoid duplicated work); `datasets_dir` is the root used to locate the log file
# and the jira_projects CSV inputs.
directory = "../datasets/"
datasets_dir = "../datasets"

# Pre-compiled clean-up patterns. Every pattern is a raw string so that regex escapes such as \S and \s
# are handed to `re` untouched instead of being parsed as (invalid) string escape sequences.
regex_doublequotes = re.compile(r'\"+')                 # runs of double quotes
regex_square_brackets = re.compile(r'(\[)|(\])')        # a single '[' or ']'
regex_urls = re.compile(r'http\S+')                     # 'http' followed by non-whitespace
regex_emails = re.compile(r'\S*@\S*\s?')                # token containing '@', plus one trailing whitespace
regex_tab_newlines = re.compile(r'(\n+)|(\r+)|(\t+)')   # runs of newlines, carriage returns or tabs
remove_html_tags = re.compile(r'<[^>]+>')               # anything between '<' and the next '>'

In [3]:
def establish_logger():
    """
    Build (or fetch) the "PAD_TESTING" logger, which writes INFO-level records to
    `<datasets_dir>/logs/pad_testing.log`.

    The function can be called repeatedly (e.g. when a notebook cell is re-run): the file handler is attached
    only if the logger does not have one yet, so records are not written several times.

    > Returns:
        The configured `logging.Logger` object.
    """
    logger = logging.getLogger("PAD_TESTING")
    logger.setLevel(logging.INFO)

    # logging.getLogger hands back the same object for the same name, so a second call would otherwise
    # stack a second FileHandler on it and duplicate every log line.
    if any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
        return logger

    # FileHandler does not create missing folders; make sure the logs folder is there first.
    log_dir = '{}/logs'.format(datasets_dir)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    fh = logging.FileHandler('{}/pad_testing.log'.format(log_dir))
    fh.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger

In [4]:
# py2casefold is not tested enough, is slow, and could not be installed in anaconda, so ICU's case folding
# is used as a replacement.
# The result is encoded to a plain string (not a unicode object), because the fast .translate function is
# used later on to remove punctuation from strings.
# Source: https://stackoverflow.com/a/32838944/3429115
CharsSet = "ascii" # The character set used as the default one when interpreting texts


def casefold(u):
    """
    Case-fold `u` with ICU and return it encoded in `CharsSet`; characters that cannot be encoded are dropped.
    """
    folded = icu.UnicodeString(u).foldCase()
    return unicode(folded).encode(CharsSet, "ignore")

In [5]:
def iter_sample_fast(iterable, samplesize):
    """
    Fast memory-efficient sampling method for pretty large iterables (reservoir sampling: the iterable is
    consumed once and only `samplesize` items are kept in memory).
    Adopted from: https://stackoverflow.com/a/12583436/3429115

    > Parameters:
        * iterable: iterable object | The collection we want to sample from

        * samplesize: int           | How many samples to generate

    > Returns:
        List of `samplesize` items, in random order. Every position of the iterable is picked at most once
        (sampling without replacement), so an item only shows up twice if the iterable itself contains it twice.

    > Raises:
        ValueError if the iterable holds fewer than `samplesize` items.
    """
    results = []
    iterator = iter(iterable)
    # Fill the reservoir with the first `samplesize` elements.
    # next()/range are used because they behave the same on Python 2 and Python 3.
    try:
        for _ in range(samplesize):
            results.append(next(iterator))
    except StopIteration:
        raise ValueError("Sample larger than population.")
    random.shuffle(results)  # Randomize their positions
    # `i` is the 0-based position of `v` in the iterable; randint is inclusive on both ends, so `v` replaces
    # a random reservoir slot with probability samplesize / (i + 1), i.e. at a decreasing rate.
    for i, v in enumerate(iterator, samplesize):
        r = random.randint(0, i)
        if r < samplesize:
            results[r] = v
    return results

In [6]:
def elicitDomainSentences(df_Reviews, _Amount = 0):
    """
    Convert a domain's DataFrame to a list of sentences, casefolded and stripped from common nuisances:
    tabs/newlines, URLs, e-mail addresses and HTML tags become a space, square brackets are removed, runs of
    double quotes become a single quote, and punctuation is deleted from every kept sentence.
    If a sample is required, then random sampling is applied after generating the full population.

    > Parameters:
        * df_Reviews: Pandas Dataframe | Dataset whose "text" column needs preprocessing. The frame passed in
                                       | by the caller is left untouched.

        * _Amount: int                 | Number of sentences to sample, without picking the same sentence
                                       | twice. Anything that is not an int in the open range
                                       | (0, number of sentences) disables sampling.

    > Returns:
        The list of sentences of the domain in a list. Sentences that contain < 3 words aren't considered.
    """
    # Drop rows that contain a NaN (dropna without `subset` looks at every column, not only "text").
    # The result is rebound locally instead of using inplace=True, so the caller's DataFrame is not mutated.
    df_Reviews = df_Reviews.dropna()

    casefolded = np.array(df_Reviews["text"].apply(casefold))

    SentencesReviewsDomain = []
    total = len(casefolded)

    for counter, review in enumerate(casefolded):
        if counter % 1000 == 0:
            logger.info("{}% of data processed..".format(round(100.0 * counter / total, 2)))
        # Clean-up order matters: brackets are stripped before URLs/e-mails are matched.
        review = regex_tab_newlines.sub(" ", review)
        review = regex_doublequotes.sub("'", review)
        review = regex_square_brackets.sub("", review)
        review = regex_urls.sub(" ", review)
        review = regex_emails.sub(" ", review)
        review = remove_html_tags.sub(" ", review)
        # Split into sentences, keep those with at least 3 words, and remove their punctuation.
        # NOTE: translate(None, chars) is the Python 2 byte-string API, matching what `casefold` returns.
        SentencesReviewsDomain.extend(
            s.translate(None, punctuation)
            for s in sent_tokenize(review)
            if len(s.split()) >= 3
        )
    logger.info("100% of data processed.")
    # Parenthesised so the two counters print identically on Python 2 and are valid syntax on Python 3.
    print(len(df_Reviews))
    print(len(SentencesReviewsDomain))

    if isinstance(_Amount, int) and 0 < _Amount < len(SentencesReviewsDomain):
        # Picking the same _Amount from every domain keeps the U dataset for PAD balanced and keeps the
        # PAD SVM efficient. np.random.choice is avoided on purpose: it caused memory problems with large
        # domains and may return the same item more than once, whereas iter_sample_fast never does.
        return iter_sample_fast(SentencesReviewsDomain, _Amount)

    # No sampling, return the mere list:
    return SentencesReviewsDomain

In [7]:
def findCommonVocabularyFromSentences(_domain1, _domain2):
    """
    Compute the vocabulary shared by two domains. Each domain is given as a list of sentences, so this is
    best called on the output of `elicitDomainSentences`. Words are obtained by whitespace-splitting.

    > Parameters:
        * _domain1: list      | Sentences of the first domain;

        * _domain2: list      | Sentences of the second domain.

    > Returns:
        The set of words occurring in both domains, or None (after logging a notice) when at least one of
        the two lists is empty.
    """
    both_non_empty = len(_domain1) != 0 and len(_domain2) != 0
    if not both_non_empty:
        logger.info("ATTENTION: One or more empty domain(s) passed to findCommonVocabulary.")
        return None

    # One vocabulary set per domain, then keep only what appears in both.
    vocab_first = {token for sentence in _domain1 for token in sentence.split()}
    vocab_second = {token for sentence in _domain2 for token in sentence.split()}
    return vocab_first & vocab_second

In [8]:
def saveList2txt(_List, _filePath, IsVocabulary = False):
    """
    Saves a list to a text file, each (whitespace-stripped) entry on a new line.
    https://stackoverflow.com/a/13434105/3429115

    > Parameters:
      * _List: iterable of str  | The entries to be saved

      * _filePath: string       | The full path, including file name and extension

      * IsVocabulary: boolean   | True if we are saving the vocabulary, so that the special values
                                | <unk>, <s> and </s> are written first, one per line

    > Returns:
        0 if the execution goes well, -1 if the file already exists (it is never overwritten).
    """
    if os.path.isfile(_filePath):
        logger.info("ATTENTION: text file already exists, exiting.")
        return -1

    # Build the whole payload before touching the disk: if an entry is not a string, the error is raised
    # here and no half-written file is left behind to be mistaken for finished work on the next run.
    payload = "\n".join(s.strip() for s in _List)

    # The context manager closes the file even if a write fails.
    with open(_filePath, "w") as outfile:
        if IsVocabulary:
            outfile.write("<unk>\n<s>\n</s>\n")
        outfile.write(payload + "\n")
    return 0

In [9]:
def loadSentencesFromtxt(_Filepath):
    """
    Load sentences list for a specific domain from disk.

    > Parameters:
        * _Filepath: string | The path to the text file, in which each sentence is expected to be on one line.
                            | No check is done to ensure the file exists; `open` raises if it does not.

    > Returns:
        List with one entry per line of the file, each stripped of leading/trailing whitespace.
    """
    # The context manager makes sure the file handle is released once all lines are read.
    with open(_Filepath) as infile:
        return [line.strip() for line in infile]

In [10]:
def getSentencesForDomain(directory, _Filename, df, _NumberOfSentences):
    """
    Return the sentences of one domain, reusing `<directory>/<_Filename>_sentences.txt` when it is already
    on disk and generating (then saving) it otherwise.

    > Parameters:
        * directory : string          | Folder holding the `<_Filename>_sentences.txt` files

        * _Filename : string          | Domain's name, used as the prefix of its sentences file

        * df : Pandas Dataframe       | Domain's data, handed to `elicitDomainSentences` when the sentences
                                      | file does not exist yet

        * _NumberOfSentences : int    | Sample size handed to `elicitDomainSentences`

    > Returns:
        * sentences: list             | list of the sentences of the domain
    """
    sentences_path = "{}/{}_sentences.txt".format(directory, _Filename)

    # Cached on disk: just read the file back.
    if os.path.isfile(sentences_path):
        logger.info("ATTENTION: Domain {} was found to be already processed.".format(_Filename))
        logger.info("INFO: Domain's sentences loaded as a preface to build the common vocabulary.")
        return loadSentencesFromtxt(sentences_path)

    # Not processed yet: build the sentences from the DataFrame and persist them.
    sentences = elicitDomainSentences(df, _NumberOfSentences)
    logger.info("{} data read and processed..".format(_Filename))
    saveList2txt(sentences, sentences_path)
    logger.info("{} data saved ({:d} sentences).".format(_Filename, len(sentences)))
    return sentences

In [11]:
def ProcessDomains(_Filename1, _Filename2, df_A, df_B, _NumberOfSentences):
    """
    The main controller; Processes domains into sentences, finds their common vocabulary, and saves the results. Work is
    carried out in a structured way to avoid redundant tasks and unintentional outputs overwriting.

    > Parameters:
        * _Filename1 : string          | First domain's name, used to name its sentences file

        * _Filename2 : string          | Second domain's name, used to name its sentences file

        * df_A : Pandas Dataframe      | Data of the first domain

        * df_B : Pandas Dataframe      | Data of the second domain

        * _NumberOfSentences : int     | Sample size forwarded to `getSentencesForDomain` for both domains

    > Returns:
        None. Everything is written to disk in `directory` path; the vocabulary goes to
        `Vocab_<_Filename2>_<_Filename1>.txt`.
    """
    logger.info("Starting main routine..")
    vocab_path = "{}/Vocab_{}_{}.txt".format(directory, _Filename2, _Filename1)

    # Is the vocabulary already there?
    if os.path.isfile(vocab_path):
        logger.info("ATTENTION: Vocabulary was already found on disk, so no need to rebuild it again.")
        return

    sentencesD1 = getSentencesForDomain(directory, _Filename1, df_A, _NumberOfSentences)
    sentencesD2 = getSentencesForDomain(directory, _Filename2, df_B, _NumberOfSentences)
    setCommonVocab = findCommonVocabularyFromSentences(sentencesD1, sentencesD2)

    # findCommonVocabularyFromSentences returns None when a domain has no sentences. Saving None would
    # crash halfway through and leave a partial vocabulary file that the check above would later take
    # for a finished one, so stop before anything is written.
    if setCommonVocab is None:
        logger.info("ATTENTION: No common vocabulary could be built for {} and {}; nothing was saved.".format(_Filename1, _Filename2))
        return

    logger.info("Common vocabulary for {} and {} processed..".format(_Filename1, _Filename2))
    saveList2txt(setCommonVocab, vocab_path, IsVocabulary=True)
    logger.info("Common vocabulary saved ({:d} terms).".format(len(setCommonVocab)))
    logger.info("All Done.")

In [12]:
def merge_textual_data_in_one_column(filename, _NumberOfSentences = 0):
    """
    Read `<datasets_dir>/jira_projects/<filename>.csv` and merge its textual columns into one "text" column.

    > Parameters:
    * filename: str             | Name of the CSV file (without extension). If it contains 'mantis', only rows
                                | whose translation_status is 'done' are kept and the Mantis columns are merged.
    * _NumberOfSentences : int  | The number of rows to keep. 0 means all rows, and > 0 means random sampling
                                | (without replacement) will be applied to pick this number of rows.
    > Returns:
    df: Pandas Dataframe- dataframe with two columns id and text, with text containing all the textual
        data belonging to particular id
    """
    df = pd.read_csv('{}/jira_projects/{}.csv'.format(datasets_dir, filename))
    # The file name is the only indication of the source platform available here.
    is_mantis = 'mantis' in filename
    if is_mantis:
        df = df[df["translation_status"]=='done']
    # Empty strings instead of NaN, so the string concatenation below never yields NaN.
    df = df.fillna('')

    # Picking the same number of rows from every domain keeps the U dataset for PAD balanced and keeps
    # the PAD SVM efficient. replace=False: the same row is never picked twice.
    if _NumberOfSentences > 0:
        df = df.sample(n=_NumberOfSentences, replace=False, weights=None)

    if is_mantis:
        df['text'] = df['description'] + '. ' +  df['summary'] + '. ' +  df['bug_notes']  + '. '  +  df['steps_to_reproduce'] + '. ' +  df['additional_information']
    else:
        df['text'] = df['description'] + '. ' +  df['summary'] + '. ' +  df['comments']
    return df[['id', 'text']]

In [13]:
# Prepare data for Mantis: one row per bug, with all of its notes folded into a single text column.
_NumberOfSentences = 0  # 0 keeps every bug; > 0 keeps that many randomly picked bugs
bug_notes = pd.read_csv('../datasets/mantis_bug_notes_en.csv')
bugs = pd.read_csv('../datasets/mantis_bugs_en.csv')

# Collapse the notes of each bug into one comma-separated string, then attach them to the bugs.
# The left join keeps bugs that have no notes at all.
notes_per_bug = bug_notes.groupby(['bug_id'])['bug_note'].apply(','.join)
df_bug_note_table = notes_per_bug.to_frame('bug_notes').reset_index()
result = pd.merge(bugs, df_bug_note_table, how='left', left_on='id', right_on='bug_id')

# Concatenate the textual columns, comma-separated, treating missing values as empty strings.
text_columns = ('summary', 'description', 'additional_information', 'bug_notes')
combined_text = result[text_columns[0]].fillna('')
for column_name in text_columns[1:]:
    combined_text = combined_text + ',' + result[column_name].fillna('')
result['text'] = combined_text

if _NumberOfSentences > 0:
    result = result.sample(n=_NumberOfSentences, replace=False, weights=None)
mantis_df = result[['id', 'text']].copy()

In [15]:
# prepare data for erp_next
erp_nxt = pd.read_csv('../datasets/github_projects/erpnext_issues.csv')
erp_nxt['type'] = erp_nxt['type'].str.lower()
# Keep only the relevant issue types. `type` was lower-cased just above, so every keyword has to be
# lower-case as well (a capitalised 'Enhancement' can never match). str.contains treats the pattern as a
# regular expression, so the alternation matches any of the keywords; na=False leaves out issues without a type.
relevant_types = 'feature|bug|manufacturing|enhancement'
erp_nxt = erp_nxt[erp_nxt['type'].str.contains(relevant_types, na=False)]
erp_nxt.rename(columns={"textual_data": "text"}, inplace=True)
erp_nxt_df = erp_nxt[['id', 'text']].copy()

# Run the pipeline on both domains, sampling 10000 sentences from each.
logger = establish_logger()
ProcessDomains('mantis_en_issues', 'erp_next_issues', mantis_df, erp_nxt_df, 10000)
erp_nxt.to_csv('../datasets/github_projects/erpnext_issues_relevant.csv', encoding='utf-8')