\# Developer: Ali Hashaam (ali.hashaam@initos.com) <br>
\# 3rd March 2019 <br>

\# © 2019 initOS GmbH <br>
\# License MIT <br>

In [1]:
import numpy as np
import pandas as pd
import nltk, os
from nltk.corpus import sentiwordnet as swn
from pprint import pprint
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import wordnet
import itertools, string, re, unidecode, time, pickle
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk import bigrams
import operator
import ast, icu
from scipy.sparse import hstack
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [2]:
from methods.scikitTSVM import SKTSVM
from frameworks.SelfLearning import *

In [3]:
# Pre-compiled patterns used to scrub raw text before lemmatization.
# All patterns are raw strings so that backslash classes such as \S reach the
# regex engine untouched instead of being treated as (invalid) string escapes.
regex_doublequotes = re.compile(r'\"+')                 # runs of double quotes
regex_square_brackets = re.compile(r'(\[)|(\])')        # a single '[' or ']'
regex_urls = re.compile(r'http\S+')                     # 'http' up to the next whitespace
regex_emails = re.compile(r'\S*@\S*\s?')                # token containing '@' plus one trailing space
regex_tab_newlines = re.compile(r'(\n+)|(\r+)|(\t+)')   # runs of newlines, carriage returns or tabs
remove_html_tags = re.compile(r'<[^>]+>')               # anything shaped like <...>

In [4]:
# Character set that text is encoded to whenever it is reduced to a byte string.
CharsSet = "ascii"


def casefold(u):
    """Case-fold ``u`` through ICU, then encode it to ``CharsSet``, dropping characters that cannot be encoded."""
    folded = icu.UnicodeString(u).foldCase()
    return unicode(folded).encode(CharsSet, "ignore")

In [5]:
def getWordnetPos(_treebank_tag):
    """
    Map a Treebank PoS tag onto the matching WordNet PoS constant.

    > Parameters:
        _treebank_tag : str     | The Treebank tag to translate

    > Returns:
        wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV when the tag starts
        with 'J', 'V', 'N' or 'R' respectively; an empty string for every other tag.

    https://stackoverflow.com/a/15590384/3429115
    """
    # (tag prefix, name of the wordnet attribute) pairs, checked in this order.
    # The attribute is looked up only for the prefix that matches.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if _treebank_tag.startswith(prefix):
            return getattr(wordnet, attribute)
    # CAUTION: an empty result makes callers drop the token, so everything except
    # adjectives, verbs, nouns and adverbs is filtered out (an implicit stop-word
    # and punctuation removal).
    return ''


In [6]:
def lemmatizeTaggedTerms(_tgdSentsMTX, _isFlattened = True, _oddTokensBehaviour = 3):
    """
    Lemmatize (token, PoS tag) pairs, keeping only tokens whose tag maps to a WordNet category.

    > Parameters:
        _tgdSentsMTX : iterable of (token, tag) pairs | The tagged tokens to lemmatize

        _isFlattened : bool                           | Not read by this function

        _oddTokensBehaviour : int                     | Fallback applied only when lemmatizing a token raises
                                                      | UnicodeDecodeError:
                                                      | 1, the token is appended unchanged,
                                                      | 2, the token is reduced to CharsSet ignoring odd chars, then lemmatized,
                                                      | 3, odd chars are replaced ('?') before lemmatizing,
                                                      | any other value, the token is discarded.

    > Returns:
        The list of lemmas, in input order.
    """
    wn_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for pair in _tgdSentsMTX:
        wn_tag = getWordnetPos(str(pair[1]))
        # Tags without a WordNet mapping (punctuation, prepositions, determiners, ...) are skipped.
        if not wn_tag:
            continue

        token = pair[0]
        try:
            lemma = wn_lemmatizer.lemmatize(token, wn_tag)
        except UnicodeDecodeError:
            # Per the original author's note this path is not expected to run, because the
            # controller hands over ASCII-only text.
            if _oddTokensBehaviour == 1:
                lemmas.append(token)
                continue
            if _oddTokensBehaviour == 2:
                error_mode = "ignore"
            elif _oddTokensBehaviour == 3:
                error_mode = "replace"
            else:
                continue
            ascii_token = unicode(token, errors=error_mode).encode(CharsSet, error_mode)
            lemmas.append(wn_lemmatizer.lemmatize(ascii_token, wn_tag))
        else:
            lemmas.append(lemma)

    return lemmas

In [7]:
def tagPoS(_text):
    """
    PoS-tag a multi-sentence text and return one flat list of (token, tag) pairs.

    > Parameters:
        _text : str                 | The text to tag

    > Returns:
        A flat list of (token, tag) pairs. Tagging is done sentence by sentence, so sentence
        boundaries influence the tags even though they are not kept in the result.

    > Comments:
        Punctuation is stripped from every sentence before it is tokenized; sentences that end
        up empty are dropped.

        Stop words are not removed here: all parts of speech are tagged, and filtering down to
        adjectives, nouns, verbs and adverbs is left to a later step.

        The tagging call is documented on: http://www.nltk.org/api/nltk.tag.html#nltk.tag.pos_tag_sents
    """
    tokenized_sentences = []
    for raw_sentence in nltk.sent_tokenize(_text):
        # Work on a byte string so the two-argument translate() below can delete punctuation.
        if isinstance(raw_sentence, str):
            byte_sentence = raw_sentence
        else:
            byte_sentence = raw_sentence.encode(CharsSet, "ignore")
        stripped = byte_sentence.translate(None, string.punctuation)
        if not stripped:
            continue
        # Back to unicode before word tokenization.
        # (A bigram phraser could be applied to the tokens here to join multi-word phrases.)
        tokenized_sentences.append(nltk.word_tokenize(unicode(stripped, errors="ignore")))

    # Tag each sentence separately, then flatten the per-sentence lists into one.
    tagged_by_sentence = nltk.pos_tag_sents(tokenized_sentences)
    return [tagged for sentence_tags in tagged_by_sentence for tagged in sentence_tags]

In [8]:
def lemmatizeText(_text, _oddCharsBehaviour = 1):
    """
    The main controller to lemmatize a text and return the list of lemmatized tokens.

    Steps: transliterate accented/foreign characters with unidecode (best effort),
    case-fold and reduce to CharsSet via casefold, PoS-tag via tagPoS, then lemmatize
    the tagged tokens via lemmatizeTaggedTerms.

    :parameters:
        _text : unicode             | The text to tokenize and lemmatize

        _oddCharsBehaviour : int    | Forwarded to lemmatizeTaggedTerms as _oddTokensBehaviour:
                                    | if 1, an odd token is not lemmatized and included as-is,
                                    | if 2, such token is reduced to ASCII, ignoring non-ascii chars,
                                    | if 3, such token is included after replacing the odd chars with '?'.
                                    | 0 (and otherwise), such token is discarded.

    :returns:
        A list of lemmatized tokens which belong to the inputted _text
    """
    # Transliteration is optional: if it fails, continue with the text as given.
    # Only this call is guarded, so failures in tagging/lemmatizing surface once
    # instead of being swallowed and retried, and KeyboardInterrupt/SystemExit propagate.
    try:
        _text = unidecode.unidecode(_text)
    except Exception:
        pass
    taggedText = tagPoS(casefold(_text))
    return lemmatizeTaggedTerms(taggedText, _oddTokensBehaviour = _oddCharsBehaviour)

In [9]:
def apply_lemmatization(row, textual_columns):
    """
    Clean and lemmatize the textual columns of one dataset row.

    :parameters:
        row : pandas Dataframe row       | row to be processed (modified in place)

        textual_columns : list of str    | names of the columns in the row that hold text

    :returns:
        the same row, with every listed column replaced by its list of lemmatized tokens
    """
    # Each pattern's matches are replaced by a single space, in this order.
    scrub_patterns = (
        regex_tab_newlines,
        regex_doublequotes,
        regex_square_brackets,
        regex_urls,
        regex_emails,
        remove_html_tags,
    )
    for col in textual_columns:
        _text = row[col]
        for pattern in scrub_patterns:
            _text = pattern.sub(" ", _text)
        # The cell value is decoded as UTF-8 before lemmatization.
        _text = unicode(_text, "utf-8")
        row[col] = lemmatizeText(_text)
    return row

In [10]:
def pre_process_data(df, textual_columns):
    """
        Run apply_lemmatization over every row of a Dataframe.

        No null handling happens here; the listed columns are passed to
        apply_lemmatization as they are.

        :parameters:
            df : pandas Dataframe Object     | Dataframe to be processed

            textual_columns : list of str    | columns forwarded to apply_lemmatization

        :returns:
            Processed Dataframe

    """
    def _lemmatize_row(row):
        return apply_lemmatization(row, textual_columns)

    return df.apply(_lemmatize_row, axis=1)

In [26]:
# Load the ERPNext issues export (a GitHub project dataset, judging by the path).
erp_nxt = pd.read_csv('../datasets/github_projects/erpnext_issues_relevant.csv')

In [27]:
# NOTE: the 'text' column is not built here; the line below is kept only as a record of
# how it was presumably assembled.
#erp_nxt['text'] = erp_nxt['title'].fillna('') + ',' + erp_nxt['comments'].fillna('')
# Drop the stray index column. errors='ignore' keeps this cell re-runnable: without it a
# second execution (or a CSV without that column) raises KeyError.
erp_nxt.drop(['Unnamed: 0'], axis=1, inplace=True, errors='ignore')

In [28]:
# Lemmatize the 'text' column, then collapse each row's token list back into one
# ', '-separated string.
df = pre_process_data(erp_nxt, textual_columns=['text'])
df['text'] = df['text'].map(lambda tokens: ', '.join(tokens))

In [31]:
# Binary label: 0 when the 'type' value contains 'bug', 1 for everything else.
# na=False makes missing/non-string 'type' values count as "no match"; otherwise the
# mask contains NaN and the boolean .loc selection raises.
df.loc[df['type'].str.contains('bug', na=False), 'type'] = 0
df.loc[df['type'] != 0, 'type'] = 1

In [32]:

# Class balance check: number of rows per 'type' label.
df['type'].value_counts()

1    734
0    430
Name: type, dtype: int64

In [37]:
# Persist the pre-processed GitHub data. The row index is written as well, since it is not
# disabled here, so the file starts with an unnamed index column.
df.to_csv('github_preprocessed_data.csv', encoding='utf-8')

In [12]:
# Load the pre-processed Mantis bugs and their notes.
bugs = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugs_less_columns_with_class_expansion.csv')
bug_notes = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugnotes.csv')

# Strip '[' and ']' from the textual columns. regex=True is passed explicitly because the
# pattern is a compiled regex: where the default is regex=False (pandas 2.x), a compiled
# pattern is rejected with a ValueError.
bug_notes['bug_note'] = bug_notes['bug_note'].str.replace(regex_square_brackets, '', regex=True)
bugs['additional_information'] = bugs['additional_information'].str.replace(regex_square_brackets, '', regex=True)
bugs['description'] = bugs['description'].str.replace(regex_square_brackets, '', regex=True)
bugs['summary'] = bugs['summary'].str.replace(regex_square_brackets, '', regex=True)

# One row per bug with all of its notes joined by ','; bugs without notes get NaN after
# the left merge and are turned into '' below.
df_bug_note_table = bug_notes.groupby(['bug_id'])['bug_note'].apply(','.join).to_frame('bug_notes').reset_index()
result = pd.merge(bugs, df_bug_note_table, how='left', left_on='id', right_on='bug_id')

# Concatenate every textual field into one comma-separated string, then remove all spaces
# (a literal replacement, hence regex=False).
result['textual_data'] = result['summary'].fillna('') + ',' + result['description'].fillna('') + ',' + result['additional_information'].fillna('') + ',' + result['bug_notes'].fillna('')
result['textual_data'] = result['textual_data'].str.replace(" ", "", regex=False)

# Order rows by 'class' and renumber the index from 0.
result.sort_values(by=['class'], inplace=True)
result.reset_index(drop=True, inplace= True)

In [17]:
# Keep only the rows that carry a 'class' value; work on a copy so `result` is untouched.
DA_insight = result[result['class'].notnull()].copy()

In [20]:
# Restrict to the severity codes 10, 40, 70 and 80.
DA_insight = DA_insight[DA_insight['severity'].isin([10, 40, 70, 80])].copy()

In [21]:
# Row counts per (class, severity) combination.
DA_insight.groupby(['class', 'severity']).size()

class         severity
critical      10           19
              40            2
              70            9
              80           39
non-critical  10          737
              40           42
dtype: int64

In [22]:
# Row indices of `result` split by severity: 40/70/80 versus 10.
# NOTE: this rebinds `bugs`, which until here held the DataFrame read from the Mantis bugs CSV.
bugs = result[result['severity'].isin([40, 70, 80])].index
non_bugs = result[result['severity'].isin([10])].index

In [23]:
# Create the label column filled with NaN; rows that are not assigned a label afterwards keep NaN.
result["bug_or_not"] = np.nan

In [24]:
# `bugs` and `non_bugs` are the severity-based row indices (40/70/80 and 10 respectively).
result.loc[bugs,'bug_or_not'] = 'bugs'
result.loc[non_bugs,'bug_or_not'] = 'non_bugs'

In [25]:
# Keep only the rows that received a 'bug_or_not' label; work on a copy so `result` is untouched.
DA_insight = result[result['bug_or_not'].notnull()].copy()

In [28]:
# Contingency table: 'class' as rows, 'bug_or_not' as columns, row counts as values.
DA_insight.groupby(['class', 'bug_or_not']).size().unstack()

bug_or_not,bugs,non_bugs
class,Unnamed: 1_level_1,Unnamed: 2_level_1
critical,50,19
non-critical,42,737


In [29]:
# Write the class vs. bug_or_not contingency table to disk.
DA_insight.groupby(['class', 'bug_or_not']).size().unstack().to_csv('bug_vs_critical.csv')

In [42]:
# Numeric label: 0 for 'bugs', 1 for 'non_bugs'. Rows without a 'bug_or_not' value are not
# assigned here.
# NOTE(review): presumably 'type' does not exist in `result` before this cell, so those rows
# end up NaN (which would explain the float labels in the counts) - confirm.
result.loc[result['bug_or_not']=='bugs', 'type'] = 0
result.loc[result['bug_or_not']=='non_bugs', 'type'] = 1
result['type'].value_counts()

1.0    756
0.0     92
Name: type, dtype: int64

In [43]:
# Persist the full Mantis table, including rows without a label. The row index is written
# as well, since it is not disabled here.
result.to_csv('mantis_data_for_domain_adaptation.csv', encoding='utf-8')