In [2]:
import numpy as np
import pandas as pd
import nltk, os
from nltk.corpus import sentiwordnet as swn
from pprint import pprint
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import itertools, string, re, unidecode, icu, time, pickle
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from nltk import bigrams
from langdetect import detect
from textblob import TextBlob
from pattern.en import sentiment

In [2]:
# Regexes used to strip URLs and e-mail addresses from text.
# Raw strings are used so that `\S` / `\s` reach the regex engine as written
# instead of being treated as (invalid) Python string escapes.
regex_urls = re.compile(r'http\S+')        # 'http' followed by any run of non-whitespace
regex_emails = re.compile(r'\S*@\S*\s?')   # non-whitespace around an '@', plus one trailing whitespace char

The casefold() method removes all case distinctions present in a string. It is used for caseless matching, 
i.e. it ignores case when comparing strings. <br>
For example, the German lowercase letter ß is equivalent to ss. However, since ß is already lowercase, 
the lower() method does nothing to it, whereas casefold() converts it to ss.

In [3]:
CharsSet = "ascii" # The character set to be used as the default one when interpreting texts


def casefold(u):
    """
    Case-fold a text with ICU and encode the result to the default character set.

    > Parameters:
        u : the text to fold; it is handed to icu.UnicodeString as-is

    > Returns:
        The folded text encoded with CharsSet; characters that cannot be
        encoded are dropped ("ignore").

    > Comments:
        Relies on the Python 2 `unicode` builtin.
    """
    folded = icu.UnicodeString(u).foldCase()
    return unicode(folded).encode(CharsSet, "ignore")

In [4]:
def getWordnetPos(_treebank_tag):
    """
    Map a Treebank PoS tag onto the matching WordNet PoS constant.

    > Parameters:
        _treebank_tag : str     | The tag to be translated

    > Returns:
        wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV when the tag
        starts with 'J', 'V', 'N' or 'R' respectively; an empty string for
        every other tag.

    > Comments:
        CAUTION! lemmatizeTaggedTerms skips tokens whose mapping is empty, so
        returning '' implicitly removes everything but adjectives, verbs,
        nouns and adverbs (stopwords and punctuation included).
        https://stackoverflow.com/a/15590384/3429115
    """
    if _treebank_tag.startswith('J'):
        return wordnet.ADJ
    if _treebank_tag.startswith('V'):
        return wordnet.VERB
    if _treebank_tag.startswith('N'):
        return wordnet.NOUN
    if _treebank_tag.startswith('R'):
        return wordnet.ADV
    # No WordNet category for this tag.
    return ''


In [5]:
def lemmatizeTaggedTerms(_tgdSentsMTX, _isFlattened = True, _oddTokensBehaviour = 3):
    """
    Lemmatize tokens using their PoS tags.

    > Parameters:
        _tgdSentsMTX : collection of (token, PoS tag) pairs | The tagged tokens to lemmatize

        _isFlattened : bool                                 | Accepted for compatibility; not read by the body

        _oddTokensBehaviour : int                           | What to do when lemmatizing a token raises UnicodeDecodeError:
                                                            | 1 -> keep the token as-is, without lemmatizing it,
                                                            | 2 -> lemmatize it after dropping the undecodable/unencodable chars,
                                                            | 3 -> lemmatize it after replacing those chars (errors="replace"),
                                                            | anything else -> discard the token.

    > Returns:
        A flat list of lemmas. Tokens whose tag has no WordNet mapping
        (getWordnetPos returns '') are left out.

    > Comments:
        The fallback branches use the Python 2 `unicode` builtin.
    """
    wnl = WordNetLemmatizer()
    lemmas = []
    for pair in _tgdSentsMTX:
        wnPos = getWordnetPos(str(pair[1]))
        # No WordNet category (punctuation, prepositions, determiners, ...): skip the token.
        if len(wnPos) == 0:
            continue
        token = pair[0]
        try:
            lemmas.append(wnl.lemmatize(token, wnPos))
        except UnicodeDecodeError:
            if _oddTokensBehaviour == 1:
                lemmas.append(token)
            elif _oddTokensBehaviour == 2 or _oddTokensBehaviour == 3:
                # The same error policy is applied when decoding and when re-encoding.
                policy = "ignore" if _oddTokensBehaviour == 2 else "replace"
                cleaned = unicode(token, errors=policy).encode(CharsSet, policy)
                lemmas.append(wnl.lemmatize(cleaned, wnPos))
    return lemmas

In [6]:
def tagPoS(_text):
    """
    PoS-tag a multi-sentence text while respecting sentence boundaries.

    > Parameters:
        _text : str                 | the text of the review

    > Returns:
        One flat list with the tagged tokens of every sentence, in order.
        Tagging is done sentence by sentence (nltk.pos_tag_sents) and the
        per-sentence results are concatenated afterwards.

    > Comments:
        All characters in string.punctuation are stripped from each sentence
        before it is word-tokenized; sentences that end up empty are skipped.

        Stop words are not removed here: the output contains every part of
        speech, so later stages can filter by tag.

        Uses Python 2 string APIs (str.translate(None, ...) and `unicode`).

        The main function here is documented on: http://www.nltk.org/api/nltk.tag.html#nltk.tag.pos_tag_sents
    """
    tokenizedSentences = []
    for rawSentence in nltk.sent_tokenize(_text):
        # str.translate(None, chars) needs a byte string, so encode anything that is not already a str.
        asStr = rawSentence if isinstance(rawSentence, str) else rawSentence.encode(CharsSet, "ignore")
        withoutPunct = asStr.translate(None, string.punctuation)
        if len(withoutPunct) == 0:
            continue
        # Back to unicode before tokenizing; one token list per sentence.
        tokenizedSentences.append(nltk.word_tokenize(unicode(withoutPunct, errors="ignore")))
    # Tag per sentence, then flatten the per-sentence results into one list.
    perSentenceTags = nltk.pos_tag_sents(tokenizedSentences)
    return list(itertools.chain.from_iterable(perSentenceTags))

In [7]:
def lemmatizeText(_text, _oddCharsBehaviour = 1):
    """
    The main controller to lemmatize a text and return the list of lemmatized tokens.

    Pipeline: transliterate the text with unidecode, case-fold it with
    casefold, PoS-tag it with tagPoS, then lemmatize the tagged tokens with
    lemmatizeTaggedTerms.

    :parameters:
        _text : unicode             | The text to tokenize and lemmatize

        _oddCharsBehaviour : int    | Forwarded to lemmatizeTaggedTerms as _oddTokensBehaviour:
                                    | 1 -> an odd token is included as-is, not lemmatized,
                                    | 2 -> it is lemmatized after dropping the odd chars,
                                    | 3 -> it is lemmatized after replacing the odd chars,
                                    | anything else -> it is discarded.

    :returns:
        The list of lemmatized tokens of _text
    """
    # Transliterate first so accented/foreign letters become plain ones.
    transliterated = unidecode.unidecode(_text)
    folded = casefold(transliterated)
    taggedTokens = tagPoS(folded)
    return lemmatizeTaggedTerms(taggedTokens, _oddTokensBehaviour = _oddCharsBehaviour)

In [8]:
def intensify_polarity(is_customer, row, col, pole):
    """
    Store the polarity label of a column on the row, intensified for customers.

    :parameters:
        is_customer : truthy/falsy | when truthy the label is prefixed with 'extremely_'
        row : mapping              | updated in place under the key col + '_polarity'
        col : str                  | name of the textual column being labelled
        pole : str                 | the base label, e.g. 'positive' or 'negative'

    :returns:
        The same row object, after the update
    """
    row[col+'_polarity'] = ('extremely_' + pole) if is_customer else pole
    return row

In [9]:
def get_polarity(row, col, is_customer):
    """
    Combine the TextBlob and pattern scores of a column into one polarity label.

    Reads row[col + '_textblob_SO'] and row[col + '_pattern_SO'] and writes
    row[col + '_polarity']:
        * both scores are 0                      -> 'neutral'
        * both >= 0 (at least one > 0)           -> positive
        * both <= 0 (at least one < 0)           -> negative
        * otherwise (the scores disagree)        -> the sign of the score with
          the larger absolute value; on a tie the pattern score decides.
    Positive/negative labels go through intensify_polarity, which prefixes
    them with 'extremely_' when is_customer is truthy.

    :returns:
        The row, updated in place
    """
    textblob_so = row[col+'_textblob_SO']
    pattern_so = row[col+'_pattern_SO']

    if textblob_so == 0 and pattern_so == 0:
        row[col+'_polarity'] = 'neutral'
        return row

    # The "both zero" case is handled above, so >= / <= below mean
    # "same direction, with at least one non-zero score".
    if textblob_so >= 0 and pattern_so >= 0:
        pole = 'positive'
    elif textblob_so <= 0 and pattern_so <= 0:
        pole = 'negative'
    else:
        # The scorers disagree: trust the stronger one (pattern wins ties).
        dominant = textblob_so if abs(textblob_so) > abs(pattern_so) else pattern_so
        pole = 'positive' if dominant > 0 else 'negative'
    return intensify_polarity(is_customer, row, col, pole)

In [15]:
def apply_lemmatization(row, textual_columns, users_df):
    """
    Score the sentiment of, and lemmatize, every textual column of one row.

    For each column `col` in textual_columns the URLs and e-mail addresses are
    blanked out, then the row gains:
        col + '_textblob_SO', col + '_textblob_subjectivity' : TextBlob polarity / subjectivity
        col + '_pattern_SO',  col + '_pattern_subjectivity'  : first / second item returned by pattern's sentiment()
        col + '_polarity'                                    : label computed by get_polarity
    and row[col] itself is replaced by the output of lemmatizeText.

    :parameters:
        row : pandas Dataframe row  | row to be processed; 'bug_note_reporter_id' is used to look up the reporter
        textual_columns : iterable  | names of the text columns to process
        users_df : pandas Dataframe | looked up with .loc[reporter_id, 'roles'] to decide whether the
                                    | reporter counts as a customer

    :returns:
        processed row

    Comments:
        The text is decoded with the Python 2 `unicode` builtin, as UTF-8.
    """
    customer_roles = ('Customers', 'Customers Suppliers', 'initOS Suppliers', 'Partner')
    try:
        is_customer = users_df.loc[row['bug_note_reporter_id'], 'roles'] in customer_roles
    except Exception:
        # Best effort: any failed lookup (unknown reporter, missing column, ...)
        # means "not a customer". Exception rather than a bare except, so that
        # KeyboardInterrupt / SystemExit still stop a long-running apply.
        is_customer = False
    for col in textual_columns:
        # Remove hyperlinks and e-mail addresses
        _text = regex_urls.sub(" ", row[col])
        _text = regex_emails.sub(" ", _text)
        _text = unicode(_text, "utf-8")
        tb_so = TextBlob(_text)
        row[col+'_textblob_SO'] = tb_so.sentiment.polarity
        row[col+'_textblob_subjectivity'] = tb_so.sentiment.subjectivity
        pt_so = sentiment(_text)
        row[col+'_pattern_SO'] = pt_so[0]
        row[col+'_pattern_subjectivity'] = pt_so[1]
        row = get_polarity(row, col, is_customer)
        # The raw text is replaced by its list of lemmas.
        row[col] = lemmatizeText(_text)
    return row

In [16]:
def pre_process_data(df, textual_columns, users_df):
    """
        Run apply_lemmatization over every row of a dataframe.

        :parameters:
            df : pandas Dataframe Object       | Dataframe to be processed
            textual_columns : iterable         | forwarded unchanged to apply_lemmatization
            users_df : pandas Dataframe Object | forwarded unchanged to apply_lemmatization

        :returns:
            The result of the row-wise apply (axis=1)

    """
    def _process_row(row):
        return apply_lemmatization(row, textual_columns, users_df)

    return df.apply(_process_row, axis=1)

In [17]:
# Driver cell: load the bug notes and the user-group table, run the
# preprocessing pipeline on the 'bug_note' column and write the result to CSV.
# `df`, `users_df` and `preprocessed` stay bound at module level afterwards.
if __name__ == '__main__':
    df = pd.read_csv('../datasets/mantis_bug_notes_en.csv')
    # Indexed by user_id: apply_lemmatization looks reporters up with users_df.loc[reporter_id, 'roles'].
    users_df = pd.read_csv('../datasets/mantis_user_groups.csv', index_col='user_id')
    preprocessed = pre_process_data(df, ['bug_note'], users_df)
    # NOTE: 'bug_note' holds the value returned by lemmatizeText at this point, so it is written out as text.
    preprocessed.to_csv('../datasets/lexical_semantic_preprocessed_mantis_bugnotes.csv', encoding='utf-8', index=False)

In [19]:
# Distribution of the polarity labels; the 'extremely_*' variants are only
# assigned when the note's reporter has one of the customer roles.
preprocessed['bug_note_polarity'].value_counts()

positive              24514
neutral               16764
negative               9762
extremely_positive     1570
extremely_negative      727
Name: bug_note_polarity, dtype: int64

In [57]:
# Load the bug history and replace missing cells with empty strings.
df_bug_history_table = pd.read_csv('../datasets/mantis_bugs_history.csv').fillna('')
# date_modified is interpreted as a count of seconds (unit='s') and converted to datetimes.
df_bug_history_table['date_modified'] = pd.to_datetime(df_bug_history_table['date_modified'], unit='s')

In [58]:
# History rows where priority or severity was raised above a threshold.
#
# old_value / new_value are shared by every kind of history entry and had their
# missing cells replaced by '' when the table was loaded, so they are presumably
# text columns - TODO confirm. Comparing text with `>` is lexicographic, and
# text-vs-int comparison is meaningless (always True on Python 2, TypeError on
# Python 3), so both columns are coerced to numbers first. Non-numeric cells
# become NaN, and every comparison against NaN is False.
_new_value = pd.to_numeric(df_bug_history_table['new_value'], errors='coerce')
_old_value = pd.to_numeric(df_bug_history_table['old_value'], errors='coerce')
_was_raised = _new_value > _old_value

# NOTE(review): 30 and 50 look like Mantis enum codes for priority and severity - confirm their meaning.
status_changes = df_bug_history_table[
    ((df_bug_history_table['field_name'] == 'priority') & _was_raised & (_new_value > 30)) |
    ((df_bug_history_table['field_name'] == 'severity') & _was_raised & (_new_value > 50))
]

In [46]:
# Reload the preprocessed bug notes from the CSV written by the driver cell; this rebinds `df`.
# NOTE(review): 'bug_note' was a Python list when written, so it presumably comes back
# as the list's text form rather than a list - confirm before using it as tokens.
df = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugnotes.csv')

In [47]:
# Load the bug-note dates table (not used by any other cell visible in this section).
df_dates = pd.read_csv('../datasets/mantis_bug_notes_dates.csv')

In [45]:
# Peek at the first rows of `df`.
# NOTE(review): this cell's execution count (45) precedes the cell that loads the
# preprocessed CSV (46), and the saved output shows raw note text plus an
# 'Unnamed: 0' column, so the output was presumably produced from an earlier `df` -
# re-run after Restart & Run All.
df.head()

Unnamed: 0,bug_id,bug_note,bug_note_reporter_id,bug_note_translation_status,bugnote_id
0,12,The one compatible Java VM must be installed o...,3,English,1
1,12,The problem could be reconstructed and the ins...,3,English,2
2,12,The problem is now edited User55 User3,3,English,3
3,12,The problem is fixed and the JVM installed. Fr...,3,English,4
4,10,IPhone is not yet connected to the Exchange,3,English,5
