# Script to prepare the data

In [2]:
# I think you don't have to execute this when running on your own device

# code for installing our own library for accessing the MongoDB through a ORM engine
import sys
!{sys.executable} -m pip install pycoshark
!{sys.executable} -m pip install nltk
import nltk
#nltk.download('wordnet')
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [1]:
import csv
import nltk
import numpy as np
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.utils import resample

import pickle
from scipy.sparse import save_npz, load_npz

from mongoengine import connect
from pycoshark.mongomodels import People, Commit, Project, VCSSystem, Issue, IssueSystem
from pycoshark.utils import create_mongodb_uri_string

In [2]:
# Database credentials
user = 'datascience2019'
password = 'zE3qHdeJtdVJYznf'
host = '134.76.81.151'
port = '27017'
authentication_db = 'smartshark'
database = "smartshark"
ssl_enabled = None

# Establish connection
uri = create_mongodb_uri_string(user, password, host, port, authentication_db, ssl_enabled)
connect(database, host=uri)

MongoClient(host=['134.76.81.151:27017'], document_class=dict, tz_aware=False, connect=True, authsource='smartshark', read_preference=Primary())

## Load raw data and generate data structures to store them

In [3]:
def print_issue(issue):
    print(issue.issue_type)
    print(issue.title)
    print(issue.desc)
    print()

In [4]:
stop_words=set(stopwords.words("english"))

def preprocess_text(text, name = None):
    # remove links
    text = re.sub('www\.\\S*', '', text)
    text = re.sub('https:\\S*', '', text)
    text = re.sub('https:\\S*', '', text)
    
    text = text.lower()
    # Tokenize each document into word list
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_word = tokenizer.tokenize(text)
    
    # remove other common words like project names
    # Unfortunately, this does not seem to improve performance at all, but do experiment!
#     custom_stop_words = ['org', 'apache']
#     if name != None:
#         custom_stop_words += name.split('-')
#     for word in custom_stop_words:
#         tokenized_word = [re.sub(word + '[\\s\\.$]', "", word) for word in tokenized_word]
#         text = re.sub(word + '[\\s\\.$]', '', text)
    
    # Remove any digits or underscores in each word of the list
    tokenized_word_digits_removed=[re.sub(r"\d+|_+", "", word) for word in tokenized_word]
    filtered_words=[]
    for word in tokenized_word_digits_removed:
        if word not in stop_words:
            filtered_words.append(word)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in filtered_words]
    snowball_stemmer = SnowballStemmer("english")
    stemmed_words = [snowball_stemmer.stem(word) for word in lemmatized_words]
    return ' '.join(stemmed_words)
    
    
def Nonetostr(string):
    if string is None:
        string = ""
    return string


def make_matrix(corpus, n_words = 1000):
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', max_features = n_words)
    X = vectorizer.fit_transform(corpus)
    return vectorizer.get_feature_names(), X

def make_count_matrix(corpus, n_words = 10000):
    vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features = n_words)
    X = vectorizer.fit_transform(corpus)
    return vectorizer.get_feature_names(), X

In [5]:
stop_words=set(stopwords.words("english"))

def preprocess_text_2(text, name = None):
    attributes = []
    
    text_length = len(text)
    
    # remove links
    text = re.sub('www\.\\S*', '', text)
    text = re.sub('http:\\S*', '', text)
    text = re.sub('https:\\S*', '', text)
    
    if len(text) < text_length:
        attributes.append('contains_link')
        text_length = len(text)
    
    if re.search('[Ee]rror', text) != None or re.search('[Ee]xception', text) != None:
        attributes.append('contains_exception')
    
    # removes empty line
    re.sub(r'\n(\s)*\n', '\n', text)
    text_length = len(text)
    
    # removes indented lines
    text = re.sub('(^|\n)(  +|\t).*', '\n', text)
    # removes java style multiline comments: /* ... */
    #text = re.sub(r'\/\*\*?([^*]+(\*)?[^/*]+)+\*\/', '', text)
    text = re.sub(r'\/(\*)(.|\n)*?\*\/', '', text)
    # removes camel-case strings
    text = re.sub(r'[a-z]*([A-Z]+[a-z]+)+', '', text)
    if len(text) < text_length:
        attributes.append('contains_code')
        text_length = len(text)
        
#     text = ''.join(filter(lambda x: x in string.printable, text))
#     if len(text) < text_length:
#         attributes.append('contains_non_english_char')
    
    
    
    text = text.lower()
    # Tokenize each document into word list
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_word = tokenizer.tokenize(text)

    # Remove any digits or underscores in each word of the list
    tokenized_word=[re.sub(r"\d+|_+", "", word) for word in tokenized_word]
    
    # remove other common words like project names
    # Unfortunately, this does not seem to improve performance at all, but do experiment!
    custom_stop_words = ['apache', 'aaa+', 'abc', 'bb+', 'cc+', 'zz+', 'xx+', 'yy+'] # 'org'
    if name != None:
        custom_stop_words += name.split('-')
    for stop_word in custom_stop_words:
        tokenized_word = [word if re.match(stop_word, word) == None else '' for word in tokenized_word]
        #tokenized_word = [re.sub(stop_word + '(\s|\.|$)', '', word) for word in tokenized_word]
        #text = re.sub(word + '[\\s\\.$]', '', text)   
        
    contains_non_english_char = False
    filtered_words=[]
    for word in tokenized_word:
        if None != re.match('[a-z][a-z]+$', word):
            if word not in stop_words:
                filtered_words.append(word)
        else:
            contains_non_english_char = True
    if contains_non_english_char:
        attributes.append('contains_non_english_char')
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in filtered_words]
    snowball_stemmer = SnowballStemmer("english")
    stemmed_words = [snowball_stemmer.stem(word) for word in lemmatized_words]
    
    return ' '.join(stemmed_words), attributes

In [8]:
# sandbox cell to inspect issues or test stuff

# project = Project.objects(name='commons-math').only('id').get()
# issue_system = IssueSystem.objects(project_id=project.id).only('id').get()
# issues = Issue.objects(issue_system_id=issue_system.id).only(*['issue_type','title','desc','priority','created_at'])

#preprocess_text_2('aaa abc hello sub zzz rüm', 'ant-ivy')
#re.match('[a-z][a-z]+$', 'aoã')

Create stemmed corpus without stopwords and links 

In [6]:
def generate_raw_data():
    names = ['ant-ivy','archiva','calcite','cayenne','commons-bcel','commons-beanutils','commons-codec','commons-collections','commons-compress','commons-configuration','commons-dbcp','commons-digester','commons-io','commons-jcs','commons-jexl','commons-lang','commons-math','commons-net','commons-rdf','commons-scxml','commons-validator','commons-vfs','deltaspike','eagle','giraph','gora','jspwiki','kylin','lens','mahout','manifoldcf','nutch','opennlp','parquet-mr','santuario-java','systemml','tika','wss4j']
    # Prepare csv file to store the information in
    csv_columns = ['issue_type','title_and_desc','priority', 'environment', 'original_time_estimate', 'created_at']
    csv_file = "issues.csv"
    f = open(csv_file, 'w+', newline='', encoding="utf8")
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()

    # corpus will consist of title and desc
    corpus = []

    # Keep tally of type frequencies
    all_types = ['None']
    all_count = dict()
    all_count['None'] = 0

    index = 0
    for name in names:
        project = Project.objects(name=name).only('id').get()
        issue_system = IssueSystem.objects(project_id=project.id).only('id').get()

        issue_features = ['original_time_estimate', 'created_at', 'priority', 'creator_id', 'title', 'desc', 'issue_type']
        # removed environment from extracted features

        for issue in Issue.objects(issue_system_id=issue_system.id).only(*issue_features):
            index += 1
            issue_dict = dict()
            t = issue.issue_type
            if t == None:
                all_count['None'] += 1
                #print_issue(issue)
                continue # all the relevant fields are None as well, so skip this issue
            if not issue.issue_type in all_types:
                all_types.append(issue.issue_type)
                all_count[issue.issue_type] = 1
            all_count[issue.issue_type] += 1
            issue_dict["issue_type"] = t
            if issue.title == None and issue.desc == None:
                print_issue(issue)
                continue

            issue_dict["title_and_desc"] = preprocess_text(Nonetostr(issue.title) + " " + Nonetostr(issue.desc))
            issue_dict["priority"] = issue.priority
            issue_dict["original_time_estimate"] = issue.original_time_estimate
            issue_dict["created_at"] = issue.created_at

            text = preprocess_text(Nonetostr(issue.title) + " " + Nonetostr(issue.desc), name)
            corpus.append(text)


            try:
                writer.writerow(issue_dict)
            except IOError:
                print("I/O error")

    f.close()
    return corpus

Create corpus without code snippets

In [7]:
def generate_raw_data_2():
    names = ['ant-ivy','archiva','calcite','cayenne','commons-bcel','commons-beanutils','commons-codec','commons-collections','commons-compress','commons-configuration','commons-dbcp','commons-digester','commons-io','commons-jcs','commons-jexl','commons-lang','commons-math','commons-net','commons-rdf','commons-scxml','commons-validator','commons-vfs','deltaspike','eagle','giraph','gora','jspwiki','kylin','lens','mahout','manifoldcf','nutch','opennlp','parquet-mr','santuario-java','systemml','tika','wss4j']
    # Prepare csv file to store the information in
    csv_columns = ['issue_type','title_and_desc','priority','created_at','code','exception','link','non_english_char']
    csv_file = "issues_2.csv"
    f = open(csv_file, 'w+', newline='', encoding="utf8")
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()

    # corpus will consist of title and desc
    corpus = []

    index = 0
    for name in names:
        project = Project.objects(name=name).only('id').get()
        issue_system = IssueSystem.objects(project_id=project.id).only('id').get()

        issue_features = ['original_time_estimate', 'created_at', 'priority', 'creator_id', 'title', 'desc', 'issue_type']
        # removed environment from extracted features

        for issue in Issue.objects(issue_system_id=issue_system.id).only(*issue_features):
            index += 1
            issue_dict = dict()
            t = issue.issue_type
            if t == None:
                if issue.title != None or issue.desc != None:
                    print_issue(issue)
                continue # all the relevant fields are None as well, so skip this issue
            issue_dict["issue_type"] = t
            if issue.title == None and issue.desc == None:
                continue

            issue_dict["title_and_desc"] = preprocess_text_2(Nonetostr(issue.title) + " " + Nonetostr(issue.desc))
            issue_dict["priority"] = issue.priority
            #issue_dict["original_time_estimate"] = issue.original_time_estimate
            issue_dict["created_at"] = issue.created_at

            text, flags = preprocess_text_2(Nonetostr(issue.title) + " " + Nonetostr(issue.desc), name)
            corpus.append(text)
            issue_dict['code'] = 'contains_code' in flags
            issue_dict['exception'] = 'contains_exception' in flags
            issue_dict['link'] = 'contains_link' in flags
            issue_dict['non_english_char'] = 'contains_non_english_char' in flags

            try:
                writer.writerow(issue_dict)
            except IOError:
                print("I/O error")

    f.close()
    return corpus

## File generation

Execute this cell with the generate-method and file names of your choice

In [8]:
corpus = generate_raw_data()

In [9]:
corpus = [s if type(s) == str else '' for s in corpus]
words, matrix = make_matrix(corpus, 1000)
save_npz('matrix_2', matrix)
with open("feature_names_2.txt", "wb") as fp:
        pickle.dump(words, fp)

In [13]:
# inspect the results
print(words)

['abil', 'abl', 'abstract', 'accept', 'access', 'accord', 'account', 'action', 'activ', 'actual', 'ad', 'adapt', 'adb', 'add', 'addit', 'address', 'admin', 'affect', 'agent', 'aggreg', 'aim', 'alert', 'algorithm', 'allow', 'alpha', 'alreadi', 'altern', 'alway', 'annot', 'anoth', 'ant', 'anyth', 'anywher', 'api', 'apidoc', 'app', 'appear', 'append', 'appli', 'applic', 'approach', 'appropri', 'archiv', 'arg', 'argument', 'arquillian', 'array', 'artifact', 'ask', 'assembl', 'assign', 'assum', 'attach', 'attempt', 'attribut', 'authent', 'author', 'auto', 'automat', 'auxiliari', 'avail', 'avatica', 'avoid', 'avro', 'awt', 'baafb', 'bad', 'bar', 'base', 'basic', 'batch', 'bean', 'beanutil', 'becom', 'begin', 'behavior', 'behaviour', 'believ', 'belong', 'best', 'beta', 'better', 'big', 'bin', 'binari', 'bind', 'bio', 'bit', 'block', 'bodi', 'boolean', 'box', 'branch', 'break', 'broken', 'buffer', 'bug', 'build', 'builder', 'built', 'bundl', 'button', 'byte', 'ca', 'cach', 'calcul', 'case', 'c

Execute this cell in order to load data for further manipulations

In [14]:
df = pd.read_csv("issues.csv")
mat = load_npz('matrix.npz')
type_list = df.issue_type.to_list()
corpus = df.title_and_desc.to_list()
corpus = [s if type(s) == str else '' for s in corpus] # there is one issue without description in the list


## Some statistics about the data

In [16]:
n_issues = len(corpus)
all_types = set(type_list)

all_count = dict()
for t in all_types:
    all_count[t] = type_list.count(t)
    
print(all_count)


rel_freq = all_count.copy()
for key in all_types:
    rel_freq[key] = all_count[key]/n_issues
print("Relative frequencies: " + str(rel_freq))


relevant_classes1000 = [t for t in all_types if all_count[t] >= 1000 ]
#[y if y == 'Bug' else 'Other' for y in y_train]
print(relevant_classes1000)
relevant_classes400 = [t for t in all_types if all_count[t] >= 400 ]
#[y if y == 'Bug' else 'Other' for y in y_train]
print(relevant_classes400)

{'Blogs - New Blog User Account Request': 1, 'Improvement': 12535, 'Planned Work': 1, 'Umbrella': 8, 'New Feature': 3283, 'Brainstorming': 2, 'Blog - New Blog Request': 3, 'Technical task': 5, 'Documentation': 71, 'Dependency upgrade': 9, 'New JIRA Project': 2, 'Temp': 1, 'Project': 1, 'Proposal': 1, 'Bug': 21129, 'Story': 12, 'New Git Repo': 1, 'Epic': 91, 'Sub-task': 1870, 'Wish': 550, 'Question': 85, 'Test': 281, 'Task': 2905, 'New TLP ': 1}
Relative frequencies: {'Blogs - New Blog User Account Request': 2.333831217326363e-05, 'Improvement': 0.2925457430918596, 'Planned Work': 2.333831217326363e-05, 'Umbrella': 0.00018670649738610905, 'New Feature': 0.0766196788648245, 'Brainstorming': 4.667662434652726e-05, 'Blog - New Blog Request': 7.001493651979089e-05, 'Technical task': 0.00011669156086631815, 'Documentation': 0.0016570201643017178, 'Dependency upgrade': 0.00021004480955937266, 'New JIRA Project': 4.667662434652726e-05, 'Temp': 2.333831217326363e-05, 'Project': 2.33383121732636

## Create new data structures in which raw classes are eliminated

In [16]:
def get_rid(class_arr, name_extension, df = df, n_words = 1000):
    df2 = df.copy()
    df2 = df2[df2.issue_type.isin(class_arr)]
    
    corpus2 = df2.title_and_desc.to_list()
    corpus2 = [s if type(s) == str else '' for s in corpus2]
    mat_corpus2 = make_matrix(corpus2, n_words)
    
    save_npz('matrix_reduced' + name_extension, mat_corpus2[1])
    with open('feature_names_reduced'+name_extension+'.txt', "wb") as fp:
        pickle.dump(mat_corpus2[0], fp)
    df2.to_csv('issues_reduced' + name_extension + '.csv')
    
    return df2

df1000 = get_rid(relevant_classes1000, '')
set(df1000.issue_type)


{'Bug', 'Improvement', 'New Feature', 'Sub-task', 'Task'}

In [17]:
df400 = get_rid(relevant_classes400, '_plus_wish')

## Upsampling

Apply various method to add new data with rare classes.

In [18]:
df_umbrella = df.loc[df['issue_type'] == 'Umbrella']
df_minority_upsampled = resample(df_umbrella, 
                                 replace=True,     # sample with replacement
                                 n_samples=400,    # to match majority class
                                 random_state=1505) # reproducible results
df_minority_upsampled

Unnamed: 0,issue_type,title_and_desc,priority,created_at,code,exception,link,non_english_char
38928,Umbrella,"('releas relat improv', ['contains_code', 'con...",Major,2017-01-31 05:09:46.563000,True,False,False,True
37488,Umbrella,('conveni umbrella track improv trigger work d...,Major,2016-03-22 18:38:17.828000,True,False,False,True
39430,Umbrella,('ml jira place understand ml machin learn pla...,Major,2017-07-19 09:24:31.756000,True,False,False,True
39430,Umbrella,('ml jira place understand ml machin learn pla...,Major,2017-07-19 09:24:31.756000,True,False,False,True
38802,Umbrella,"('releas relat improv', ['contains_code', 'con...",Major,2016-10-05 02:00:12.390000,True,False,False,True
...,...,...,...,...,...,...,...,...
39442,Umbrella,"('creat issu type umbrella', ['contains_code',...",Trivial,2017-07-26 00:07:49.116000,True,False,False,True
39331,Umbrella,('exampl includ notebook exampl document inclu...,Major,2017-05-12 00:23:48.910000,True,False,False,False
39442,Umbrella,"('creat issu type umbrella', ['contains_code',...",Trivial,2017-07-26 00:07:49.116000,True,False,False,True
37488,Umbrella,('conveni umbrella track improv trigger work d...,Major,2016-03-22 18:38:17.828000,True,False,False,True
