In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
import re
import unicodedata as ud
from nltk.corpus import wordnet as wn
from nltk.corpus import words as w
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import os
# Display option: do not truncate long cell contents (issue titles/bodies)
# when a DataFrame is rendered in the notebook.
pd.set_option(
    'display.max_colwidth',
    None,
)

In [2]:
# Read the raw train and test splits (train first, then test).
train, test = map(pd.read_csv, (
    'original_datasets/github-labels-top3-803k-train.csv',
    'original_datasets/github-labels-top3-803k-test.csv',
))
# Quick sanity check: (rows, columns) of each split.
print(train.shape, test.shape)

(722899, 8) (80518, 8)


In [3]:
# Preview the first five raw training rows.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,issue_title,issue_body
0,0,https://api.github.com/repos/eamodio/vscode-gitlens/issues/1302,bug,2021-01-02T18:07:30Z,NONE,https://api.github.com/repos/eamodio/vscode-gitlens,Welcome screen on every editor window is very tedious,"I just discovered Gitlens and find the functionality useful, thank you to all who contribute.\r\n\r\nI have about a dozen editor windows open, and the install process added a Gitlens welcome tab to each and every one of them. Combined with the snowflake effect, all of the sudden VScode was consuming 300-400% cpu and my fan was raging, as soon as I hunted them all down everything was back to fine. The welcome note content is great (although putting it on _all_ the windows is a bit much, don't know how much control you have on that). But overall it was a bit of a sour first-use experience, just wanted to provide that feedback."
1,1,https://api.github.com/repos/binwiederhier/pcopy/issues/9,bug,2020-12-31T18:19:31Z,OWNER,https://api.github.com/repos/binwiederhier/pcopy,"""pcopy invite"" and ""pcopy paste abc:"" does not check if clipboard exists",
2,2,https://api.github.com/repos/binwiederhier/pcopy/issues/16,bug,2021-01-03T04:33:36Z,OWNER,https://api.github.com/repos/binwiederhier/pcopy,"UI: Modal overlay is half transparent, shouldn't be",
3,3,https://api.github.com/repos/Sothatsit/RoyalUrClient/issues/41,enhancement,2020-12-25T00:46:00Z,OWNER,https://api.github.com/repos/Sothatsit/RoyalUrClient,Make the loading screen scale with browser window size,"Currently the loading wheel is a fixed size in pixels, but it would be better to specify it in terms of percentage of the browser size."
4,4,https://api.github.com/repos/Malivil/TTT-Custom-Roles/issues/190,bug,2021-01-02T21:36:57Z,OWNER,https://api.github.com/repos/Malivil/TTT-Custom-Roles,Spectator - Investigate a way to strip weapons before they are spectating a player,To bring magneto stick floating


In [4]:
# For both splits: keep an untouched copy of the raw title/body, then rename
# the working columns so that later cells preprocess them in place.
for frame in (train, test):
    frame['original_issue_title'] = frame['issue_title']
    frame['original_issue_body'] = frame['issue_body']
    frame.rename(
        columns={
            'issue_title': 'preprocessed_title',
            'issue_body': 'preprocessed_body',
        },
        inplace=True,
    )

In [5]:
# --- Column names used throughout the notebook ---
# Existing columns of the issue dataframes (title/body after the rename above).
label = 'issue_label'
time = 'issue_created_at'
repo = 'repository_url'
title  = 'preprocessed_title'
body = 'preprocessed_body'
author = 'issue_author_association'
url = 'issue_url'
# Columns created later: integer label codes and the concatenated model input.
label_col = 'labels'
text_col = 'text'
# Maximum number of whitespace-separated tokens kept when truncating
# the title and the body, respectively.
max_title = 30
max_body = 170
# Characters that are replaced by a space during punctuation removal.
# Written as a raw string because the original non-raw literal raised
# "SyntaxWarning: invalid escape sequence '\('". In the raw form the backslash
# (like the backtick) appears more than once, which is harmless: every listed
# character is mapped to a space.
punctuations = r'!"$%&\()*,/:;<=>[\\]^`{|}~+#@-`'
# Matches any single character outside the 7-bit ASCII range.
ascii_regex = re.compile(r'[^\x00-\x7f]')
# Matches issue references: '#' followed by one or more digits (e.g. "#123").
issue_regex = re.compile(r'#[0-9]+')
# Matches call-like snippets: a name starting with a letter (letters, digits,
# '_' and '.' allowed after that) followed by a parenthesised list made only
# of letters, digits, '_', ',' and spaces (e.g. "foo.bar(a, b)").
function_regex = re.compile(r'[a-zA-Z][a-zA-Z0-9_.]*\([a-zA-Z0-9_, ]*\)')

First, deduplicate the TRAINING dataset based on issue URLs.

In [6]:
# Keep a single row per issue URL in the training split (rows are sorted by
# URL first; the first row of each URL group is kept). `.copy()` detaches the
# result from `train` so the in-place column edits below are safe.
dedup_train = train.sort_values(url).drop_duplicates(subset=[url]).copy()
print('Number of dropped issue duplications: ' , train.shape[0] - dedup_train.shape[0])

# Cast every column that ends up in the model text to plain strings.
# Missing values are replaced by '' first: `astype(str)` alone converts NaN to
# the literal string 'nan', which would then leak into the text as a fake
# token (e.g. "... body nan") for issues without a body.
for frame in (dedup_train, test):
    for column in (title, body, author, time, repo):
        frame[column] = frame[column].fillna('').astype(str)

Number of dropped issue duplications:  26220


Normalize the text columns of the issues.

In [7]:
# The two splits and the free-text columns that are normalised identically.
frames = (dedup_train, test)
free_text_columns = (title, body)

# Call-like snippets such as "foo(a, b)" in the body become the token "function".
print('Replacing functions...')
for frame in frames:
    frame[body] = frame[body].apply(lambda text: function_regex.sub(" function ", text))

# Issue references such as "#123" become the token "issue" in title and body.
print('Replacing issue numbers...')
for frame in frames:
    for column in free_text_columns:
        frame[column] = frame[column].apply(lambda text: issue_regex.sub(" issue ", text))


# Lower-casing happens last so the regexes above see the original casing.
print('Converting to lower case...')
for frame in frames:
    for column in free_text_columns:
        frame[column] = frame[column].str.lower()

Replacing functions...
Replacing issue numbers...
Converting to lower case...


remove extra information from text

In [8]:
# Replace every character listed in `punctuations` with a single space.
print('Removing punctuations...')
replace_string = ' '*len(punctuations)
# Build the translation table once instead of once per column.
punctuation_table = str.maketrans(punctuations, replace_string)
for frame in (dedup_train, test):
    for column in (title, body):
        frame[column] = frame[column].str.translate(punctuation_table)

# Decompose characters with NFD *before* dropping non-ASCII code points.
# In the opposite order the normalisation is a no-op (a pure-ASCII string is
# already in NFD form) and accented letters are deleted outright
# ("café" -> "caf"). Normalising first splits "é" into "e" plus a combining
# accent, so only the accent is removed ("café" -> "cafe").
print('Removing non-ascii charachters...')
for frame in (dedup_train, test):
    for column in (title, body):
        frame[column] = frame[column].apply(
            lambda text: ascii_regex.sub('', ud.normalize('NFD', text))
        )

# Strip the constant API prefix so only "<owner>/<repository>" remains.
print('Replacing fixed part of repo URl column...')
for frame in (dedup_train, test):
    frame[repo] = frame[repo].apply(lambda x: x.replace('https://api.github.com/repos/', ''))

# Collapse every run of whitespace (including \r\n) into one space and trim.
print('Replacing white spaces...')
for frame in (dedup_train, test):
    for column in (title, body):
        frame[column] = frame[column].apply(lambda text: " ".join(text.split()))

Removing punctuations...
Removing non-ascii charachters...
Replacing fixed part of repo URl column...
Replacing white spaces...


truncate columns

In [9]:
def _first_tokens(text, limit):
    """Return `text` reduced to its first `limit` whitespace-separated tokens."""
    # maxsplit=limit yields at most limit + 1 pieces; the last piece (the
    # unsplit remainder) is discarded by the slice.
    pieces = text.split(maxsplit=limit)
    return ' '.join(pieces[:limit])


# Apply the token budget of each column to both splits.
for frame in (dedup_train, test):
    for column, limit in ((title, max_title), (body, max_body)):
        frame[column] = frame[column].apply(lambda text, limit=limit: _first_tokens(text, limit))

prepare labels column

In [10]:
# Convert the categorical issue labels to integer codes.
# Both splits must share ONE category list: building each Categorical on its
# own derives the codes from whichever labels happen to occur in that frame,
# so a label missing from one split would silently shift that split's codes
# and make train/test codes disagree. Sorting reproduces the default category
# order pandas would pick, so the codes are unchanged whenever both splits
# already contain the same labels.
label_categories = sorted(set(dedup_train[label].dropna()) | set(test[label].dropna()))
for frame in (dedup_train, test):
    frame[label] = pd.Categorical(frame[label], categories=label_categories)
    frame[label_col] = frame[label].cat.codes

Concatenate the issue columns into a single "text" column to feed the model.

In [11]:
# Build the model input for both splits: each field is preceded by its name
# ("time", "author", "repo", "title", "body") as a marker token.
for frame in (dedup_train, test):
    frame[text_col] = (
        'time ' + frame[time]
        + ' author ' + frame[author]
        + ' repo ' + frame[repo]
        + ' title ' + frame[title]
        + ' body ' + frame[body]
    )

In [12]:
# Inspect one fully preprocessed training row.
dedup_train.head(n=1)

Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,preprocessed_title,preprocessed_body,original_issue_title,original_issue_body,labels,text
258501,287489,https://api.github.com/repos/0-Yama/Projet-Final-Python/issues/3,enhancement,2021-05-26T20:57:00Z,OWNER,0-Yama/Projet-Final-Python,the beginning,we need to add the first file of our project,The beginning,We need to add the first file of our project,1,time 2021-05-26T20:57:00Z author OWNER repo 0-Yama/Projet-Final-Python title the beginning body we need to add the first file of our project


In [13]:
# Inspect one fully preprocessed test row.
test.head(n=1)

Unnamed: 0.1,Unnamed: 0,issue_url,issue_label,issue_created_at,issue_author_association,repository_url,preprocessed_title,preprocessed_body,original_issue_title,original_issue_body,labels,text
0,6,https://api.github.com/repos/tlnagy/TIFF.jl/issues/7,enhancement,2020-04-07T09:08:50Z,NONE,tlnagy/TIFF.jl,error keyerror key tiff.sampleformat_int 0x0008 not found,one more error might need to be caught. 4d series.ome.tif is sample file from ome tiff website https docs.openmicroscopy.org ome model 6.0.0 ome tiff data.html . julia julia tiff.load 4d series.ome.tif error keyerror key tiff.sampleformat_int 0x0008 not found stacktrace 1 getindex dict tuple tiff.sampleformats int64 datatype tuple tiff.sampleformats uint16 at . dict.jl 477 2 output tiff.ifd uint32 at home hf .julia dev tiff src ifds.jl 113 3 load string at home hf .julia dev tiff src load.jl 14 4 top level scope at repl 2 1,"ERROR: KeyError: key (TIFF.SAMPLEFORMAT_INT, 0x0008) not found","One more error might need to be caught.\r\n`4D-series.ome.tif` is sample file from [OME-TIFF website]( https://docs.openmicroscopy.org/ome-model/6.0.0/ome-tiff/data.html).\r\n\r\n```julia\r\njulia> TIFF.load(""4D-series.ome.tif"")\r\nERROR: KeyError: key (TIFF.SAMPLEFORMAT_INT, 0x0008) not found\r\nStacktrace:\r\n [1] getindex(::Dict{Tuple{TIFF.SampleFormats,Int64},DataType}, ::Tuple{TIFF.SampleFormats,UInt16}) at ./dict.jl:477\r\n [2] output(::TIFF.IFD{UInt32}) at /home/hf/.julia/dev/TIFF/src/ifds.jl:113\r\n [3] load(::String) at /home/hf/.julia/dev/TIFF/src/load.jl:14\r\n [4] top-level scope at REPL[2]:1\r\n```\r\n",1,time 2020-04-07T09:08:50Z author NONE repo tlnagy/TIFF.jl title error keyerror key tiff.sampleformat_int 0x0008 not found body one more error might need to be caught. 4d series.ome.tif is sample file from ome tiff website https docs.openmicroscopy.org ome model 6.0.0 ome tiff data.html . julia julia tiff.load 4d series.ome.tif error keyerror key tiff.sampleformat_int 0x0008 not found stacktrace 1 getindex dict tuple tiff.sampleformats int64 datatype tuple tiff.sampleformats uint16 at . 
dict.jl 477 2 output tiff.ifd uint32 at home hf .julia dev tiff src ifds.jl 113 3 load string at home hf .julia dev tiff src load.jl 14 4 top level scope at repl 2 1


Reset the indices and save the preprocessed train and test datasets.

In [14]:
# Save the detailed files: raw and preprocessed title/body, the concatenated
# model text, and the label in both string and integer form.
output_dir = 'nlbse22_dataset_nlbse22_preprocessing'
# to_csv does not create missing folders; without this a fresh checkout fails
# here, after all the preprocessing work has already been done.
os.makedirs(output_dir, exist_ok=True)

detailed_columns = [
    'original_issue_title', 'preprocessed_title',
    'original_issue_body', 'preprocessed_body',
    'text', 'issue_label', 'labels',
]
for split_name, frame in (('train', dedup_train), ('test', test)):
    # reset index: deduplication left the training frame with a sorted, gappy index
    frame.reset_index(drop=True, inplace=True)
    frame[detailed_columns].to_csv(
        f'{output_dir}/{split_name}_clean_concat_{max_title+max_body}_detailed_data.csv',
        index=False,
    )

In [None]:
# Save the compact model-input files: only the concatenated text and the
# integer label code of each issue.
output_dir = 'nlbse22_dataset_nlbse22_preprocessing'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
for split_name, frame in (('train', dedup_train), ('test', test)):
    frame[['text','labels']].to_csv(
        f'{output_dir}/{split_name}_clean_concat_{max_title+max_body}.csv',
        index=False,
    )