In [3]:
import numpy as np
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re
from concurrent import futures
import threading
import pandas as pd

def strip_html_tags(text):
    """Return the text content of an HTML fragment with line breaks normalized.

    The input is parsed with BeautifulSoup (lxml parser), every ``iframe``
    and ``script`` element is removed from the tree, and the remaining text
    is extracted. Each run of carriage returns / newlines is then collapsed
    to a single ``'\\n'``.

    Args:
        text: Raw document text, possibly containing HTML markup.

    Returns:
        The extracted text as a string.
    """
    soup = BeautifulSoup(text, "lxml")
    # Plain loop instead of a throwaway list comprehension: extract() is
    # called purely for its side effect of detaching the tag from the tree.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # Two fixes relative to the previous call:
    #  * re.I used to be passed positionally, which is re.sub's `count`
    #    argument, so only the first two runs were ever replaced.
    #  * '|' inside a character class is a literal pipe, not alternation, so
    #    pipe characters were being rewritten as newlines as well.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text


def remove_urls(text):
    """Replace every URL-like substring of ``text`` with a single space.

    A match must start with ``http://``, ``https://`` or ``www.`` (matched
    case-insensitively) and extends over the host, path and simple
    ``key=value`` query parts accepted by the pattern below.

    Args:
        text: Input string.

    Returns:
        ``text`` with each match replaced by ``' '``.
    """
    # Raw string literals: the pattern is unchanged, but escapes such as '\/'
    # and '\s' are no longer invalid string escape sequences.
    url_pattern = (
        r'((https?:\/\/)(\s)*(www\.)?|(www\.))(\s)*((\w|\s)+\.)*'
        r'([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*'
    )
    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I there limited the call to two
    # replacements and never enabled case-insensitive matching.
    return re.sub(url_pattern, ' ', text, flags=re.I)


def remove_checklists(text):
    """Replace every checklist marker in ``text`` with a single space.

    A marker is a pair of square brackets around exactly one of ``x``,
    ``X``, ``.`` or a whitespace character, e.g. ``[x]``, ``[ ]``, ``[.]``.

    Args:
        text: Input string.

    Returns:
        ``text`` with each marker replaced by ``' '``.
    """
    checklist_pattern = r'\[[xX\.\s]\]'
    # No flags are needed: the class already lists both cases of 'x' and the
    # pattern has no unescaped '.', so re.I and re.DOTALL had no effect.
    # Previously they were passed positionally as `count` (re.I | re.DOTALL
    # == 18), so only the first 18 markers were removed.
    return re.sub(checklist_pattern, ' ', text)


def remove_accented_chars(text):
    """Return ``text`` reduced to its ASCII characters.

    The string is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``. Accented letters
    therefore keep their base letter while the combining marks, and any
    other non-ASCII characters, are dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def expand_contractions(text):
    """Return ``contractions.fix(text)``.

    Thin wrapper around the third-party ``contractions`` package; the exact
    expansion rules are defined by that package, not here.
    """
    expanded = contractions.fix(text)
    return expanded


def remove_special_characters(text, remove_digits=False):
    """Replace disallowed characters in ``text`` with spaces.

    Args:
        text: Input string.
        remove_digits: When False (default), ASCII letters, digits, ``/``
            and whitespace are kept. When True, only ASCII letters and
            whitespace are kept (digits and ``/`` are replaced too).

    Returns:
        ``text`` with every other character replaced by ``' '``.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9/\s]'
    return re.sub(pattern, ' ', text)


def pre_process_document(document):
    """Normalize one raw document string.

    Steps, in order: strip HTML, remove URLs, remove checklist markers,
    lower-case, turn newlines/tabs/carriage returns into spaces, drop
    non-ASCII characters, collapse runs of spaces and trim both ends.
    Contraction expansion, special-character removal and number removal are
    present but disabled (commented out).

    Args:
        document: Raw text of a single document.

    Returns:
        The normalized text.
    """
    # Markup-level cleanup: HTML first, then URLs, then checklist markers.
    for cleanup_step in (strip_html_tags, remove_urls, remove_checklists):
        document = cleanup_step(document)

    # Disabled: expand contractions.
    #document = expand_contractions(document)

    lowered = document.lower()

    # Newline, tab and carriage return each map to one space, so noisy
    # multi-line text ends up on a single line.
    flattened = lowered.translate(str.maketrans("\n\t\r", "   "))

    ascii_only = remove_accented_chars(flattened)

    # Disabled: isolate special characters with spaces, then remove special
    # characters and/or digits.
    #special_char_pattern = re.compile(r'([{.(-)!}])')
    #document = special_char_pattern.sub(" \\1 ", document)
    #document = remove_special_characters(document, remove_digits=False)

    # Disabled: remove standalone numbers.
    #document = re.sub(r'\b\d+\b', '', document)

    # Collapse repeated spaces and trim leading/trailing whitespace.
    collapsed = re.sub(' +', ' ', ascii_only)
    return collapsed.strip()


def parallel_preprocessing(idx, doc, total_docs):
    """Pre-process one document, printing a progress line now and then.

    A progress line naming the current thread is printed for every 5000th
    index and for the final index (``total_docs - 1``).

    Args:
        idx: Position of ``doc`` in the full collection.
        doc: Raw document text.
        total_docs: Size of the full collection.

    Returns:
        The result of ``pre_process_document(doc)``.
    """
    is_last_doc = idx == (total_docs - 1)
    if is_last_doc or idx % 5000 == 0:
        worker_name = threading.current_thread().name
        print('{}: working on doc num: {}'.format(worker_name, idx))
    return pre_process_document(doc)


def pre_process_documents_parallel(documents):
    """Pre-process every document using a thread pool.

    Args:
        documents: Sized, iterable collection of raw document strings
            (must support ``len()``).

    Returns:
        A list of normalized documents in the same order as ``documents``.
    """
    total_docs = len(documents)
    print('preprocessing: starting')
    # The `with` block shuts the executor down once mapping is finished (or
    # if a worker raises), so its worker threads are not left running.
    with futures.ThreadPoolExecutor(max_workers=None) as ex:
        # map() yields results in input order; list() forces all of them to
        # be computed before the executor is closed.
        norm_descriptions = list(ex.map(parallel_preprocessing,
                                        range(total_docs),
                                        documents,
                                        [total_docs] * total_docs))
    return norm_descriptions

In [4]:
# Load the labeled dataset (GitHub issues/PRs, going by the file name) and
# print a summary of its columns, dtypes and non-null counts.
df = pd.read_csv('./data/GH_complete_labeled_issues_prs.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152151 entries, 0 to 152150
Data columns (total 2 columns):
description    152151 non-null object
label          152151 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [5]:
# Pull the raw description texts and their labels out as NumPy arrays.
docs = df['description'].values
labels = df['label'].values

In [6]:
%%time

# Normalize every description; the printed length should equal the number of
# input documents.
norm_docs = pre_process_documents_parallel(documents=docs)
print(len(norm_docs))

preprocessing: starting
ThreadPoolExecutor-0_0: working on doc num: 0
ThreadPoolExecutor-0_4: working on doc num: 5000
ThreadPoolExecutor-0_11: working on doc num: 10000
ThreadPoolExecutor-0_18: working on doc num: 15000
ThreadPoolExecutor-0_3: working on doc num: 20000
ThreadPoolExecutor-0_8: working on doc num: 25000
ThreadPoolExecutor-0_13: working on doc num: 30000
ThreadPoolExecutor-0_16: working on doc num: 35000
ThreadPoolExecutor-0_10: working on doc num: 40000
ThreadPoolExecutor-0_16: working on doc num: 45000
ThreadPoolExecutor-0_11: working on doc num: 50000
ThreadPoolExecutor-0_19: working on doc num: 55000
ThreadPoolExecutor-0_3: working on doc num: 60000
ThreadPoolExecutor-0_9: working on doc num: 65000
ThreadPoolExecutor-0_15: working on doc num: 70000
ThreadPoolExecutor-0_12: working on doc num: 75000
ThreadPoolExecutor-0_16: working on doc num: 80000
ThreadPoolExecutor-0_4: working on doc num: 85000
ThreadPoolExecutor-0_16: working on doc num: 90000
ThreadPoolExecutor-

In [7]:
# Pair the normalized descriptions with the original labels and summarize the
# resulting frame.
new_df = pd.DataFrame({'description': norm_docs, 
                       'label': labels})
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152151 entries, 0 to 152150
Data columns (total 2 columns):
description    152151 non-null object
label          152151 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [8]:
# Preview the first rows of the normalized data.
new_df.head()

Unnamed: 0,description,label
0,openshift-node is logging private rsa keys to ...,2
1,issue with auth not logging when it fails ther...,2
2,webhook secrets are vulnerable to timing attac...,2
3,sql: support placeholders for identifiers exam...,2
4,syscall: guard against windows dll preloading ...,2


In [9]:
# Class distribution of the labels (the recorded output is heavily imbalanced).
new_df.label.value_counts()

0    128908
1     22572
2       671
Name: label, dtype: int64

In [26]:
# Spot-check a single row: in the recorded output its description is blank,
# so preprocessing can leave a document with no text at all.
# NOTE(review): this cell's execution count (26) is higher than that of the
# save cell after it (25), i.e. the cells were run out of order -- re-run the
# notebook top to bottom to confirm the saved file matches this state.
new_df.iloc[1915]

description     
label          0
Name: 1915, dtype: object

In [25]:
# Persist the normalized descriptions and labels (no index column, UTF-8).
# NOTE(review): rows whose description is blank after preprocessing are
# written as empty fields; pd.read_csv presumably loads those back as NaN by
# default -- verify wherever this file is read.
new_df.to_csv('./data/GH_complete_labeled_issues_prs - preprocessed.csv', index=False, encoding='utf-8')