In [118]:
import pandas as pd
import numpy as np
import re
import dill
import tqdm
import gc

In [2]:
# Load the labelled GitHub issue / PR dataset from the working directory.
# The displayed head shows an 'Unnamed: 0' column next to 'description' and 'label'.
df = pd.read_csv('GH_processed_labeled_issues_prs.csv')
df.head()

Unnamed: 0,description,label
0,pip should not execute arbitrary code from the...,2
1,Make load safe_load - [ ] Make `yaml.load` def...,2
2,"CSRF vulnerability CVE-2013-7259 Hi, I'm looki...",2
3,"Add salt to Array#hash For #2437, we partially...",2
4,Cross Site Scripting Vulnerability Versions up...,2


In [4]:
from sklearn.model_selection import train_test_split

# Raw issue/PR text is the model input, the integer class id is the target.
X = df['description'].values
y = df['label'].values

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is not stratified; with a very rare class consider
# passing stratify=y (this would change the partition and all later outputs).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((481390,), (120348,), (481390,), (120348,))

In [5]:
from collections import Counter
# Show how many samples of each label ended up in each partition.
for split_name, split_labels in (('Train:', y_train), ('Test:', y_test)):
    print(split_name, Counter(split_labels))

Train: Counter({0: 413990, 1: 66929, 2: 471})
Test: Counter({0: 103497, 1: 16744, 2: 107})


In [108]:
import contractions
from bs4 import BeautifulSoup
import unicodedata
import re


def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    ``iframe`` and ``script`` elements are removed together with their
    contents, the document is reduced to its text via ``get_text()``, and
    every run of carriage returns / line feeds is collapsed into a single
    newline character.

    Parameters
    ----------
    text : str
        Raw (possibly HTML) text.

    Returns
    -------
    str
        Text with markup removed and line breaks collapsed.
    """
    soup = BeautifulSoup(text, "lxml")
    # Plain loop instead of a throw-away list comprehension used for side effects.
    for tag in soup(['iframe', 'script']):
        tag.extract()
    stripped_text = soup.get_text()
    # The previous call passed re.I as the 4th positional argument of re.sub,
    # which is `count`, so only the first two line-break runs were collapsed.
    # The old character class also contained a literal '|', which is dropped
    # here so that pipes are no longer turned into newlines.
    return re.sub(r'[\r\n]+', '\n', stripped_text)


def remove_urls(text):
    """Replace every URL-like span in ``text`` with a single space.

    A span must start with ``http://``, ``https://`` or ``www.``; matching is
    case-insensitive. The pattern tolerates whitespace inside host and path
    segments, so it may also consume words that directly follow a URL when a
    later ``.`` or ``/`` is reachable through word/space characters only.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with all matched URL spans replaced by ``' '``.
    """
    # Raw string: same regex as before, without invalid-escape warnings.
    url_pattern = r'((https?:\/\/)(\s)*(www\.)?|(www\.))(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*'
    # flags must be passed by keyword: the 4th positional argument of re.sub
    # is `count`, so passing re.I there limited removal to the first 2 URLs
    # and never enabled case-insensitive matching.
    return re.sub(url_pattern, ' ', text, flags=re.I)

def remove_checklists(text):
    """Replace markdown checklist markers with a single space.

    A marker is a pair of square brackets enclosing exactly one of: ``x``,
    ``X``, ``.`` or a whitespace character (e.g. ``[ ]``, ``[x]``).

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with every checklist marker replaced by ``' '``.
    """
    checklist_pattern = r'\[[xX\.\s]\]'
    # flags must be passed by keyword: positionally they were interpreted as
    # `count` (re.I | re.DOTALL == 18), so only the first 18 markers of a
    # document were removed.
    return re.sub(checklist_pattern, ' ', text, flags=re.I | re.DOTALL)

def remove_accented_chars(text):
    """Reduce ``text`` to ASCII by dropping everything that is not ASCII.

    The text is NFKD-normalized first, so an accented letter is split into a
    base letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    discards the marks (and any other non-ASCII character) while keeping the
    base letter.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded



def remove_special_characters(text, remove_digits=False):
    """Replace every character outside the allowed set with a space.

    Allowed characters are ASCII letters and whitespace, plus the digits
    ``0-9`` unless ``remove_digits`` is truthy.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, ' ', text)


def pre_process_document(document):
    """Normalize one raw issue/PR description into a cleaned string.

    Steps, in order: strip HTML, remove URLs, remove checklist markers,
    expand contractions, lower-case, turn newlines/tabs/carriage returns into
    spaces, drop non-ASCII characters, pad a few punctuation characters with
    spaces, replace all remaining non-alphanumeric characters with spaces,
    remove stand-alone numbers, and finally collapse repeated spaces and trim.
    """
    # Markup-level cleanup happens on the original casing.
    text = strip_html_tags(document)
    text = remove_urls(text)
    text = remove_checklists(text)
    text = expand_contractions(text)

    text = text.lower()

    # Line breaks and tabs become plain spaces.
    text = text.translate(str.maketrans("\n\t\r", "   "))

    text = remove_accented_chars(text)

    # Surround selected punctuation with spaces, then blank out every
    # character that is not a letter, digit or whitespace.
    text = re.sub(r'([{.(-)!}])', " \\1 ", text)
    text = remove_special_characters(text, remove_digits=False)

    # Tokens consisting only of digits are dropped; digits inside words stay.
    text = re.sub(r'\b\d+\b', ' ', text)

    # Squeeze runs of spaces and trim both ends.
    return re.sub(' +', ' ', text).strip()


# Element-wise wrapper so pre_process_document can be applied to a whole array of documents.
pre_process_corpus = np.vectorize(pyfunc=pre_process_document)

In [172]:
# Run a garbage-collection pass before the training-set preprocessing loop.
gc.collect()

18229

In [173]:
# Normalize every training document and persist the results in chunks.
# The output file is opened once in 'wb' mode so that re-running this cell
# overwrites Xtrain_norm.pkl instead of appending duplicate chunks to it
# (the previous 'ab+' mode made the cell non-idempotent).
# Each chunk is written as its own pickle record, so the file has to be read
# back with repeated dill.load calls until EOF.
chunk_size = 50000
docs = []
with open('Xtrain_norm.pkl', 'wb') as f:
    # total= lets tqdm show a percentage/ETA, since enumerate() has no len().
    for idx, record in tqdm.tqdm(enumerate(X_train), total=len(X_train)):
        doc = pre_process_document(record)
        docs.append(doc)
        # Flush on every full chunk and once more for the final partial chunk.
        if ((idx + 1) % chunk_size == 0) or ((idx + 1) == len(X_train)):
            dill.dump(docs, f)
            print('Processed and stored {} docs'.format(len(docs)))
            docs = []

50033it [02:10, 234.99it/s]

Processed and stored 50000 docs


100004it [04:22, 219.67it/s]

Processed and stored 50000 docs


150048it [06:39, 259.77it/s]

Processed and stored 50000 docs


200038it [08:48, 260.91it/s]

Processed and stored 50000 docs


250069it [10:55, 284.75it/s]

Processed and stored 50000 docs


300028it [13:09, 237.82it/s]

Processed and stored 50000 docs


350040it [15:23, 228.85it/s]

Processed and stored 50000 docs


400035it [17:34, 236.44it/s]

Processed and stored 50000 docs


450047it [19:44, 276.53it/s]

Processed and stored 50000 docs


481390it [21:04, 380.56it/s]

Processed and stored 31390 docs





In [174]:
# Run a garbage-collection pass before the test-set preprocessing loop.
gc.collect()

15514

In [175]:
# Normalize every test document and persist the results in chunks.
# The output file is opened once in 'wb' mode so that re-running this cell
# overwrites Xtest_norm.pkl instead of appending duplicate chunks to it
# (the previous 'ab+' mode made the cell non-idempotent).
# Each chunk is written as its own pickle record, so the file has to be read
# back with repeated dill.load calls until EOF.
chunk_size = 50000
docs = []
with open('Xtest_norm.pkl', 'wb') as f:
    # total= lets tqdm show a percentage/ETA, since enumerate() has no len().
    for idx, record in tqdm.tqdm(enumerate(X_test), total=len(X_test)):
        doc = pre_process_document(record)
        docs.append(doc)
        # Flush on every full chunk and once more for the final partial chunk.
        if ((idx + 1) % chunk_size == 0) or ((idx + 1) == len(X_test)):
            dill.dump(docs, f)
            print('Processed and stored {} docs'.format(len(docs)))
            docs = []

50029it [02:04, 234.28it/s]

Processed and stored 50000 docs


100025it [04:17, 210.21it/s]

Processed and stored 50000 docs


120348it [05:10, 387.22it/s]

Processed and stored 20348 docs





In [176]:
# Run a garbage-collection pass before the label arrays are written to disk.
gc.collect()

11190

In [177]:
# Persist the label arrays of both partitions, one pickle file per split
# (train first, then test); existing files are overwritten.
for label_path, label_array in (('ytrain_labels.pkl', y_train),
                                ('ytest_labels.pkl', y_test)):
    with open(label_path, 'wb') as f:
        dill.dump(label_array, f)

In [181]:
# Final garbage-collection pass after all artifacts have been written.
gc.collect()

0