In [10]:
import pandas as pd

In [11]:
train_file_path = r"C:\Users\upech\Documents\MLforMDE\issue-report-classification\data\issues_train.csv"
test_file_path = r"C:\Users\upech\Documents\MLforMDE\issue-report-classification\data\issues_test.csv"

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

In [5]:
!pip install ekphrasis



In [12]:
train_df.columns

Index(['repo', 'created_at', 'label', 'title', 'body'], dtype='object')

In [6]:
!pip install scikit-learn



In [4]:
from sklearn.preprocessing import LabelEncoder

from ekphrasis.dicts.emoticons import emoticons
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.classes.preprocessor import TextPreProcessor

from tqdm import tqdm

In [7]:
def get_ekphrasis_preprocessor():
    return TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
    )

In [14]:
import re

label_to_int = {
    "bug": 0,
    "feature": 1,
    "question": 2
}

image_regex = re.compile('!(.*).*')
link_regex_1 = re.compile('(.*).*')
link_regex_2 = re.compile('(.*): [^\s]+')
code_regex = re.compile('(:?`[^`]+`|```[^`]*```)')

def perform_preprocessing(text, preprocessor):
    text = " ".join(text.split())
    text = text.lower()
    cleaned_text = re.sub(image_regex, r'\1 ', text)
    cleaned_text = re.sub(link_regex_1, r'\1 ', cleaned_text)
    cleaned_text = re.sub(link_regex_2, r'\1 ', cleaned_text)
    cleaned_text = re.sub(code_regex, r'\1 ', cleaned_text)

    cleaned_text = " ".join(preprocessor.pre_process_doc(cleaned_text))
    
    return cleaned_text

def preprocess_dataframe(df):
    preprocessor = get_ekphrasis_preprocessor()
    cleaned_df = df.copy()
    cleaned_df["issue_text"] = cleaned_df.apply(lambda x: str(x["title"]) + " " + str(x["body"]), axis=1)
    label_encoder = lambda x: label_to_int[x]
    cleaned_df["label"] = [label_encoder(x) for x in cleaned_df['label']]
    cleaned_df = cleaned_df[["repo", "issue_text", "label"]]
    cleaned_df["issue_text"] = cleaned_df["issue_text"].apply(lambda x: perform_preprocessing(x, preprocessor))
    
    return cleaned_df

cleaned_train_df = preprocess_dataframe(train_df)
cleaned_test_df = preprocess_dataframe(test_df)


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...
Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [15]:
cleaned_train_df.to_csv("preprocessed_issues_train.csv", index=False)
cleaned_test_df.to_csv("preprocessed_issues_test.csv", index=False)


In [18]:
cleaned_train_df.shape
cleaned_test_df.shape

(1500, 3)