In [1]:
import pandas as pd
import regex as re
import unicodedata
import string
import nltk
import numpy as np
import spacy

In [2]:
# Load the raw input CSV; the path is relative to the notebook's working directory.
RAW_DATA_CSV = 'employer_raw_data_group_1.csv'
df = pd.read_csv(RAW_DATA_CSV)

In [3]:
# Fetch the spaCy English model and the NLTK stop-word corpus that the
# following cells load (spacy.load('en_core_web_sm') and
# nltk.corpus.stopwords).
# `>> /dev/null` discards stdout only; anything the commands write to stderr
# is still shown in the cell output.
!python -m spacy download en_core_web_sm >> /dev/null
!python -m nltk.downloader stopwords >> /dev/null

2021-10-22 21:15:15.766659: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-22 21:15:15.766723: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# Shared NLP resources, created once and reused by the preprocessing helpers.
_LANGUAGE = "english"

# spaCy pipeline loaded with the parser and NER components disabled.
NLP = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NLTK stop-word list and Snowball stemmer for the same language.
STOP_WORDS = nltk.corpus.stopwords.words(_LANGUAGE)
SNOWBALL_STEMMER = nltk.stem.SnowballStemmer(_LANGUAGE)

# Characters deleted by the punctuation-removal step.
PUNCT_TO_REMOVE = string.punctuation

In [5]:
def get_preprocessing_function(
    use_phone_number: bool = True,
    use_emoji: bool = True,
    use_lower: bool = True,
    use_email: bool = True,
    use_normalize: bool = True,
    use_url: bool = True,
    use_rand_digits: bool = True,
    use_punctuation: bool = True,
    use_stopwords: bool = True,
    use_lemmatizer: bool = True,
    use_stemming: bool = False
):
    """Build a text-cleaning function from a set of on/off switches.

    Each ``use_*`` flag enables one cleaning step. The enabled steps always
    run in this fixed order:

    1. emoji removal
    2. phone-number removal
    3. lower-casing
    4. e-mail address removal
    5. Unicode normalisation (NFKD, then non-ASCII characters are dropped)
    6. URL removal
    7. digit removal
    8. punctuation removal (``PUNCT_TO_REMOVE``)
    9. stop-word removal (``STOP_WORDS``)
    10. lemmatisation (``NLP``)
    11. stemming (``SNOWBALL_STEMMER``)

    Disabled steps are skipped entirely: they do no work and do not touch the
    module-level resource they would otherwise use (for example, the spaCy
    pipeline is never invoked when ``use_lemmatizer`` is False).

    ``STOP_WORDS`` and ``PUNCT_TO_REMOVE`` are read once, when this factory is
    called; later changes to those globals do not affect a function that has
    already been built.

    NOTE(review): stop-word filtering runs after punctuation stripping, so any
    stop-list entry that itself contains punctuation (e.g. an apostrophe) can
    no longer match a token -- confirm this is intended.

    Args:
        use_phone_number: Remove phone-number-like digit groups.
        use_emoji: Remove emoji and pictographic symbols.
        use_lower: Lower-case the text.
        use_email: Remove e-mail addresses.
        use_normalize: Apply NFKD normalisation and drop non-ASCII characters.
        use_url: Remove http/https URLs.
        use_rand_digits: Remove every remaining digit character.
        use_punctuation: Remove punctuation characters.
        use_stopwords: Remove space-separated tokens found in the stop list.
        use_lemmatizer: Replace each spaCy token by its lemma.
        use_stemming: Stem each space-separated token.

    Returns:
        A function ``preprocess(text: str) -> str`` applying the enabled steps.
    """
    # Patterns are compiled once here instead of on every call of the
    # returned function.
    phone_number_pattern = re.compile(r'\+?([0-9]{1,2})?\s?-?\s?\(?\d{3}\)?\s?-?\s?\d{3,4}\s?-?\s?\d{4}')

    # Only explicit emoji/pictograph blocks are listed. Open-ended ranges such
    # as U+24C2..U+1F251 would also swallow CJK text, ligatures, full-width and
    # mathematical letters -- characters the normalisation step (which runs
    # later) is able to fold to plain ASCII.
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes clock faces)
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
        "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
        "\U0000231A-\U0000231B"  # watch, hourglass
        "\U000023E9-\U000023F3"  # media controls, alarm clock, timers
        "\U000024C2"             # circled M
        "]+",
        flags=re.UNICODE,
    )

    email_address_pattern = re.compile(r"[\w.-]+@[\w.]+\.[a-zA-Z]{2,4}")

    # Host is one or more dot-terminated labels followed by an alphabetic TLD,
    # so hosts such as "github.com" match as well as "www.github.com".
    url_pattern = re.compile(
        r'https?://(?:[\w-]+\.)+[A-Za-z]{2,}[-a-zA-Z0-9@:%_+.~#?&/=]*'
    )

    # Module-level resources are only read for steps that are switched on.
    punctuation_table = str.maketrans('', '', PUNCT_TO_REMOVE) if use_punctuation else None
    # A frozenset gives constant-time membership tests per token.
    stop_words = frozenset(STOP_WORDS) if use_stopwords else frozenset()

    def remove_phone_number(text: str) -> str:
        return phone_number_pattern.sub("", text)

    def remove_emoji(text: str) -> str:
        return emoji_pattern.sub("", text)

    def remove_email_address(text: str) -> str:
        return email_address_pattern.sub("", text)

    def remove_urls(text: str) -> str:
        return url_pattern.sub("", text)

    def remove_punctuation(text: str) -> str:
        return text.translate(punctuation_table)

    def all_lower_case(text: str) -> str:
        return text.lower()

    def remove_random_digits(text: str) -> str:
        return ''.join(char for char in text if not char.isdigit())

    def normalize_text(text: str) -> str:
        # NFKD splits compatibility characters (accents, ligatures, ...) into
        # base characters; whatever is still non-ASCII afterwards is dropped.
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def clean_stopwords(text: str) -> str:
        # Split on single spaces only, so the original spacing is preserved.
        return " ".join(token for token in text.split(" ") if token not in stop_words)

    def apply_spacy_lemm(text: str) -> str:
        return " ".join(word.lemma_ for word in NLP(text))

    def apply_snowball_stemmer(text: str) -> str:
        return ' '.join(SNOWBALL_STEMMER.stem(word) for word in text.split(" "))

    # (flag, step) pairs in execution order; the order is part of the contract.
    ordered_steps = [
        (use_emoji, remove_emoji),
        (use_phone_number, remove_phone_number),
        (use_lower, all_lower_case),
        (use_email, remove_email_address),
        (use_normalize, normalize_text),
        (use_url, remove_urls),
        (use_rand_digits, remove_random_digits),
        (use_punctuation, remove_punctuation),
        (use_stopwords, clean_stopwords),
        (use_lemmatizer, apply_spacy_lemm),
        (use_stemming, apply_snowball_stemmer),
    ]
    enabled_steps = [step for enabled, step in ordered_steps if enabled]

    def preprocess(text: str) -> str:
        """Run every enabled cleaning step over ``text`` and return the result."""
        for step in enabled_steps:
            text = step(text)
        return text

    return preprocess

In [6]:
# One switch per cleaning step; every step except stemming is enabled.
PREPROCESS_FLAGS = {
    'use_phone_number': True,
    'use_emoji': True,
    'use_lower': True,
    'use_email': True,
    'use_normalize': True,
    'use_url': True,
    'use_rand_digits': True,
    'use_punctuation': True,
    'use_stopwords': True,
    'use_lemmatizer': True,
    'use_stemming': False,
}

preprocess = get_preprocessing_function(**PREPROCESS_FLAGS)

In [7]:
# Fill missing descriptions with "." so every row is a string, then run the
# cleaning function over each value. The column is overwritten in place.
df['description'] = (
    df['description']
    .fillna(".")
    .astype(str)
    .apply(preprocess)
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0f8c70f2-5291-4247-be8a-0c0d763ed56d' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>