In [1]:
import nltk

In [2]:
import re

In [3]:
from bs4 import BeautifulSoup

In [53]:
from contractions import CONTRACTION_MAP

In [64]:
import unicodedata

In [72]:
import spacy

In [73]:
# Shared spaCy pipeline used by lemmatize_text().
# NOTE(review): the 'en' shortcut name is presumably only resolvable on
# spaCy 2.x installs; newer releases expect the full package name
# (e.g. 'en_core_web_sm') -- confirm against the installed version.
nlp = spacy.load('en')

In [87]:
from nltk.tokenize.toktok import ToktokTokenizer

In [88]:
# Shared tokenizer instance used by remove_stopwords().
tokenizer = ToktokTokenizer()

In [89]:
# NLTK's English stop-word list; remove_stopwords() filters tokens against it.
stopword_list = nltk.corpus.stopwords.words('english')

In [50]:
def strip_html_tags(text):
    """Return the visible text of an HTML fragment.

    ``<iframe>`` and ``<script>`` elements are removed together with their
    contents before the text is extracted, and every run of line breaks
    (``\\r`` and/or ``\\n``) is collapsed into a single ``\\n``.

    Args:
        text: HTML (or plain text) as a string.

    Returns:
        The extracted text as a string.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach elements whose contents are not prose; a plain loop is used
    # because extract() is called purely for its side effect.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Inside a character class '|' is a literal, not alternation, so the
    # class must list only the line-break characters; otherwise pipes in the
    # text would be turned into newlines as well.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

In [66]:
def remove_accented_chars(text):
    """Reduce ``text`` to ASCII by decomposing characters and dropping the rest.

    The string is NFKD-normalized so accented letters split into a base
    letter plus combining marks; encoding to ASCII with ``'ignore'`` then
    discards every code point that has no ASCII representation.

    Args:
        text: Input string.

    Returns:
        The ASCII-only string.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [54]:
def expand_contractions(text, contraction_mapping=None):
    """Expand contractions in ``text`` and strip any remaining apostrophes.

    Matching is case-insensitive. The first character of each match is kept
    as written (so ``"Can't"`` keeps its capital) and the rest is taken from
    the expansion.

    Args:
        text: Input string.
        contraction_mapping: Dict mapping a contraction to its expansion.
            When omitted (or ``None``) the module-level ``CONTRACTION_MAP``
            is used; it is looked up at call time so re-running the cell
            that defines it takes effect.

    Returns:
        The text with contractions expanded and every ``'`` removed.
    """
    if contraction_mapping is None:
        contraction_mapping = CONTRACTION_MAP

    # Longest keys first: regex alternation takes the first alternative that
    # matches, so "can't" must not be tried before "can't've". Keys are
    # escaped so they are matched literally; empty keys are skipped because
    # they would match the empty string at every position.
    keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
    if not keys:
        return re.sub("'", "", text)

    # Case-folded view used when the exact spelling of a match is not a key.
    lowered_mapping = {key.lower(): value for key, value in contraction_mapping.items()}
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in keys)),
        flags=re.IGNORECASE | re.DOTALL,
    )

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match)
        if expanded_contraction is None:
            expanded_contraction = lowered_mapping.get(match.lower())
        if expanded_contraction is None:
            # No usable expansion: leave the matched text untouched.
            return match
        # Preserve the original casing of the first character.
        return match[0] + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop apostrophes that were not part of a known contraction
    # (e.g. possessives).
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [55]:
def lemmatize_text(text):
    """Replace each token of ``text`` with its lemma, joined by single spaces.

    The text is processed with the module-level ``nlp`` pipeline. Tokens
    whose lemma is the placeholder ``'-PRON-'`` keep their original text.

    Args:
        text: Input string.

    Returns:
        Space-joined lemmas as a string.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

In [56]:
def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When true, digits are deleted as well.

    Returns:
        The filtered string.
    """
    if remove_digits:
        pattern = r'[^a-zA-Z\s]'
    else:
        pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [57]:
def remove_stopwords(text, is_lower_case=False):
    """Drop stop words from ``text`` and re-join the remaining tokens.

    The text is split with the module-level ``tokenizer`` and each token is
    checked against the module-level ``stopword_list``.

    Args:
        text: Input string.
        is_lower_case: When true, tokens are compared as-is; otherwise each
            token is lower-cased for the comparison only (kept tokens retain
            their original casing).

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

In [107]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True, accented_char_removal=True, text_lower_case=True, text_lemmatization=True, special_char_removal=True, stopword_removal=True, remove_digits=True):
    """Run the text-normalization pipeline over every document in ``corpus``.

    Steps, each controlled by its flag, are applied in this order: HTML
    stripping, accent removal, contraction expansion, lower-casing,
    line-break flattening (always), lemmatization, special-character
    removal, space collapsing (always), stop-word removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Apply ``strip_html_tags``.
        contraction_expansion: Apply ``expand_contractions``.
        accented_char_removal: Apply ``remove_accented_chars``.
        text_lower_case: Lower-case the document.
        text_lemmatization: Apply ``lemmatize_text``.
        special_char_removal: Pad ``{ } ( ) . -`` with spaces, then apply
            ``remove_special_characters``.
        stopword_removal: Apply ``remove_stopwords``.
        remove_digits: Forwarded to ``remove_special_characters``.

    Returns:
        A list with one normalized string per input document.
    """
    # Compiled once: both patterns are loop-invariant.
    # '|' is a literal inside a character class, so only the line-break
    # characters are listed; otherwise pipes would be replaced too.
    line_break_pattern = re.compile(r'[\r\n]+')
    # The hyphen is escaped so it is matched literally; unescaped, '(-)'
    # is the range '(' .. ')' and hyphens would never be padded, fusing
    # hyphenated words once the hyphen is removed.
    special_char_pattern = re.compile(r'([{.(\-)}])')

    normalized_corpus = []
    for doc in corpus:
        if html_stripping:
            doc = strip_html_tags(doc)
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        if contraction_expansion:
            doc = expand_contractions(doc)
        if text_lower_case:
            doc = doc.lower()
        # Flatten the document onto a single line.
        doc = line_break_pattern.sub(' ', doc)
        if text_lemmatization:
            doc = lemmatize_text(doc)
        if special_char_removal:
            # Surround the listed punctuation with spaces first so that
            # deleting it does not glue neighbouring words together.
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # Collapse the runs of spaces introduced by the steps above.
        doc = re.sub(' +', ' ', doc)
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)
    return normalized_corpus


In [108]:
# Adjacent string literals are concatenated verbatim, so every fragment
# carries its own trailing space; without it words fuse across fragment
# boundaries ("performanceof", "'Summit',beating").
# NOTE: the saved output of the final cell was recorded before this spacing
# fix and still shows "performanceof"; re-run the notebook to refresh it.
sample_text = (
    "US unveils world's most powerful supercomputer, bears China. "
    "The US has unveiled the world's most powerful supercomputer called 'Summit', "
    "beating the previous record-holder China's Sunway TaihuLight. "
    "With a peak performance "
    "of 200,000 trillion calculations per second, it is over twice as fast as Sunway Taihulight, "
    "which is capable of 93000 trillion calculations per second. Summit has 4608 servers, "
    "which reportedly take up the size of two tennis courts."
)

In [110]:
# Run the pipeline with all default options on a one-document corpus; as the
# last expression of the cell, the returned list is shown as the cell output.
normalize_corpus([sample_text])

['us unveil world powerful supercomputer bear china us unveil world powerful supercomputer call summit beat previous record holder chinas sunway taihulight peak performanceof trillion calculation per second twice fast sunway taihulight capable trillion calculation per second summit server reportedly take size two tennis court']