# HANDY REGULAR EXPRESSIONS

In [359]:
import re

## a) two-word text separated by an underscore

In [244]:
text = 'sea_me tea_see dee_pee dodd_sssl'

#### i) getting first word

In [172]:
# Capture the word that sits immediately before each underscore.
#   \w     - one "word" character: a letter, a digit or an underscore
#   +      - quantifier: one or more of the preceding item
#   (...)  - capturing group; findall returns only what the group captured
#   (?:_)  - non-capturing group: the underscore must be present but is not returned
# The pattern is a raw string so the backslash in \w is passed to the regex
# engine untouched instead of being treated as a (invalid) string escape.
re.findall(r'(\w+)(?:_)', text)

['sea', 'tea', 'dee', 'dodd']

#### ii) getting second word

In [169]:
# Capture the word that follows each underscore.
#   (?:\w+_)  - non-capturing group: consumes the leading word and its underscore
#               without returning them
#   (\w+)     - capturing group: the word after the underscore, returned by findall
#   \w        - one "word" character: a letter, a digit or an underscore
#   +         - quantifier: one or more of the preceding item
# The pattern is a raw string so the backslash in \w is passed to the regex
# engine untouched instead of being treated as a (invalid) string escape.
re.findall(r'(?:\w+_)(\w+)', text)

['me', 'see', 'pee', 'sssl']

#### iii) removing underscore

In [267]:
# Two capturing groups make findall return (first, second) tuples, which drops the
# underscore matched by (?:_); the trailing (?: ?) optionally consumes the space
# between pairs. Raw string so \w is a regex escape, not a string escape.
re.findall(r'(\w+)(?:_)(\w+)(?: ?)', text)

[('sea', 'me'), ('tea', 'see'), ('dee', 'pee'), ('dodd', 'sssl')]

## b) three-word text separated by underscores

In [314]:
text3 = 'load_vo_edd view_exx_trx slip_dsma_slla traz_ams_scs'

#### i) getting first word

In [334]:
# Only the first \w+ is a capturing group; the two underscores and the remaining
# words are matched by non-capturing (?:...) groups, so findall returns just the
# first word of each triple. Raw string so \w is a regex escape, not a string escape.
re.findall(r'(?: ?)(\w+)(?:_)(?:\w+)(?:_)(?:\w+)', text3)

['load', 'view', 'slip', 'traz']

#### ii) getting second word

In [329]:
# Capture the word that has an underscore on both sides; the underscores are in
# non-capturing groups and are not returned. Raw string so \w is a regex escape,
# not a string escape.
re.findall(r'(?:_)(\w+)(?:_)', text3)

['vo', 'exx', 'dsma', 'ams']

#### iii) getting last word

In [332]:
# The first two words and both underscores are matched by non-capturing groups;
# only the final \w+ is captured, so findall returns the last word of each triple.
# Raw string so \w is a regex escape, not a string escape.
re.findall(r'(?: ?)(?:\w+)(?:_)(?:\w+)(?:_)(\w+)', text3)

['edd', 'trx', 'slla', 'scs']

## c) text with strings and numbers

In [363]:
text4 = 'xxaa_121 zaas_qqa aza12_211 as811_asff'

#### i) getting numbers

In [364]:
# \d+ matches every run of one or more digits, including digits embedded in a
# longer word. Raw string so \d is a regex escape, not a string escape.
re.findall(r'\d+', text4)

['121', '12', '211', '811']

#### ii) removing underscores

In [365]:
# Two capturing groups around a non-capturing underscore: findall returns
# (before, after) tuples with the underscore removed. Raw string so \w is a
# regex escape, not a string escape.
re.findall(r'(\w+)(?:_)(\w+)', text4)

[('xxaa', '121'), ('zaas', 'qqa'), ('aza12', '211'), ('as811', 'asff')]

## d) variable number of words separated by underscores

In [419]:
text5 = 'aa_bb, aaa_ccc_ssdd, asd_qqsa, lkdd_trw_asdf_asdd'

#### i) getting first word

In [420]:
split_text5 = text5.split()

In [421]:
split_text5

['aa_bb,', 'aaa_ccc_ssdd,', 'asd_qqsa,', 'lkdd_trw_asdf_asdd']

In [422]:
first_Words = []

In [423]:
# [^_]* matches the (possibly empty) run of characters before the first
# underscore, so the match at the start of each token always succeeds and
# group(0) is that token's leading word.
# The list is built in a single expression so the cell is idempotent: running it
# again replaces first_Words instead of appending a second copy of every word.
first_Words = [re.match(r'[^_]*', token).group(0) for token in split_text5]

print(first_Words)

['aa', 'aaa', 'asd', 'lkdd']


In [426]:
# [^_]* can match the empty string, so besides the two words findall also
# reports a zero-length match at the underscore and at the end of the input.
re.findall(r'[^_]*', 'aa_bb')

['aa', '', 'bb', '']

In [428]:
all_Words = []
clean_list = []

In [429]:
# Start from an empty list so re-running this cell does not accumulate duplicates.
all_Words = []
for token in split_text5:
    # [^_]* also produces zero-length matches, so all_Words contains empty strings.
    all_Words.extend(re.findall(r'[^_]*', token))

# Keep only the non-empty matches; a truthiness check states that intent directly.
clean_list = [word for word in all_Words if word]


In [430]:
all_Words = clean_list

In [431]:
print(all_Words)

['aa', 'bb,', 'aaa', 'ccc', 'ssdd,', 'asd', 'qqsa,', 'lkdd', 'trw', 'asdf', 'asdd']


# TEXT PREPROCESSING

In [442]:
# Raw sample text used by the preprocessing cells below: HTML tags, bracketed
# notes, accented names, contractions, numbers and stray punctuation.
# NOTE(review): the first line starts with 'h1>' (no opening '<'), so it is not a
# complete tag and 'h1>' survives into the outputs shown below. Confirm whether
# this is intentional.
sample_metadata = '''h1>Title Goes Here</h1>
<b>Bolded Text</b>
<i>Italicized Text</i>
<img src="this should all be gone"/>
<a href="this will be gone, too">But this will still be here!</a>
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?
[Some text we don't want to keep is in here]
¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.
[This is some other unwanted text]
John: "Well, well, well."
James: "There, there. There, there."
&nbsp;&nbsp;
There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.
22    45   1067   445
{{Here is some stuff inside of double curly braces.}}
{Here is more stuff in single curly braces.}
[DELETE]
</body>
</html>'''

In [443]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

### a) noise removal

In [445]:
def strip_html(text):
    """Parse text with BeautifulSoup's "html.parser" and return its text content."""
    parsed = BeautifulSoup(text, "html.parser")
    plain = parsed.get_text()
    return plain

def remove_between_square_brackets(text):
    """Return text with every square-bracketed span removed, brackets included.

    A span runs from a '[' to the nearest following ']'. Text outside the
    brackets is left untouched, including any whitespace around the removed span.
    """
    # Raw string: the escaped brackets are regex escapes, not string escapes, so a
    # plain literal would trigger Python's invalid-escape-sequence warning.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Strip HTML from text, then remove any square-bracketed spans from the result."""
    return remove_between_square_brackets(strip_html(text))

# Run the noise-removal step on the raw sample and show the cleaned text.
sample = denoise_text(sample_metadata)
print(sample)

h1>Title Goes Here
Bolded Text
Italicized Text

But this will still be here!
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?

¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.

John: "Well, well, well."
James: "There, there. There, there."
  
There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.
22    45   1067   445
{{Here is some stuff inside of double curly braces.}}
{Here i

In [446]:
def replace_contractions(text):
    """Return text with its contractions rewritten by contractions.fix"""
    expanded = contractions.fix(text)
    return expanded

# Rewrite contractions in the denoised sample; note this rebinds `sample`, so the
# cell operates on whatever the previous step left in that name.
sample = replace_contractions(sample)
print(sample)

h1>Title Goes Here
Bolded Text
Italicized Text

But this will still be here!
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?

¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I cannot do this anymore. I did not know them. Why could not you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
do not do it.... Just do not. Billy! I know what you are doing. This is a great little house you have got here.

John: "Well, well, well."
James: "There, there. There, there."
  
There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.
22    45   1067   445
{{Here is some stuff inside of double curly braces.}}

### b) tokenization 

In [447]:
# Split the cleaned sample into a flat list of tokens (words and punctuation marks).
words = nltk.word_tokenize(sample)
print(words)

['h1', '>', 'Title', 'Goes', 'Here', 'Bolded', 'Text', 'Italicized', 'Text', 'But', 'this', 'will', 'still', 'be', 'here', '!', 'I', 'run', '.', 'He', 'ran', '.', 'She', 'is', 'running', '.', 'Will', 'they', 'stop', 'running', '?', 'I', 'talked', '.', 'She', 'was', 'talking', '.', 'They', 'talked', 'to', 'them', 'about', 'running', '.', 'Who', 'ran', 'to', 'the', 'talking', 'runner', '?', '¡Sebastián', ',', 'Nicolás', ',', 'Alejandro', 'and', 'Jéronimo', 'are', 'going', 'to', 'the', 'store', 'tomorrow', 'morning', '!', 'something', '...', 'is', '!', 'wrong', '(', ')', 'with.', ',', ';', 'this', ':', ':', 'sentence', '.', 'I', 'can', 'not', 'do', 'this', 'anymore', '.', 'I', 'did', 'not', 'know', 'them', '.', 'Why', 'could', 'not', 'you', 'have', 'dinner', 'at', 'the', 'restaurant', '?', 'My', 'favorite', 'movie', 'franchises', ',', 'in', 'order', ':', 'Indiana', 'Jones', ';', 'Marvel', 'Cinematic', 'Universe', ';', 'Star', 'Wars', ';', 'Back', 'to', 'the', 'Future', ';', 'Harry', 'Pott

### c) normalization

In [448]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every word in a list of tokenized words"""
    def _to_ascii(token):
        # NFKD normalization first, then an ASCII encode that ignores whatever
        # cannot be represented, then back to str.
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_to_ascii(token) for token in words]

def to_lowercase(words):
    """Return a new list with every tokenized word converted to lowercase"""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each tokenized word and drop words that end up empty"""
    # Anything that is neither a word character nor whitespace is deleted.
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Replace every all-digit token in a list of tokenized words with its textual representation"""
    engine = inflect.engine()
    # Tokens that are not purely digits pass through unchanged.
    return [engine.number_to_words(token) if token.isdigit() else token for token in words]

def remove_stopwords(words):
    """Remove stop words from list of tokenized words

    Returns a new list containing, in their original order, the words that are
    not in NLTK's English stop-word list. The comparison is exact, so callers
    should lowercase the words first if case-insensitive matching is wanted.
    """
    # Fetch the stop-word list once rather than once per token, and keep it in a
    # set so each membership test is a hash lookup instead of a list scan.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every word in a list of tokenized words"""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every word in a list of tokenized words, treating each one as a verb"""
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the normalization steps over a list of tokenized words and return the result.

    Steps are applied in this order: non-ASCII removal, lowercasing, punctuation
    removal, number replacement, stop-word removal.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

# Normalize the token list; note this rebinds `words`, so running the cell again
# normalizes the already-normalized tokens rather than the raw ones.
words = normalize(words)
print(words)

['h1', 'title', 'goes', 'bolded', 'text', 'italicized', 'text', 'still', 'run', 'ran', 'running', 'stop', 'running', 'talked', 'talking', 'talked', 'running', 'ran', 'talking', 'runner', 'sebastian', 'nicolas', 'alejandro', 'jeronimo', 'going', 'store', 'tomorrow', 'morning', 'something', 'wrong', 'sentence', 'anymore', 'know', 'could', 'dinner', 'restaurant', 'favorite', 'movie', 'franchises', 'order', 'indiana', 'jones', 'marvel', 'cinematic', 'universe', 'star', 'wars', 'back', 'future', 'harry', 'potter', 'billy', 'know', 'great', 'little', 'house', 'got', 'john', 'well', 'well', 'well', 'james', 'lot', 'reasons', 'one hundred and one', 'reasons', 'one million', 'reasons', 'actually', 'go', 'get', 'two', 'tutus', 'two', 'different', 'stores', 'twenty-two', 'forty-five', 'one thousand and sixty-seven', 'four hundred and forty-five', 'stuff', 'inside', 'double', 'curly', 'braces', 'stuff', 'single', 'curly', 'braces']
