## Manning Live Project - Building Domain-Specific Language Models
### Exercise 1

#### Objective
The raw text is noisy and we want to remove nonwords and non-ASCII characters, keep punctuation to a minimum, and reduce the overall vocabulary of the corpus.

In [2]:
import re
import pandas as pd
from nltk.tokenize import word_tokenize

In [31]:
# Regex patterns for the noisy fragments that get stripped from the text.
# Raw string literals are used so that backslash sequences such as "\$" and
# "\S" reach the regex engine as written instead of relying on Python passing
# unknown string escapes through (which triggers a warning on recent versions).
regex_library = {
    # Shortest run of characters enclosed by a pair of dollar signs.
    'latex_eq': r'\$.*?\$',
    # Shortest run of characters enclosed by angle brackets.
    'html_tags': r'<.*?>',
    'per_sign': r'%',
    # "http" followed by any run of non-whitespace characters.
    'http_link': r'http\S+',
    # Terms joined around a "~", with optional "+"-terminated terms after it,
    # e.g. "y~x1+x2".
    'math_fomula': r'[a-z]*[0-9]* *\~([a-z]*[0-9]* *\+)* *[a-z]*[0-9]*',
    # name=value snippets with optional quotes after the "=".
    # NOTE(review): the "|" inside the character class is matched literally,
    # not as alternation - confirm whether that is intended.
    'code_params': r'''[a-z_]*=['|"]*[0-9a-z_]*''',
    # A literal newline character (kept as a normal string so the value is unchanged).
    'new_lines': '\n',
}

def clean_text(text: str, clean_patt: dict, to_lower: bool = True) -> str:
    """Clean text data for NLP using regular expressions.

    Every match of any pattern in ``clean_patt`` is replaced by a single
    space, runs of whitespace are collapsed to one space, and leading and
    trailing whitespace is removed.

    :param text: Text data.
    :param clean_patt: Mapping whose values are regex patterns to replace by
        a space (the keys are not used). Patterns are applied without regex
        flags, so lowercase-only patterns may miss when ``to_lower`` is False.
    :param to_lower: Indicates whether the text is lowercased before the
        patterns are applied.
    :returns: The cleaned text. Text whose stripped length is 2 or less is
        returned unchanged.
    """
    # Very short entries are passed through untouched.
    if len(text.strip()) <= 2:
        return text

    cleaned = text.strip()
    if to_lower:
        cleaned = cleaned.lower()

    patterns = list(clean_patt.values())
    # Skip the substitution when there are no patterns: an empty regex matches
    # between every character and would scatter spaces through the text.
    if patterns:
        # Replace with a space (not '') so the words on either side of a
        # removed fragment or newline are not glued together.
        cleaned = re.sub('|'.join(patterns), ' ', cleaned)

    # Collapse whitespace runs and trim the spaces the substitution may have
    # left at either end.
    return re.sub(r'\s+', ' ', cleaned).strip()

### Load the Data

In [32]:
# Read the stackexchange_812k CSV into a DataFrame. low_memory=True is
# spelled out explicitly; it is also the pandas default.
sof_data = pd.read_csv("../data/stackexchange_812k.csv", low_memory=True)

In [33]:
# Peek at the first row of the freshly loaded frame.
sof_data.head(n=1)

Unnamed: 0,post_id,parent_id,comment_id,text,category
0,1,,,Eliciting priors from experts,title


In [34]:
# Add a `clean_text` column by running every entry of the `text` column
# through clean_text() with the pattern library, lowercasing enabled.
sof_data['clean_text'] = sof_data['text'].apply(
    clean_text, clean_patt=regex_library, to_lower=True
)

In [35]:
# Show the first row again, now including the `clean_text` column.
sof_data.head(n=1)

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text
0,1,,,Eliciting priors from experts,title,eliciting priors from experts


In [42]:
# Add a `tokens` column by tokenizing each cleaned entry with NLTK's
# word_tokenize (English, preserve_line disabled).
sof_data['tokens'] = sof_data['clean_text'].apply(
    word_tokenize, language='english', preserve_line=False
)

In [43]:
# Show the first row with the new `tokens` column.
sof_data.head(n=1)

Unnamed: 0,post_id,parent_id,comment_id,text,category,clean_text,tokens
0,1,,,Eliciting priors from experts,title,eliciting priors from experts,"[eliciting, priors, from, experts]"


In [None]:
# Write the first ten rows to disk as a small sample of the results.
# (The path string was missing its opening quote, which made this cell a SyntaxError.)
sof_data.head(10).to_csv('../data/sample_res.csv')