In [618]:
import numpy as np
import string
import re

from functools import reduce
from collections import Counter

import pandas as pd

In [835]:
########################## --- doc/ string method: cleaning --- ##########################

# def sub_on_regex(string_text, regex):
#     """
#     Read and compile a regex, 
#     substitute/ replace the pattern per nothing.
    
#     Support function to implement a list of regex.
    
#     Args:
#         regex: regex pattern
#         string_text: text type string
#     Returns:
#         new string without the string pattern and striped
#     """
#     regex = re.compile(regex)
#     return regex.sub('', string_text.lower()).strip()


# def sub_regex_patterns(string_text, list_regex=list_re):
#     """ 
#     Iterate through a list of regex patterns over a string
#     Args:
#         list_regex: list of regex patterns
#         string_text: text type string
#     Returns:
#         new string with the replaced patterns
#     """    
#     for regex in list_regex:
#         # apply the sub_on_regex function
#         string_text = sub_on_regex(string_text, regex)
        
#     return string_text

# def remove_stopwords(string_text, stopwords=stopwords):
#     """Remove a list of words from a string"""
#     return ' '.join([w for w in string_text.split() if w not in stopwords])


def clean_string(string_text):
    """Apply  the regex cleaning methods and remove stopwords from a string"""
    string_clean = sub_regex_patterns(string_text)
    return remove_stopwords(string_clean)

########################## --- list of strings cleaning methods --- ##########################

def clean_list_of_string(list_of_strings):
    """Clean each string from a list of strings"""
    l_clean = []
    
    # iterate through a list of strings
    for s in list_of_strings:
        # clean each string
        string_cleared = clean_string(s)
        # append each cleaned string into a list
        l_clean.append(string_cleared)
        
    # return a list of cleaned strings
    return l_clean


########################## --- string method: tokenize --- ##########################

# def tokenize(string_text):
#     """
#     Return a list of words splitted by space
#     Args:
#         string_text: text type string
#     """
#     return string_text.split()


########################## --- vocab methods: list of strings --- ##########################

# def count_freq(string_text):
#     """
#     Counts the frequency of elements in a list.
#     Not so useful. Should use Counter from collections instead
#     """
    
#     # tokenize a string
#     l = tokenize(string_text)
    
#     # returns a dictionary counting the number of words
#     return {i:l.count(i) for i in l}

def count_words_list_of_strings(list_of_strings):
    """
    Returns a dict summing the number of words from each of a list os strings.
    Args:
        list_of_strings: list of docs as a string type
    """
    l = []
    # iterate through a list
    for string in list_of_strings:
        # tokenize each of the strings
        t = tokenize(i)
        # count the number of words
        count = Counter(t)
        # append the dictionaries into a list
        l.append(dict(count))
    
    # return the sum of all dictionaries
    return sum_dicts(l)


def sum_dicts(list_of_dicts):
    """
    Sum the elements from a list of dictionaries. 
    Support function to count_words_list_of_strings.
    Args:
        list_of_dicts: list of dictionaries with the frequency of each word
    """
    return dict(reduce(operator.add, map(Counter, list_of_dicts)))


def exclude_rare_words(list_of_strings, n_count_words=2):
    """
    Exclude rare words.
    Agrs:
        d: dictionary counting words
        n_count_words: minimum number of frequency to consider
    Returns:
        String without words with a minium counts
    """
    d = count_words_list_of_strings(list_of_strings)
    
    return ' '.join([k for k, v in d.items() if v > n_count_words])


def exclude_common_words(list_of_strings, n_top_words=20):
    """
    Returns a string without the n_top_words common words
    Args:
        list_of_strings: list of string
        n_top_words: number of most comon words to be excluded
    """
    # create a dictionary
    d = count_words_list_of_strings(list_of_strings)
    
    # 
    df = pd.DataFrame(data=d.items(), columns=['words', 'count']).sort_values(by='count', ascending=False)[n_top_words:]
    return ' '.join(df.words.to_list())
    


In [837]:
########### --- regex patterns --- ###########

# list all patterns that should be removed

# hyperlinks
hyperlinks_re = r'https?:\/\/.*[\r\n]*'

# emails
email_re = ''

# punctuation
punctionation_re = '[%s]' % re.escape(string.punctuation)

# numbers
bad_symbols_re = r'[^0-9a-z #+_a]'

# spaces
space_re = r'[/(){}\[\]\|@#,;]'

list_re = [hyperlinks_re, space_re, bad_symbols_re, hashtags_re, punctionation_re, regex_re]

########### --- stopwords --- ###########
stopwords = ['no', 'all', 'to', 'us', 'ca']


########### --- string --- ###########
s = 'No results results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb'


In [836]:
l

['No results results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb',
 'No results results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb']

In [831]:
# test with one string
s_clean = sub_regex_patterns(s, list_re)
s_clean = remove_stopwords(s_clean)
s_clean

'results results found view teams prod fundraistrick ave suite san diego back top donor support braistrickstayclassyorg'

In [832]:
# test with a list of string
l = [s, s]
l_clean = list(map(sub_regex_patterns, l))
l_clean

['no results results found view all teams prod fundraistrick   ave suite  san diego ca  us back to top donor support braistrickstayclassyorg',
 'no results results found view all teams prod fundraistrick   ave suite  san diego ca  us back to top donor support braistrickstayclassyorg']

In [833]:
list_dicts = [a, b, a, b]
d = sum_dicts(list_dicts)

In [834]:
exclude_rare_words(d, 4)

'No results found. View all teams. Prod Fundraistrick. 350 10th Ave, Suite 1100. San Diego, CA 92101 US. Back to top. Donor Support braistrick@stayclassy.org. http://localhost:8888/notebooks/nlp/cleaning_sandbox.ipynb'