In [1]:
import os

## Converting non ascii characters to ascii

Sometimes, because of how the data was captured or stored, weird characters are added to the text. This can add noise to the model.

In [1]:
# represent Unicode in ASCII characters
from unidecode import unidecode

In [5]:
# Sample input: accented capital letters outside the ASCII range
text = 'À È Ì Ò Ù Ỳ Ǹ Ẁ'
print(text)
text = unidecode(text) # rebinds `text` to its ASCII transliteration
print('ASCII representation: ', text)

À È Ì Ò Ù Ỳ Ǹ Ẁ
ASCII representation:  A E I O U Y N W


# Correct spelling
Spelling correction here is based on SymSpell ([original source](https://github.com/wolfgarbe/SymSpell)); I am using a simple [Python port](https://github.com/mammothb/symspellpy) of it.

In [10]:
from symspellpy.symspellpy import SymSpell

In [11]:
# maximum edit distance per dictionary precalculation
max_edit_distance_dictionary = 0
prefix_length = 7
# create the SymSpell object from the two settings above
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
# load the frequency dictionary; the path is relative to the working directory
dictionary_path = "frequency_dictionary_en_82_765.txt"
term_index = 0  # column of the term in the dictionary text file
count_index = 1  # column of the term frequency in the dictionary text file
# load_dictionary returns a falsy value when loading fails; the cell only
# reports it and does not stop execution
if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
    print("Dictionary file not found")

In [14]:
input_term = "I'd like toknowhow I'd done that!"  # alternative input: "thequickbrownfoxjumpsoverthelazydog"

# Split the run-together words in input_term using the loaded dictionary
result = sym_spell.word_segmentation(input_term)
# display the corrected (segmented) string, the summed edit distance,
# and the summed log probability
print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                          result.log_prob_sum))

I'd like to know how I'd done that !, 10, -58.50888790936667


# Removing contractions:

I will be using the [pycontractions](https://pypi.org/project/pycontractions/) library. It takes a three-pass approach. 
* First, the simple contractions with only a single rule are replaced. 
* On the second pass, if any contractions with multiple rules are present, every combination of those rules is applied to produce all possible expanded texts.
* Each text is then passed through a grammar checker and the Word Mover’s Distance (WMD) is calculated between it and the original text. The hypotheses are then sorted by least number of grammatical errors and shortest distance from the original text and the top hypothesis is returned as the expanded form.

In [1]:
from pycontractions import Contractions

In [7]:
# Option 1: load your favorite semantic vector model in gensim keyedvectors format from disk
# cont = Contractions('GoogleNews-vectors-negative300.bin')
# Option 2 (used here): specify any model from the gensim.downloader api
cont = Contractions(api_key='glove-wiki-gigaword-50')
# optional, prevents loading on first expand_texts call
cont.load_models()



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Expand contractions with the default settings (no `precise` flag);
# list() materialises the result so the notebook displays it.
list(cont.expand_texts(["I'd like to know how I'd done that!",
                            "We're going to the zoo and I don't think I'll be home for dinner.",
                            "Theyre going to the zoo and she'll be home for dinner."]))

['I had like to know how I had done that!',
 'we are going to the zoo and I do not think I will be home for dinner.',
 'they are going to the zoo and she will be home for dinner.']

In [9]:
# Same three sentences, this time with precise=True; list() materialises
# the result so the notebook displays it.
list(cont.expand_texts(["I'd like to know how I'd done that!",
                            "We're going to the zoo and I don't think I'll be home for dinner.",
                            "Theyre going to the zoo and she'll be home for dinner."], precise=True))

['I would like to know how I had done that!',
 'we are going to the zoo and I do not think I will be home for dinner.',
 'they are going to the zoo and she will be home for dinner.']

## Regular Expressions to clean text

In [2]:
import re

In [3]:
# All patterns are raw strings so that regex escapes such as \w, \d and \s are
# passed to the regex engine untouched instead of going through Python's own
# string-escape processing (where most of them are invalid escape sequences).

# An e-mail address, optionally wrapped in double quotes; group 1 is the address.
email_pattern = re.compile(r'"?([-a-zA-Z0-9.`?{}]+@\w+\.\w+)"?')
# Group 1: a "$"-prefixed amount (digits/dashes, optional dots, trailing digits);
# group 2: a bare decimal number such as .5 or 3.14.
dollar_and_decimals_pattern = re.compile(r'(\$[-\d]*\.*\d+)|(\d*\.\d+)')
# US phone numbers written as 123-456-7890 or (123)456-7890.
us_phone_pattern = re.compile(r'\d{3}-\d{3}-\d{4}|\(\d{3}\)\d{3}-\d{4}')
# Numeric dates separated by "/" or "-" that have whitespace on at least one
# side (the whitespace is part of the match), or the form DD-XXX-YY[YY] where
# XXX is any three non-digit characters.
date_pattern = re.compile(r'\s+\d{1,4}[/-]\d{1,2}[/-]*\d{0,4}|\s*\d{1,4}[/-]\d{1,2}[/-]*\d{0,4}\s+|\d{2}-\D{3}-\d{2,4}')
# Captures a single punctuation character so it can be re-emitted with padding.
add_space_around_punct_patern = re.compile(r'([\[.,!?():;\]])')
# A run of two or more whitespace characters, or a single tab.
remove_multiple_space_pattern = re.compile(r'\s{2,}|\t')
# Sentence delimiters: one of ! & . ; ?, a run of two or more "*", "-" or "/",
# or a comma-delimited span of at least 25 word/whitespace characters.
split_sentence_pattern = re.compile(r'[!&\.;?]|\*{2,}|\-{2,}|/{2,}|,[\s\w]{25,},')

## Language detection

In [1]:
import fasttext
from fasttext import tokenize
import re
import os

In [2]:
# I am using lid.176.bin, which is faster and slightly more accurate than the
# compressed version that is also available at
# https://fasttext.cc/docs/en/language-identification.html
# The model file is looked up inside the "model" directory, relative to the
# working directory.
fasttext_language_model = fasttext.load_model(os.path.join("model", "lid.176.bin"))



In [3]:
white_space_pattern = re.compile(r"\s")
def preprocess_text_for_language_detection(text: str) -> str:
    """
    Cleans the text as per fasttext requirements.
    The requirements can be found here: https://pypi.org/project/fasttext/
    :text: str: text to clean (any other object is converted with str())
    :returns: str: cleaned, lower-cased text in which every whitespace
              character has been replaced by a plain space
    """
    # fastText assumes UTF-8 encoded text
    text = str(text)

    # fastText is not aware of UTF-8 whitespace
    # Replace all white space with space.
    # Pattern.sub takes (replacement, string) in that order; passing the text
    # as the replacement would leave its whitespace untouched and would parse
    # any backslash in it as a replacement-template escape.
    text = white_space_pattern.sub(" ", text)

    # Tokenize text, per fastext function and rejoin
    tokens = tokenize(text)
    text = " ".join(tokens)

    # Remove any "</s>" (end-of-sentence) marker left in the joined text,
    # as it affects the model accuracy
    text = text.replace("</s>", "")

    return text.lower()

def identify_languages(text: str, no_of_languages: int =1):
    """
    Uses fasttext language detection and some simple cleaning 
    to identify languages in text.
    Returns language code from here -  https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    (the code is whatever follows the "__label__" prefix of the model's label,
    so labels with a three-letter code are returned in full).
    :text: str: text whose language(s) should be identified
    :no_of_languages: int: number of candidate languages to request from the model
    :returns: list of tuples containing language and its probability
    """
    clean_text = preprocess_text_for_language_detection(text)
    # Predict on the cleaned text, not the raw input, so the preprocessing
    # above actually takes effect.
    labels, probabilities = fasttext_language_model.predict(clean_text, no_of_languages)

    # format output: strip the label prefix instead of slicing the last two
    # characters, which would truncate codes longer than two letters
    label_prefix = "__label__"
    result = []
    for label, probability in zip(labels, probabilities):
        language = label[len(label_prefix):] if label.startswith(label_prefix) else label
        result.append((language, probability))
    return result

In [4]:
# Quick check on a short Spanish phrase, using the default no_of_languages=1
identify_languages("¿Cómo estás")

[('es', 1.0000468492507935)]

## Tokenizing

In [1]:
from sacremoses import MosesTokenizer

In [2]:
# Moses tokenizer configured for English
mt = MosesTokenizer(lang='en')
# Sample sentence with non-ASCII symbols written as escapes (\xbb, \u2026, \xbf)
text = u'This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf'
# return_str=True asks for the result as a single string
tokenized_text = mt.tokenize(text, return_str=True)

In [3]:
# Display the tokenized string
tokenized_text

'This , is a sentence with weird » symbols … appearing everywhere ¿'

In [None]:
from sacremoses import M