In [1]:
# ! pip install nltk
# ! pip install spellchecker
# ! pip install autocorrect
# ! pip install presidio_analyzer
# ! pip install presidio_anonymizer
# ! pip install spacy
# ! pip install pyspellchecker
# ! python3 -m spacy download en_core_web_sm
# ! python3 -m spacy download es_core_news_sm
# ! python3 -m spacy download fr_core_news_sm
# ! python3 -m spacy download en_core_web_lg
# ! python3 -m spacy download pt_core_news_sm
# ! pip install tensorflow

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

from nltk import word_tokenize


from spellchecker import SpellChecker
from autocorrect import Speller

from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_analyzer import PatternRecognizer
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngineProvider

import re
import os
import numpy as np  
import pandas as pd  
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm_notebook
from tqdm.auto import tqdm
tqdm.pandas()


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from string import punctuation
# Punctuation marks treated as noise downstream; '?', '!' and '"' are kept
# out of this list so they survive stop-word/punctuation removal.
punctuation = [mark for mark in punctuation if mark not in ('?', '!', '"')]
import spacy
print(" Important: Adjust data reading to your needs")

[nltk_data] Downloading package punkt to /home/nicolas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicolas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


 Important: Adjust data reading to your needs


In [3]:
# One spaCy pipeline per language handled by this notebook. `lemme` selects
# among these objects by language name. Note that English uses the small
# model here, while the Presidio configuration below refers to en_core_web_lg.
nlp_pt = spacy.load("pt_core_news_sm")
nlp_en = spacy.load("en_core_web_sm")
nlp_es = spacy.load("es_core_news_sm")
nlp_fr = spacy.load("fr_core_news_sm")

In [4]:
def get_labels_and_texts(values, n_text, n_label):
    """Build a ``text``/``label`` DataFrame from a sequence of indexable rows.

    Args:
        values: Iterable of rows; each row is indexed with ``n_text`` and
            ``n_label``. It is traversed twice.
        n_text: Index (or key) of the text field inside a row.
        n_label: Index (or key) of the label field inside a row.

    Returns:
        pandas.DataFrame with two columns, ``text`` and ``label``, in row order.
    """
    text_column = [row[n_text] for row in values]
    label_column = [row[n_label] for row in values]
    return pd.DataFrame(data={"text": text_column, "label": label_column})

def read_df(data, index_text, index_label):
    """Reduce ``data`` to a two-column ``text``/``label`` DataFrame.

    Args:
        data: Object exposing ``.values`` (rows are taken from it).
        index_text: Position of the text field within each row.
        index_label: Position of the label field within each row.

    Returns:
        The DataFrame produced by ``get_labels_and_texts``.
    """
    return get_labels_and_texts(data.values, index_text, index_label)

In [5]:
def groupby(df, column):
    """Print per-group row counts for ``column`` and hand back ``df`` untouched.

    Args:
        df: DataFrame to summarise.
        column: Column name to group by.

    Returns:
        The same ``df`` object that was passed in (not a copy).
    """
    counts_per_group = df.groupby(column).count()
    print(str(counts_per_group))
    return df

# Target One Hot Encoding

In [6]:
def one_hot_encoding_func(df):
    """One-hot encode the ``label`` column of ``df``.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        Dense array with one row per input row, as produced by fitting the
        encoder on the labels and transforming those same labels.
    """
    # The encoder expects a 2-D input: one column, one row per sample.
    label_column = np.array(df.label).reshape(-1, 1)
    encoder = OHE().fit(label_column)
    return encoder.transform(label_column).toarray()
#print(one_hot_encoding(df))

# Convert text to Lowercase

In [7]:
def convert_to_lowercase_func(df):
    """Lower-case every entry of ``df["text"]`` in place.

    Args:
        df: DataFrame whose ``text`` column holds objects with a ``lower()``
            method (strings).

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    lowered = df.text.map(lambda value: value.lower())
    df["text"] = lowered
    return df
#print(convert_to_lowercase(df))

# Data Protection and Anonymization (presidio)


In [8]:
# Set up the engine, loads the NLP module (spaCy model by default) 
# and other PII recognizers

def remove_synonyms_email(text):
    """Normalise the spelling variants of "email" to the single token ``email``.

    Handles ``e-mail(s)``, ``email(s)``, ``emailed`` and ``mail(s)``,
    case-insensitively. Matches are anchored on word boundaries so that the
    variants are only rewritten when they stand alone: words that merely
    contain them (``mailbox``, ``gmail``) and identifier-like tokens such as
    ``EMAIL_ADDRESS`` (the entity name requested from Presidio in this file)
    are left untouched.

    Must run after the phone_number mask.

    Args:
        text: Input string.

    Returns:
        The string with every stand-alone variant replaced by ``email``.
    """
    return re.sub(r'\b(?:e-mails?|emails?|emailed|mails?)\b', 'email', text, flags=re.I)
    

def anonymize_link(text):
    """Replace URLs, and the bare domain ``dell.com``, with ``<LINK>``.

    Args:
        text: Input string.

    Returns:
        The string with every detected link replaced by ``<LINK>``.
    """
    # General URL matcher: scheme-prefixed, www-prefixed, or domain followed by '/'.
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    text = re.sub(regex, '<LINK>', text)
    # The bare domain is not caught above (no scheme, no 'www.', no trailing '/').
    # The dot is escaped so that only a literal "dell.com" is masked, not any
    # "dell<any char>com" sequence.
    text = re.sub(r'dell\.com', '<LINK>', text)
    return text
            
def anonymize_date(text):
    """Replace common numeric and "day Month year" dates with ``<DATE>``.

    Recognised shapes:
        * ``10/10/2015``  (d/m/yyyy with slashes)
        * ``10-10-15`` and ``10-10-2015``  (dashes, 2- or 4-digit year)
        * ``1 NOV 2010`` / ``10 March 2015``  (day, capitalised month word, year)

    Args:
        text: Input string.

    Returns:
        The string with every recognised date replaced by ``<DATE>``.
    """
    text = re.sub(r"\d{1,2}/\d{1,2}/\d{4}", '<DATE>', text)
    # Optionally consume the last two digits of a 4-digit year so that
    # "10-10-2015" is masked entirely instead of leaving "<DATE>15" behind.
    text = re.sub(r"\d{1,2}-\d{1,2}-\d{2}(?:\d{2})?", '<DATE>', text)
    # A single pattern covers both abbreviated and full month names: the month
    # is any word starting with an upper-case month initial.
    text = re.sub(r"\d{1,2} [ADFJMNOS]\w* \d{4}", '<DATE>', text)
    return text

def anonymize_phone_number(text):
    """Mask phone-number-shaped digit groups, but only in texts that mention a phone.

    The text is left untouched unless its lower-cased form contains one of the
    keywords ``téléphone``, ``teléfono``, ``telefone`` or ``phone``.

    Args:
        text: Input string.

    Returns:
        The string with matching digit groups replaced by ``PHONE_NUMBER``,
        or the original string when no keyword is present.
    """
    mentions_phone = re.search(r'téléphone|teléfono|telefone|phone', text.lower())
    if mentions_phone is None:
        return text

    # Literal all-zero template first, then the generic shapes:
    # 3-3-4 digits, (3) 3-4 digits, and 3-4 digits.
    masked = re.sub(r'0-000-000-0000', 'PHONE_NUMBER', text)
    masked = re.sub(
        r'\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}',
        'PHONE_NUMBER',
        masked,
    )
    return masked

def anonymize_service_request(text):
    """Mask long digit runs as ``<SR_NUMBER>`` in texts that mention a service request.

    A text qualifies when it contains, case-insensitively, any of ``servidor``,
    ``servicio``, ``serveur``, ``service``, ``request`` or ``sr`` (as a
    substring). In qualifying texts every run of five or more digits is masked.

    Must run after the phone_number mask.

    Args:
        text: Input string.

    Returns:
        The masked string, or the original string when no keyword is present.
    """
    keyword_hit = re.search(r'servidor|servicio|serveur|service|request|sr', text, flags=re.I)
    if keyword_hit is None:
        return text
    return re.sub(r'\d{5,}', '<SR_NUMBER>', text)

def anonymize_number(text):
    """Replace every run of five or more digits with ``<NUMBER>``.

    Must run after the phone_number mask.

    Args:
        text: Input string.

    Returns:
        The string with long digit runs masked.
    """
    long_digit_run = r'\d{5,}'
    return re.sub(long_digit_run, '<NUMBER>', text)


In [9]:
# NLP engine configuration handed to Presidio: one spaCy model per language
# code. English is mapped to the large model (en_core_web_lg).
configuration = {
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "es", "model_name": "es_core_news_sm"},
               {"lang_code": "en", "model_name": "en_core_web_lg"},
               {"lang_code": "pt", "model_name": "pt_core_news_sm"},
               {"lang_code": "fr", "model_name": "fr_core_news_sm"}
              ],
}

# Module-level engines shared by every call to `presidio`:
# `analyzer` detects entities, `anonymizer` rewrites the text from its results.
anonymizer = AnonymizerEngine()
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine_with_multilanguages = provider.create_engine()
# The analyzer is restricted to the four language codes configured above.
analyzer = AnalyzerEngine( nlp_engine=nlp_engine_with_multilanguages, 
    supported_languages=["en", "es", "pt", "fr"])


def presidio(language, text):
    """Anonymize one text: Presidio entity masking followed by the regex masks.

    Args:
        language: Language code passed to the Presidio analyzer.
        text: Input string.

    Returns:
        The anonymized string.
    """
    findings = analyzer.analyze(text=text,
                                entities=["PHONE_NUMBER", "EMAIL_ADDRESS" ],
                                language=language)
    masked = anonymizer.anonymize(text=text, analyzer_results=findings).text

    # Order matters: each step sees the output of the previous one.
    regex_steps = (
        anonymize_link,
        anonymize_service_request,
        remove_synonyms_email,
        anonymize_date,
        anonymize_phone_number,
        anonymize_number,
    )
    for step in regex_steps:
        masked = step(masked)
    return masked

def presidio_func(df, lang):
    """Anonymize every entry of ``df["text"]`` in place, with a progress bar.

    Args:
        df: DataFrame with a ``text`` column.
        lang: Language code forwarded to ``presidio``.

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    def _anonymize(entry):
        return presidio(language=lang, text=entry)

    df["text"] = df.text.progress_map(_anonymize)
    return df

# Word Tokenize

In [10]:
def normalize(text):
    """Join an iterable of strings into one string separated by single spaces."""
    separator = " "
    return separator.join(text)

def word_tokenize_func(df, tokenize):
    """Tokenize ``df["text"]`` and store the tokens re-joined with single spaces.

    Args:
        df: DataFrame with a ``text`` column.
        tokenize: Callable mapping one text to an iterable of string tokens.

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    token_lists = df.text.progress_map(tokenize)
    df["text"] = token_lists
    # Back to plain strings so that later steps can call .split() on them.
    df["text"] = df.text.apply(normalize)
    return df

# SpellCheck

In [11]:
def normalize(text):
    """Return the items of ``text`` concatenated with a single space between them."""
    joined = " ".join(text)
    return joined

def spell_checker_func(df, lang):
    """Spell-correct each whitespace-separated word of ``df["text"]`` in place.

    Args:
        df: DataFrame with a ``text`` column of strings.
        lang: Language code passed to ``Speller``.

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    corrector = Speller(fast=True, lang=lang)

    def _correct_words(sentence):
        return [corrector(word) for word in sentence.split()]

    df["text"] = df.text.progress_map(_correct_words)
    df["text"] = df.text.apply(normalize)
    return df

# Remove StopWords

In [12]:
#Remove Stop Words
def remove_stop(strings, stop_list):
    """Split ``strings`` on whitespace and drop the tokens found in ``stop_list``.

    Args:
        strings: Input text.
        stop_list: Container of tokens to discard (exact, case-sensitive match).

    Returns:
        List of the remaining tokens, in their original order.
    """
    kept = []
    for token in strings.split():
        if token in stop_list:
            continue
        kept.append(token)
    return kept
def normalize(text):
    """Glue the tokens in ``text`` back into one space-separated string."""
    space = " "
    return space.join(text)

def remove_stopwords_func(df,language):
    """Drop stop words and stand-alone punctuation tokens from ``df["text"]`` in place.

    Args:
        df: DataFrame with a ``text`` column of strings.
        language: Language name passed to ``stopwords.words``.

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    # A frozenset gives constant-time membership tests; the previous list was
    # scanned linearly for every token of every row. The filtered result is
    # the same because only membership is ever checked.
    stop_punc = frozenset(punctuation).union(stopwords.words(language))
    df["text"] = df.text.progress_map(lambda x: remove_stop(x, stop_punc))
    df["text"] = df.text.apply(normalize)
    return df

#remove_stopwords(df)
#print(df)

# Lemmatization

In [13]:
def lemme(text, lang):
    """Lemmatize ``text`` with the spaCy pipeline for ``lang``.

    Args:
        text: Input string.
        lang: One of ``"english"``, ``"spanish"``, ``"portuguese"``, ``"french"``.

    Returns:
        The lemmas of all tokens, joined with single spaces.

    Raises:
        ValueError: If ``lang`` is not one of the supported language names.
    """
    if lang == "english":
        doc = nlp_en(text)
    elif lang == "spanish":
        doc = nlp_es(text)
    elif lang == "portuguese":
        doc = nlp_pt(text)
    elif lang == "french":
        doc = nlp_fr(text)
    else:
        # Without this branch `doc` would be unbound and the failure would
        # surface as an UnboundLocalError on the line below.
        raise ValueError(
            "Unsupported language {!r}; expected one of: "
            "english, spanish, portuguese, french".format(lang)
        )
    return " ".join([token.lemma_ for token in doc])

def lemming_func(df, lang):
    """Lemmatize every entry of ``df["text"]`` in place, with a progress bar.

    Args:
        df: DataFrame with a ``text`` column of strings.
        lang: Language name forwarded to ``lemme``.

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    def _lemmatize(entry):
        return lemme(entry, lang)

    df["text"] = df.text.progress_map(_lemmatize)
    return df


# Stemming

In [14]:
def normalize(text):
    """Collapse a sequence of tokens into a single space-delimited string."""
    delimiter = " "
    return delimiter.join(text)

def stemming_func(df):
    """Stem each whitespace-separated word of ``df["text"]`` in place.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        The same ``df`` object, with its ``text`` column overwritten.
    """
    porter = PorterStemmer()

    def _stem_words(sentence):
        return [porter.stem(word) for word in sentence.split()]

    df["text"] = df.text.progress_map(_stem_words)
    df["text"] = df.text.apply(normalize)
    return df

#stemming(df)
#print(df)

In [15]:
def preprocess(data,language, hot_encoding = False, lower_case= False, presidio=False, 
               word_tokenize= False, w_tokenize = word_tokenize, spell_checker= False, 
               remove_stopwords = False, lemming=False, steeming= False, replace_syn=False):
    """Run the selected preprocessing steps on ``data`` in a fixed order.

    Steps, when enabled, run in this order: one-hot encoding (printed only),
    lower-casing, anonymization, tokenization, spell checking, stop-word
    removal, lemmatization, stemming. The steps overwrite the ``text`` column
    of ``data`` itself; no copy is made.

    Args:
        data: DataFrame with ``text`` and ``label`` columns.
        language: Language name: ``"portuguese"``, ``"english"``,
            ``"spanish"`` or ``"french"``.
        hot_encoding: Print the one-hot encoding of the labels. The encoded
            array is not returned.
        lower_case: Lower-case the text.
        presidio: Anonymize the text.
        word_tokenize: Tokenize the text with ``w_tokenize``.
        w_tokenize: Tokenizer callable used when ``word_tokenize`` is set.
        spell_checker: Spell-correct the text.
        remove_stopwords: Remove stop words and punctuation tokens.
        lemming: Lemmatize the text.
        steeming: Stem the text.
        replace_syn: Currently unused.

    Returns:
        The processed DataFrame (the same object as ``data``).

    Raises:
        ValueError: If ``presidio`` or ``spell_checker`` is enabled and
            ``language`` is not one of the supported names.
    """
    # Two-letter codes handed to presidio_func and spell_checker_func.
    language_codes = {
        "portuguese": "pt",
        "english": "en",
        "spanish": "es",
        "french": "fr",
    }
    lang = language_codes.get(language) if isinstance(language, str) else None

    # Fail before touching the data: previously an unsupported language left
    # `lang` unbound and the error only surfaced midway through the pipeline,
    # after earlier steps had already modified the frame. Steps that do not
    # need the code are still allowed to run with any `language` value.
    if lang is None and (presidio or spell_checker):
        raise ValueError(
            "Unsupported language {!r} for presidio/spell_checker; expected one of: {}"
            .format(language, ", ".join(sorted(language_codes)))
        )

    df = groupby(data, "label")

    if hot_encoding:
        print("One Hot Encoding:")
        encoded = one_hot_encoding_func(df)
        print(encoded)
        print("...")
    if lower_case:
        print("Converting to LowerCase")
        df = convert_to_lowercase_func(df)
    if presidio:
        print("Doing Data Protection and Anonymization")
        df = presidio_func(df, lang)
    if word_tokenize:
        print("Doing word_tokenize")
        df = word_tokenize_func(df, w_tokenize)
    if spell_checker:
        print("Doing Spell Checking")
        df = spell_checker_func(df, lang)
    if remove_stopwords:
        print("Removing StopWords")
        df = remove_stopwords_func(df, language)
    if lemming:
        print("Doing Lemmatization")
        df = lemming_func(df, language)
    if steeming:
        print("Doing steeming")
        df = stemming_func(df)

    return df
