In [None]:
# CSV file read by the processing loop at the end of the notebook.
DATA_PATH = '../../data/text/cleaned/final.csv'
# Presumably where the preprocessed result is meant to be stored — TODO confirm,
# nothing in the visible cells writes to it.
DATA_OUTPUT_PATH = '../../data/text/cleaned/preprocessed/final.pkl'

# NOTE(review): neither constant below is referenced by the visible cells.
CHUNK_SIZE = 1_000_000
COLUMNS = ['text', 'priority']

# Shared token -> vector lookup table; populated in place by add_to_dict.
words = dict()

# PREPROCESAMIENTO DE DATOS

## Funciones

### Preprocesamiento
Es necesario remover de nuestros datos información irrelevante como etiquetas, puntuación, números y caracteres especiales.

In [None]:
import re

# Matches an @mention: '@' followed by one or more characters that are neither
# whitespace nor '>'. Excluding all whitespace (not just the space character)
# keeps a mention that ends at a newline or tab from swallowing the next word.
TAG_RE = re.compile(r'@[^>\s]+')


def remove_at_sign(sentence: str) -> str:
    '''
    Removes every @mention from a string.

    The '@' and the handle that follows it are deleted (replaced by the empty
    string); the surrounding whitespace is left untouched.

    :param sentence: String that may contain @mentions
    :return: sentence with all @mentions removed
    '''
    return TAG_RE.sub('', sentence)

In [None]:
import langid
from deep_translator import GoogleTranslator

def translate_sentence(sentence: str):
    """
    Return the sentence in English.

    The language is detected with ``langid``; a sentence detected as English
    is returned untouched, anything else is sent to ``GoogleTranslator`` with
    automatic source-language detection.

    :param sentence: The string/sentence to translate
    :return: The input itself when it is detected as English, otherwise the
        translator's output
    """
    detected_language = langid.classify(sentence)[0]
    if detected_language == 'en':
        return sentence

    translator = GoogleTranslator(source='auto')
    return translator.translate(sentence)

In [None]:
import nltk
# Fetch the NLTK 'stopwords' resource; the stopword list is read later in this
# notebook through nltk.corpus.stopwords.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

# Compiled lazily on first use and then reused, instead of re-reading the
# stopword list and recompiling the pattern for every sentence.
_STOPWORDS_RE = None


def _stopwords_pattern():
    '''Returns the cached regex matching any English stopword as a whole word.'''
    global _STOPWORDS_RE
    if _STOPWORDS_RE is None:
        alternatives = '|'.join(re.escape(word) for word in stopwords.words('english'))
        # Trailing whitespace is optional so a stopword that ends the sentence
        # is removed as well.
        _STOPWORDS_RE = re.compile(r'\b(?:' + alternatives + r')\b\s*')
    return _STOPWORDS_RE


def preprocess_text(sentence: str) -> str:
    '''
    Cleans up a sentence, leaving only lowercase non-stopwords of two or more letters.

    Steps, in order: remove @mentions, translate to English when another
    language is detected, lowercase, replace every non-letter with a space,
    drop single-letter words, collapse whitespace and remove English stopwords.

    :param sentence: String to be cleaned
    :return: sentence without mentions, numbers, special chars, single letters
        and stopwords, with no leading or trailing whitespace
    '''
    # Drop mentions first so user handles are not sent to the translator.
    cleaned_sentence = remove_at_sign(sentence)

    # Translate before any character stripping: removing non a-z characters
    # first would destroy accented and non-Latin text, leaving nothing
    # meaningful to detect or translate.
    cleaned_sentence = translate_sentence(cleaned_sentence)

    # Normalize after translating, because the translated text can bring back
    # uppercase letters and punctuation.
    cleaned_sentence = cleaned_sentence.lower()
    cleaned_sentence = re.sub(r'[^a-z]', ' ', cleaned_sentence)
    # Only letters and spaces remain here, so \b[a-z]\b is a standalone letter.
    cleaned_sentence = re.sub(r'\b[a-z]\b', ' ', cleaned_sentence)
    cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence)

    # Removal of stopwords
    cleaned_sentence = _stopwords_pattern().sub('', cleaned_sentence)

    return cleaned_sentence.strip()

In [None]:
import numpy as np

def add_to_dict(dictionary, filename):
    """
    Adds word vectors read from a text file to a dictionary.

    Every line is split on single spaces: the first field becomes the key and
    the remaining fields are converted to a float array stored as its value.
    Blank lines, lines without any vector component and lines whose components
    are not numeric are skipped.

    :param dictionary: Dictionary the entries are added to (modified in place).
    :type dictionary: dict
    :param filename: Path of the file to read.
    :type filename: str
    """
    # Explicit encoding so the result does not depend on the platform default;
    # assumes the embeddings file is UTF-8 encoded.
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate the file object directly so the whole file is never held in
        # memory at once.
        for line in f:
            token, *components = line.rstrip('\n').split(' ')

            # A blank or token-only line carries no vector; skip it instead of
            # storing an empty array.
            if not components:
                continue

            try:
                dictionary[token] = np.array(components, dtype=float)
            except ValueError:
                # Malformed numeric field: ignore this line and keep loading.
                continue

### Tokenización y Lematización
Una vez cargada la información de los tokens de GloVe se procede a tokenizar y lematizar cada
una de las oraciones en nuestro set de datos.

In [None]:
# Fetch the NLTK 'wordnet' and 'omw-1.4' resources; presumably required by the
# WordNetLemmatizer used for lemmatization in this notebook — TODO confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# Tokenizer that extracts runs of word characters (regex \w+) and the WordNet
# lemmatizer applied to each extracted token.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()


def sentence_to_token_list(sentence: str):
    """
    Converts a sentence into a list of useful tokens.

    The sentence is tokenized, every token is lemmatized, and only the
    lemmatized tokens that are keys of the module-level ``words`` dictionary
    are kept.

    :param sentence: Sentence to process.
    :type sentence: str
    :return: List of useful tokens.
    :rtype: list[str]
    """
    tokens = tokenizer.tokenize(sentence)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    useful_tokens = [token for token in lemmatized_tokens if token in words]

    return useful_tokens

Con los tokens obtenidos en el paso anterior, que sabemos que están almacenados en `words`, pasamos ahora a obtener su representación vectorial:

In [None]:
def sentence_to_words_vectors(sentence: str, word_dict=words):
    """
    Maps a sentence to the vectors of its known tokens.

    The sentence is first reduced to its useful tokens with
    ``sentence_to_token_list``; each token present in ``word_dict`` is then
    replaced by the vector stored for it.

    :param sentence: Sentence to process.
    :type sentence: str
    :param word_dict: Dictionary mapping tokens to vectors, defaults to `words`.
    :type word_dict: dict, optional
    :return: Float array holding one vector per retained token.
    :rtype: numpy.ndarray
    """
    known_vectors = [
        word_dict[token]
        for token in sentence_to_token_list(sentence)
        if token in word_dict
    ]
    return np.array(known_vectors, dtype=float)

## Ejecución

In [None]:
import pandas as pd

In [None]:
def process_data(data: pd.DataFrame):
    """
    Processes the 'text' column of a DataFrame into word vectors.

    Missing values are treated as empty strings and every value is converted
    to ``str`` before cleaning, so a row without text yields an empty result
    instead of raising. The input DataFrame is not modified.

    :param data: DataFrame containing a 'text' column to process.
    :type data: pd.DataFrame
    :return: One array of word vectors per row, indexed like ``data``.
    :rtype: pandas.Series
    """
    # Work on a derived Series rather than assigning back into `data`, so the
    # caller's frame keeps its original text.
    raw_text = data['text'].fillna('').astype(str)
    cleaned_text = raw_text.apply(preprocess_text)
    return cleaned_text.apply(sentence_to_words_vectors)

In [None]:

# Fill the shared `words` lookup table from the GloVe vectors file.
add_to_dict(words, './GloVe/glove.6B/glove.6B.50d.txt')

first = True
count = 1
data_array = []

# Read at most 10**6 rows, 10**5 rows per chunk, and keep one processed
# Series per chunk in `data_array`.
chunk_reader = pd.read_csv(DATA_PATH, chunksize=10**5, nrows=10**6)
for chunk in chunk_reader:
    print(f"PROCESSING {count}")
    data = process_data(chunk)
    data_array.append(data)

    count += 1