In [1]:
# Relative paths of the input CSV and of the pickle written by this notebook.
DATA_PATH = '../../data/text/cleaned/final.csv'
DATA_OUTPUT_PATH = '../../data/text/cleaned/preprocessed/final.pkl'

# presumably the number of rows per read chunk -- TODO confirm where it is used
CHUNK_SIZE = 1_000_000
# presumably the CSV columns of interest -- TODO confirm where it is used
COLUMNS = ['text', 'priority']

# Module-level dictionary shared by the cells of this notebook; starts out empty.
words = dict()

# PRE PROCESAMIENTO DE DATOS

## Funciones

### Preprocesamiento
Es necesario remover de nuestros datos información irrelevante como etiquetas, puntuación, números y caracteres especiales.

In [2]:
import re

# An '@' followed by every character up to the next whitespace character or '>'.
# Using \s (instead of a literal space) keeps a mention that is followed by a
# tab or a newline from swallowing the word that comes after it.
TAG_RE = re.compile(r'@[^>\s]+')

def remove_at_sign(sentence: str):
    '''
    Removes every @-mention (the '@' together with the handle that follows it).

    A lone '@' that is not followed by at least one handle character is left
    untouched. Surrounding whitespace is not modified.

    :param sentence: String that may contain @-mentions
    :return: sentence with each @-mention replaced by an empty string
    '''

    return TAG_RE.sub('', sentence)

In [3]:
import nltk
# Fetch the NLTK 'stopwords' package; the English list from this corpus is read
# later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yonosoysantiago/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

# Compiled stop-word pattern, built on first use and reused afterwards so the
# corpus is not re-read and the regex not re-compiled for every sentence.
_STOPWORDS_RE = None

def _stopwords_regex():
    '''Returns the cached regex matching any English stop word plus trailing whitespace.'''
    global _STOPWORDS_RE
    if _STOPWORDS_RE is None:
        alternatives = '|'.join(re.escape(word) for word in stopwords.words('english'))
        # \s* (rather than \s) also removes a stop word that ends the sentence.
        _STOPWORDS_RE = re.compile(r'\b(?:' + alternatives + r')\b\s*')
    return _STOPWORDS_RE

def preprocess_text(sentence: str):
    '''
    Cleans up a sentence, leaving only lowercase words of 2 or more letters
    that are not English stop words.

    Steps, in order: lowercase, remove @-mentions, replace every non-letter
    with a space, drop single-letter words, collapse whitespace runs into one
    space, remove stop words. Leading/trailing spaces are not stripped.

    :param sentence: String to be cleaned
    :return: sentence without mentions, numbers, special chars,
             single-letter words and stop words
    '''

    cleaned_sentence = sentence.lower()
    cleaned_sentence = remove_at_sign(cleaned_sentence)
    # The text is already lowercase, so anything outside a-z is noise.
    cleaned_sentence = re.sub(r'[^a-z]', ' ', cleaned_sentence)
    # Word boundaries catch single letters at the start/end of the sentence and
    # in consecutive runs ("a b c"), which a whitespace-delimited pattern misses.
    cleaned_sentence = re.sub(r'\b[a-z]\b', ' ', cleaned_sentence)
    cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence)

    # Removal of stop words
    cleaned_sentence = _stopwords_regex().sub('', cleaned_sentence)

    return cleaned_sentence

In [5]:
import numpy as np

def add_to_dict(dictionary, filename):
    '''
    Loads word vectors from a space-delimited text file into ``dictionary``.

    Each line is expected to be ``<token> <float> <float> ...``. The token
    becomes the key and the remaining fields become a float numpy array.
    Blank lines, lines without any vector component and lines whose
    components cannot be parsed as floats are skipped.

    :param dictionary: dict updated in place (token -> numpy array)
    :param filename: path of the text file to read (decoded as UTF-8)
    :return: None
    '''
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of materialising the whole file with readlines().
        for line in f:
            parts = line.rstrip('\n').split(' ')

            # Need a non-empty token and at least one vector component.
            if len(parts) < 2 or not parts[0]:
                continue

            try:
                dictionary[parts[0]] = np.array(parts[1:], dtype=float)
            except ValueError:
                # Malformed numeric field: skip this line, keep loading the rest.
                continue

### Tokenización y Lematización
Una vez cargada la información de los tokens de GloVe se procede a tokenizar y lematizar cada
una de las oraciones en nuestro set de datos.

In [6]:
# Fetch the WordNet and Open Multilingual WordNet packages
# (presumably the data the WordNetLemmatizer used below relies on -- confirm).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/yonosoysantiago/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/yonosoysantiago/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

# Tokenizer that extracts runs of word characters, and the lemmatizer applied to each token.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

def sentence_to_token_list(sentence: str):
    '''
    Tokenizes and lemmatizes a sentence, keeping only known tokens.

    :param sentence: String to split into tokens
    :return: list of lemmatized tokens that are keys of the global ``words``
    '''
    return [
        lemma
        for lemma in map(lemmatizer.lemmatize, tokenizer.tokenize(sentence))
        if lemma in words
    ]

Con los tokens anteriores, los cuales sabemos que se pueden representar por medio de los vectores almacenados en `words`, pasamos a su representación vectorial:

In [8]:
def sentence_to_words_vectors(sentence: str, word_dict=words):
    '''
    Converts a sentence into the array of vectors of its known tokens.

    :param sentence: String to convert
    :param word_dict: mapping token -> vector; defaults to the ``words`` object
                      bound when this function is defined
    :return: float numpy array built from the vectors of the tokens returned by
             sentence_to_token_list that are also keys of ``word_dict``
    '''
    known_vectors = [
        word_dict[token]
        for token in sentence_to_token_list(sentence)
        if token in word_dict
    ]

    return np.array(known_vectors, dtype=float)

## Ejecución

In [9]:
import pandas as pd
import pickle

In [10]:
def process_data(data:pd.DataFrame):
    '''
    Cleans the 'text' column and converts every sentence to its word vectors.

    The input frame is left unmodified. Missing text values are treated as
    empty sentences, so the result keeps one entry per input row.

    :param data: DataFrame with a 'text' column
    :return: Series (same index as ``data``) holding, per row, the array
             returned by sentence_to_words_vectors for the cleaned sentence
    '''
    # fillna/astype guard against NaN or non-string cells, which have no .lower().
    cleaned_text = data['text'].fillna('').astype(str).apply(preprocess_text)
    return cleaned_text.apply(sentence_to_words_vectors)
    
def write_data(data:pd.DataFrame, first_one=False, path=None):
    '''
    Pickles ``data`` to the output file, one pickle frame per call.

    The first call (``first_one=True``) truncates the file; later calls append
    another pickle frame to it. To read the chunks back, open the file once in
    binary mode and call ``pickle.load`` repeatedly until ``EOFError``.

    :param data: picklable object to store (e.g. a DataFrame or Series)
    :param first_one: True to start a fresh file, False to append to it
    :param path: destination file; defaults to DATA_OUTPUT_PATH
    :return: None
    '''
    target = DATA_OUTPUT_PATH if path is None else path
    # 'wb' discards output left over from a previous run; 'ab' adds a new frame.
    file_mode = 'wb' if first_one else 'ab'
    with open(target, file_mode) as f:
        pickle.dump(data, f)
        

In [11]:

# Fill the shared `words` lookup from the GloVe text file before any sentence
# is tokenized (tokens are filtered against `words`).
add_to_dict(words, './GloVe/glove.6B/glove.6B.50d.txt')

READ_CHUNK_ROWS = 10**5  # rows read from the CSV per iteration
MAX_ROWS = 10**6         # only the first MAX_ROWS rows of the CSV are read

# NOTE(review): process_data returns only the vectors derived from 'text', so
# no other CSV column is written to the output pickle -- confirm this is intended.
data_array = []
chunks = pd.read_csv(DATA_PATH, chunksize=READ_CHUNK_ROWS, nrows=MAX_ROWS)
for count, chunk in enumerate(chunks, start=1):
    print("PROCESSING " + str(count))
    data_array.append(process_data(chunk))

# Single concat after the loop, then one pickle with the combined result.
data = pd.concat(data_array, ignore_index=True)
with open(DATA_OUTPUT_PATH, 'wb') as f:
    pickle.dump(data, f)

PROCESSING 1
PROCESSING 2
PROCESSING 3
PROCESSING 4
PROCESSING 5
PROCESSING 6
PROCESSING 7
PROCESSING 8
PROCESSING 9
PROCESSING 10
