## Formas de preprocesamiento

### `sklearn.feature_extraction.text.CountVectorizer`

In [1]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from scipy import sparse

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/veterok/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus of four lyric "documents". Entries 1 and 3 are identical, and
# entry 2 differs from entry 0 by a single token ("cono" instead of "coco").
demo = [
    "No te me subas al coco no, No te me subas al coco no, No nena no al coco no al coco no",
    "No no no al coco no al coco no, No no no al coco no al coco no, No te me subas al coco no, No te me subas al coco no, No nena no al coco no al coco no",
    "No te me subas al coco no, No te me subas al cono no, No nena no al coco no al coco no",
    "No no no al coco no al coco no, No no no al coco no al coco no, No te me subas al coco no, No te me subas al coco no, No nena no al coco no al coco no",
]
        

In [11]:
# Spanish stop-word list from the NLTK stopwords corpus.
# NOTE(review): not passed to any vectorizer in the cells shown here --
# presumably intended for CountVectorizer(stop_words=...); confirm.
es_stopwords = stopwords.words('spanish')

In [37]:
# Bag-of-words vectorizer restricted to the 3 highest-frequency terms of the corpus.
vectorize = CountVectorizer(max_features=3)
# Learn the vocabulary from `demo` and build the document-term count matrix.
letras = vectorize.fit_transform(demo)

In [38]:
# Vocabulary learned by the vectorizer, in column order.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# returns an ndarray, so `.tolist()` keeps the plain-list display.
vectorize.get_feature_names_out().tolist()

['al', 'coco', 'no']

In [29]:
# Dense view of the count matrix (one row per document, one column per term).
# NOTE(review): the recorded output below has five columns although the
# vectorizer above is built with max_features=3, and the execution counter
# goes backwards (29 after 38) -- the output looks stale; restart and re-run.
letras.toarray()

array([[ 4,  0,  1,  8,  2],
       [ 8,  0,  1, 18,  2],
       [ 3,  1,  1,  8,  2],
       [ 8,  0,  1, 18,  2]], dtype=int64)

In [30]:
# Wrap the dense count matrix in a DataFrame; column labels are assigned in a later cell.
demo_df = pd.DataFrame(letras.toarray())

In [31]:
# Label each count column with its vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2; use the `_out` variant.
demo_df.columns = vectorize.get_feature_names_out()

In [32]:
# Display the labelled term-count table.
demo_df

Unnamed: 0,coco,cono,nena,no,subas
0,4,0,1,8,2
1,8,0,1,18,2
2,3,1,1,8,2
3,8,0,1,18,2


### `sklearn.feature_extraction.text.TfidfVectorizer`

TF-IDF (*Term Frequency – Inverse Document Frequency*) mide la "originalidad" de una palabra comparando la cantidad de veces que aparece en un documento con el número de documentos del corpus en los que aparece:


$$
\textsf{TF-IDF} = \textsf{TF}(\textsf{termino}, \textsf{documento}) \times \textsf{IDF}(\textsf{termino})
$$

donde $\textsf{TF}$ es la frecuencia del término en un documento específico e $\textsf{IDF}$ se define como:

$$
\textsf{IDF}(\textsf{termino}) = \log\frac{1 + \textsf{Numero de documentos}}{1 + \textsf{DF}(\textsf{termino})} + 1
$$

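donde $\textsf{DF}$ es la frecuencia de documento del término, es decir, el número de documentos del corpus en los que aparece. Por defecto, `TfidfVectorizer` normaliza además cada fila resultante a norma $\ell_2$ unitaria, por lo que los valores mostrados más abajo no son el producto $\textsf{TF} \times \textsf{IDF}$ sin normalizar.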


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# TF-IDF vectorizer with default settings (no vocabulary cap).
tfidf_vect = TfidfVectorizer()
# Learn vocabulary and IDF weights from `demo`, then build the weighted document-term matrix.
letras_tfidf = tfidf_vect.fit_transform(demo)

In [42]:
# Vocabulary learned by the TF-IDF vectorizer, in column order.
# `get_feature_names()` was removed in scikit-learn 1.2; `get_feature_names_out()`
# returns an ndarray, so `.tolist()` keeps the plain-list display.
tfidf_vect.get_feature_names_out().tolist()

['al', 'coco', 'cono', 'me', 'nena', 'no', 'subas', 'te']

In [43]:
# Dense view of the TF-IDF weights (one row per document, one column per term).
letras_tfidf.toarray()

array([[0.38313051, 0.38313051, 0.        , 0.19156526, 0.09578263,
        0.76626103, 0.19156526, 0.19156526],
       [0.37099112, 0.37099112, 0.        , 0.09274778, 0.04637389,
        0.83473001, 0.09274778, 0.09274778],
       [0.38911653, 0.2918374 , 0.1864151 , 0.19455826, 0.09727913,
        0.77823306, 0.19455826, 0.19455826],
       [0.37099112, 0.37099112, 0.        , 0.09274778, 0.04637389,
        0.83473001, 0.09274778, 0.09274778]])

In [44]:
# Build the TF-IDF table with its term labels in a single step, so the frame
# never exists with unlabelled columns and the cell is idempotent on re-run.
# `get_feature_names()` was removed in scikit-learn 1.2; use the `_out` variant.
demo_tfidf = pd.DataFrame(
    letras_tfidf.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)

In [45]:
# Display the labelled TF-IDF table.
demo_tfidf

Unnamed: 0,al,coco,cono,me,nena,no,subas,te
0,0.383131,0.383131,0.0,0.191565,0.095783,0.766261,0.191565,0.191565
1,0.370991,0.370991,0.0,0.092748,0.046374,0.83473,0.092748,0.092748
2,0.389117,0.291837,0.186415,0.194558,0.097279,0.778233,0.194558,0.194558
3,0.370991,0.370991,0.0,0.092748,0.046374,0.83473,0.092748,0.092748


In [None]:
def preprocess_strings(string_array, language="english"):
    """Tokenize, clean, stop-word-filter and stem every string in a collection.

    Each input string is tokenized with ``nltk.word_tokenize``; every token is
    lower-cased and stripped of non-alphanumeric characters; tokens that end up
    empty or that are stop words are dropped; the remaining tokens are reduced
    to their stem with a Snowball stemmer.

    Parameters
    ----------
    string_array : iterable of str
        The documents to preprocess.
    language : str, default "english"
        Language name handed to the NLTK stop-word corpus, the tokenizer and
        the Snowball stemmer (e.g. ``"spanish"``).

    Returns
    -------
    list of list of str
        One list of stemmed tokens per input string, in input order. A string
        with no surviving tokens yields an empty list.

    Notes
    -----
    Relies on the module-level ``nltk`` and ``re`` imports. The NLTK stop-word
    corpus and the tokenizer data used by ``word_tokenize`` must already be
    downloaded (``nltk.download``).
    """
    # A set gives O(1) membership checks; the local name avoids shadowing the
    # `stopwords` corpus module imported at the top of the notebook.
    stop_words = set(nltk.corpus.stopwords.words(language))
    # The stemmer reduces words to a common root.
    stemmer = nltk.stem.snowball.SnowballStemmer(language)
    # Compiled once instead of once per token. `\W` is Unicode-aware for str
    # patterns, so accented letters and "ñ" survive; `_` is listed explicitly
    # because `\w` would otherwise keep it.
    non_alphanumeric = re.compile(r"[\W_]+")

    processed_strings = []
    for string in string_array:
        kept_tokens = []
        for token in nltk.word_tokenize(string, language=language):
            # Normalize: lower-case, then drop every non-alphanumeric character.
            token = non_alphanumeric.sub("", token.lower())
            # Skip tokens that were pure punctuation and skip stop words.
            # (Previously stop words only bypassed stemming and were still kept.)
            if not token or token in stop_words:
                continue
            kept_tokens.append(stemmer.stem(token))
        processed_strings.append(kept_tokens)

    return processed_strings

### Elementos a considerar en el preprocesamiento