In [1]:
%load_ext watermark
%watermark

2020-09-14T12:51:01-05:00

CPython 3.7.6
IPython 7.13.0

compiler   : GCC 7.3.0
system     : Linux
release    : 5.4.0-47-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 4
interpreter: 64bit


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
plt.rcParams['figure.figsize'] = (10,10)

## Variables categóricas

Los modelos están diseñados para trabajar con variables numéricas. Esto implica que, para poder entrenar los modelos con variables categóricas, tenemos que convertirlas a números. Este proceso se llama codificación (*encoding*).

In [3]:
# Read the raw dataset from disk, then preview its first five rows.
datos = pd.read_csv(filepath_or_buffer="datos_procesamiento.csv")
datos.head(n=5)

Unnamed: 0,col_inexistente1,col2,col3,col_outliers,col_outliers2,col_categorica,col_ordinal,col_texto
0,59.0,52.0,2.232832,-50,0.771666,ratón,muy bien,Tenía en su casa una ama que pasaba de los cua...
1,31.0,74.0,0.906147,-5,1.068558,elefante,regular,"El resto della concluían sayo de velarte, calz..."
2,81.0,28.0,0.62675,-32,0.846396,ratón,muy mal,"El resto della concluían sayo de velarte, calz..."
3,34.0,16.0,0.816738,-84,0.637381,gato,mal,"Una olla de algo más vaca que carnero, salpicó..."
4,32.0,28.0,0.571131,65,4.540614,gato,bien,Tenía en su casa una ama que pasaba de los cua...


In [4]:
# Keep only the two non-numeric feature columns (all rows).
var_categoricas = datos.loc[:, ["col_categorica", "col_ordinal"]]

In [6]:
var_categoricas.head()

Unnamed: 0,col_categorica,col_ordinal
0,ratón,muy bien
1,elefante,regular
2,ratón,muy mal
3,gato,mal
4,gato,bien


Utilizamos LabelEncoder

In [9]:
from sklearn import preprocessing

# fit() hands back the encoder itself, so creation and fitting can be chained;
# the bare name on the last line keeps the fitted encoder as the cell output.
label_codificador = preprocessing.LabelEncoder().fit(datos.col_ordinal)
label_codificador

LabelEncoder()

In [10]:
label_codificador.classes_

array(['bien', 'mal', 'muy bien', 'muy mal', 'regular'], dtype=object)

In [11]:
label_codificador.transform(['bien', 'mal', 'muy bien', 'muy mal', 'regular'])

array([0, 1, 2, 3, 4])

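Ha tomado los valores en orden alfabético, que no coincide con el orden real de las categorías (muy mal < mal < regular < bien < muy bien), por lo que *LabelEncoder* no suele ser muy utilizado para variables ordinales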

In [16]:
label_codificador.inverse_transform([0,0,1,2])

array(['bien', 'bien', 'mal', 'muy bien'], dtype=object)

In [22]:
# Only the learned classes are inspected in this cell, so fit() is sufficient:
# there is no need to compute an encoded array that nothing reads.
label_codificador_categorica = preprocessing.LabelEncoder()
label_codificador_categorica.fit(datos.col_categorica)
label_codificador_categorica.classes_

array(['elefante', 'gato', 'perro', 'ratón'], dtype=object)

Para variables categóricas sin orden (por ejemplo, animales) no tiene sentido este tipo de codificación, ya que los números implican un orden y unas operaciones que no existen entre las categorías: *ratón + ratón no produce elefante*

Para estas variables utilizamos [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

In [24]:
# One-hot encoder created with its default settings.
oh_codificador = preprocessing.OneHotEncoder()

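En versiones antiguas de scikit-learn (anteriores a la 0.20) a **oh_codificador** no le podíamos pasar datos de texto, que es como se encuentra la variable categórica; por tanto, aquí la codificamos nosotros a números antes de hacer fit. Las versiones actuales de *OneHotEncoder* aceptan directamente categorías de texto, por lo que este paso previo ya no es obligatorio.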

In [25]:
# Integer-encode the animal labels (one code per distinct category).
categorias_codificadas = label_codificador_categorica.fit_transform(datos["col_categorica"])

In [26]:
categorias_codificadas

array([3, 0, 3, 1, 1, 2, 2, 2, 0, 0, 2, 0, 1, 3, 1, 3, 0, 3, 3, 0, 3, 3,
       3, 1, 1, 3, 1, 0, 1, 0, 0, 0, 0, 3, 0, 1, 0, 1, 1, 1, 1, 3, 0, 1,
       1, 2, 0, 1, 0, 2, 3, 2, 0, 3, 3, 1, 1, 0, 3, 2, 1, 1, 1, 1, 3, 0,
       0, 2, 1, 3, 2, 0, 1, 1, 3, 3, 0, 2, 3, 3, 0, 1, 2, 3, 1, 0, 0, 2,
       2, 2, 3, 1, 3, 1, 1, 0, 1, 1, 2, 2, 3, 2, 1, 0, 3, 2, 1, 1, 3, 1,
       2, 2, 2, 1, 2, 2, 0, 2, 3, 1, 1, 1, 1, 3, 3, 3, 2, 0, 0, 2, 3, 1,
       0, 1, 3, 3, 0, 1, 2, 3, 1, 0, 0, 0, 0, 2, 0, 0, 1, 3, 2, 2, 1, 2,
       2, 0, 1, 2, 2, 2, 2, 1, 3, 1, 1, 1, 2, 3, 3, 0, 3, 3, 0, 0, 3, 0,
       0, 0, 3, 3, 1, 2, 0, 0, 1, 1, 2, 0, 0, 0, 3, 2, 2, 1, 3, 1, 2, 3,
       0, 3, 3, 1, 3, 1, 3, 0, 1, 1, 2, 0, 2, 2, 1, 0, 0, 0, 1, 2, 1, 2,
       0, 0, 0, 0, 2, 1, 1, 0, 3, 2, 2, 3, 3, 2, 2, 0, 0, 2, 1, 2, 3, 2,
       3, 1, 3, 2, 0, 1, 1, 3, 3, 2, 1, 3, 3, 2, 0, 1, 3, 2, 0, 3, 0, 2,
       3, 2, 2, 2, 3, 3, 3, 2, 0, 2, 0, 1, 0, 0, 0, 0, 3, 0, 3, 0, 2, 3,
       1, 0, 2, 2, 3, 2, 3, 1, 1, 2, 1, 3, 2, 0, 1,

Como es un LabelEncoder, nos devuelve la codificación numérica de cada registro

In [27]:
# OneHotEncoder expects a 2-D input with one column per feature.
# reshape(-1, 1) lets NumPy infer the number of rows, so the cell no longer
# breaks (or silently depends on) a dataset of exactly 1000 samples.
categorias_oh_codificadas = oh_codificador.fit_transform(categorias_codificadas.reshape(-1, 1))
categorias_oh_codificadas

<1000x4 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in Compressed Sparse Row format>

Se utiliza reshape porque OneHotEncoder debe recibir una matriz de dos dimensiones (una lista de listas, con una fila por registro y una columna por variable). Este nos devuelve una matriz "sparse" o matriz dispersa (escasa), que es una manera de representar matrices con muchos ceros (como es el caso de OneHotEncoding) consumiendo poca memoria

Convertimos esta matriz en un arreglo

In [29]:
categorias_oh_codificadas.toarray()

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

Como se puede observar, nos arrojó una matriz en la que cada fila corresponde a un registro del dataset y tiene marcada con un **1** la columna de su categoría: el primer registro es un ratón (cuarta columna) y el segundo registro es un elefante (primera columna)

In [30]:
import sys
# NOTE(review): the recorded result (64 bytes) cannot include the 1000 stored
# float64 values, which need 8000 bytes on their own, so sys.getsizeof appears
# to report only the wrapper object here. Summing the `nbytes` of the matrix's
# .data, .indices and .indptr buffers should give a fairer figure to compare
# against the dense array — confirm before quoting these numbers.
sys.getsizeof(categorias_oh_codificadas)

64

In [31]:
sys.getsizeof(categorias_oh_codificadas.toarray())

32112

Si queremos que la función nos devuelva arrays de NumPy, porque son más legibles, podemos pasarle el parámetro sparse como **False**

In [33]:
# sparse=False makes fit_transform return a dense NumPy array instead of a
# SciPy sparse matrix.
# NOTE: scikit-learn 1.2 renamed this argument to `sparse_output`; switch to
# OneHotEncoder(sparse_output=False) when running on a recent release.
oh_codificador = preprocessing.OneHotEncoder(sparse=False)

# reshape(-1, 1) infers the row count instead of hard-coding the dataset length.
categorias_oh_codificadas = oh_codificador.fit_transform(categorias_codificadas.reshape(-1, 1))
categorias_oh_codificadas

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [35]:
oh_codificador.categories_

[array([0, 1, 2, 3])]

Pandas tiene una función que hace esto por sí sola de forma más fácil

In [36]:
pd.get_dummies(datos.col_categorica).head()

Unnamed: 0,elefante,gato,perro,ratón
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0
4,0,1,0,0


## Procesado de texto

In [37]:
from sklearn import feature_extraction

In [39]:
datos.col_texto.values[:10]

array(['Tenía en su casa una ama que pasaba de los cuarenta, y una sobrina que no llegaba a los veinte, y un mozo de campo y plaza, que así ensillaba el rocín como tomaba la podadera.',
       'El resto della concluían sayo de velarte, calzas de velludo para las fiestas con sus pantuflos de lo mismo, los días de entre semana se honraba con su vellori de lo más fino.',
       'El resto della concluían sayo de velarte, calzas de velludo para las fiestas con sus pantuflos de lo mismo, los días de entre semana se honraba con su vellori de lo más fino.',
       'Una olla de algo más vaca que carnero, salpicón las más noches, duelos y quebrantos los sábados, lentejas los viernes, algún palomino de añadidura los domingos, consumían las tres partes de su hacienda.',
       'Tenía en su casa una ama que pasaba de los cuarenta, y una sobrina que no llegaba a los veinte, y un mozo de campo y plaza, que así ensillaba el rocín como tomaba la podadera.',
       'En un lugar de la Mancha, de cuyo nom

Lo que hacemos para trabajar el texto es vectorizar las palabras, es decir, convertir texto en vectores.

[CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) devuelve un vector con valor 0 en todas las palabras que no existen en una frase y con el número de ocurrencias de las palabras que sí existen.

In [40]:
# Toy corpus: four short sentences that share most of their vocabulary.
ejemplo_frases = [
    "los coches rojos",
    "los aviones son rojos",
    "los coches y los aviones son rojos",
    "los camiones rojos",
]

# Learn the vocabulary first, then turn every sentence into a row of word counts.
vectorizador_count = feature_extraction.text.CountVectorizer()
vectorizador_count.fit(ejemplo_frases)
X = vectorizador_count.transform(ejemplo_frases)
X

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [41]:
vectorizador_count.get_feature_names()

['aviones', 'camiones', 'coches', 'los', 'rojos', 'son']

In [42]:
pd.DataFrame(X.toarray(), columns=vectorizador_count.get_feature_names())

Unnamed: 0,aviones,camiones,coches,los,rojos,son
0,0,0,1,1,1,0
1,1,0,0,1,1,1
2,1,0,1,2,1,1
3,0,1,0,1,1,0


Existen algunos conectores que son necesarios para expresarnos pero que no aportan nada a nivel semántico al significado de una frase, por ejemplo la palabra *los*.

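Una manera más sofisticada de vectorizar un texto es usar, en vez del número de apariciones, "TF-IDF" (*Term Frequency – Inverse Document Frequency*, es decir, "Frecuencia de Término – Frecuencia Inversa de Documento"). Es una medida que asigna pesos a las palabras en función de su frecuencia de aparición en cada documento y en el conjunto de todos los documentos: una palabra que aparece en casi todos los documentos (como *los*) recibe menos peso que una palabra poco común.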

In [44]:
# Same toy corpus, now weighted with TF-IDF instead of raw counts.
vectorizador_tfidf = feature_extraction.text.TfidfVectorizer()
X = vectorizador_tfidf.fit_transform(ejemplo_frases)

# Dense view with one labelled column per vocabulary term.
pd.DataFrame(
    data=X.toarray(),
    columns=vectorizador_tfidf.get_feature_names(),
)

Unnamed: 0,aviones,camiones,coches,los,rojos,son
0,0.0,0.0,0.730064,0.483222,0.483222,0.0
1,0.589645,0.0,0.0,0.39028,0.39028,0.589645
2,0.438931,0.0,0.438931,0.581047,0.290524,0.438931
3,0.0,0.804612,0.0,0.41988,0.41988,0.0


In [45]:
# Fresh TF-IDF vectorizer, this time fitted on the dataset's free-text column.
vectorizador_tfidf = feature_extraction.text.TfidfVectorizer()
texto_vectorizado = vectorizador_tfidf.fit_transform(datos["col_texto"])
texto_vectorizado

<1000x134 sparse matrix of type '<class 'numpy.float64'>'
	with 28295 stored elements in Compressed Sparse Row format>

In [46]:
label_codificador.classes_

array(['bien', 'mal', 'muy bien', 'muy mal', 'regular'], dtype=object)

In [47]:
pd.DataFrame(texto_vectorizado.toarray(), columns=vectorizador_tfidf.get_feature_names())

Unnamed: 0,acordarme,adarga,algo,alguna,algún,ama,amigo,antigua,astillero,así,...,una,vaca,veinte,velarte,vellori,velludo,verdad,verosímiles,viernes,vivía
0,0.000000,0.000000,0.000000,0.0,0.000000,0.204745,0.000000,0.000000,0.000000,0.204745,...,0.318989,0.000000,0.204745,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
1,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.181842,0.181842,0.181842,0.0,0.0,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.181842,0.181842,0.181842,0.0,0.0,0.000000,0.000000
3,0.000000,0.000000,0.194272,0.0,0.194272,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.143675,0.194272,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.194272,0.000000
4,0.000000,0.000000,0.000000,0.0,0.000000,0.204745,0.000000,0.000000,0.000000,0.204745,...,0.318989,0.000000,0.204745,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.218526,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
996,0.000000,0.000000,0.000000,0.0,0.000000,0.204745,0.000000,0.000000,0.000000,0.204745,...,0.318989,0.000000,0.204745,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000
997,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.181842,0.181842,0.181842,0.0,0.0,0.000000,0.000000
998,0.197887,0.197887,0.000000,0.0,0.000000,0.000000,0.000000,0.197887,0.197887,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.197887


### Juntando todo lo que se hizo

In [53]:
from sklearn.impute import SimpleImputer

# Column groups, one per preprocessing strategy.
col_numericas = ["col_inexistente1", "col2", "col3", "col_outliers", "col_outliers2"]
col_categorica = ["col_categorica"]
col_texto = ["col_texto"]

# --- Numeric variables: fill missing values with the column mean, then standardize.
imputador = SimpleImputer(strategy="mean")
escalador = preprocessing.StandardScaler()
var_numericas_imputadas_escalado_standard = escalador.fit_transform(
    imputador.fit_transform(datos[col_numericas])
)

df_numerico_procesado = pd.DataFrame(
    var_numericas_imputadas_escalado_standard,
    columns=col_numericas,
)

# --- Categorical variable: integer codes first, then one indicator column per class.
label_codificador_categorico = preprocessing.LabelEncoder()
categorias_codificadas = label_codificador_categorico.fit_transform(datos.col_categorica)
# NOTE: scikit-learn 1.2 renamed `sparse` to `sparse_output`.
oh_codificador = preprocessing.OneHotEncoder(sparse=False)
# reshape(-1, 1) infers the number of rows, so this works for any dataset length.
categorias_oh_codificadas = oh_codificador.fit_transform(categorias_codificadas.reshape(-1, 1))

df_categorico_procesado = pd.DataFrame(
    categorias_oh_codificadas,
    columns=label_codificador_categorico.classes_,
)

# --- Text: TF-IDF weights, one column per vocabulary term.
vectorizador_tfidf = feature_extraction.text.TfidfVectorizer()
texto_vectorizado = vectorizador_tfidf.fit_transform(datos.col_texto)
# NOTE: scikit-learn 1.0 deprecated get_feature_names() in favour of
# get_feature_names_out(); the old name is gone from 1.2 onwards.
df_texto_procesado = pd.DataFrame(
    texto_vectorizado.toarray(),
    columns=vectorizador_tfidf.get_feature_names(),
)

# Put the three processed blocks side by side (they share the same row order).
datos_procesados = pd.concat(
    [
        df_numerico_procesado,
        df_categorico_procesado,
        df_texto_procesado,
    ],
    axis=1,
)

# --- Ordinal variable.
# A fresh encoder is created and actually used here. Previously a misspelled
# name left the new encoder unused and the cell relied on an encoder defined
# in an earlier cell, so it failed on a fresh kernel if that cell was skipped.
# LabelEncoder assigns codes in alphabetical order of the labels, not in the
# labels' natural order.
label_codificador = preprocessing.LabelEncoder()
datos_procesados["col_ordinal"] = label_codificador.fit_transform(datos.col_ordinal)

# Sanity check: concatenating the blocks must not add or drop rows.
assert len(datos_procesados) == len(datos), "row count changed while assembling datos_procesados"


In [54]:
datos_procesados.head()

Unnamed: 0,col_inexistente1,col2,col3,col_outliers,col_outliers2,elefante,gato,perro,ratón,acordarme,...,vaca,veinte,velarte,vellori,velludo,verdad,verosímiles,viernes,vivía,col_ordinal
0,0.399217,0.082807,0.442819,-0.6946,-0.038365,0.0,0.0,0.0,1.0,0.0,...,0.0,0.204745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,-0.653605,0.861333,-0.32339,-0.118466,-0.038278,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.181842,0.181842,0.181842,0.0,0.0,0.0,0.0,4
2,1.226435,-0.766494,-0.484752,-0.464146,-0.038343,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.181842,0.181842,0.181842,0.0,0.0,0.0,0.0,3
3,-0.540803,-1.191145,-0.375028,-1.129901,-0.038405,0.0,1.0,0.0,0.0,0.0,...,0.194272,0.0,0.0,0.0,0.0,0.0,0.0,0.194272,0.0,1
4,-0.616004,-0.766494,-0.516874,0.777743,-0.037257,0.0,1.0,0.0,0.0,0.0,...,0.0,0.204745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
