# Preprocesado de datos

## Creación de conjunto de entrenamiento y pruebas

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv("precios_casas.csv")
X = df.iloc[:,1:].values
y = df.iloc[:,0].values
print(X)
entX, pruX, enty, pruy = train_test_split(X, y, test_size=0.2, random_state=100)


[[3.0 1.5 1340 ... 'Shoreline' 'WA 98133' 'USA']
 [5.0 2.5 3650 ... 'Seattle' 'WA 98119' 'USA']
 [3.0 2.0 1930 ... 'Kent' 'WA 98042' 'USA']
 ...
 [3.0 2.5 3010 ... 'Renton' 'WA 98059' 'USA']
 [4.0 2.0 2090 ... 'Seattle' 'WA 98178' 'USA']
 [3.0 2.5 1490 ... 'Covington' 'WA 98042' 'USA']]


## Manejo de datos ausentes

In [4]:
df.isnull().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [26]:
import numpy as np
df = pd.DataFrame([
        ['1', 1, 30],
        ['2', np.nan, 32],
        ['3', 0]],               
)
df.columns = ['codigo','credito','edad']

In [27]:
import numpy as np
df.replace('?', np.nan, inplace=True) #esto es solo una prueba
df

Unnamed: 0,codigo,credito,edad
0,1,1.0,30.0
1,2,,32.0
2,3,0.0,


In [28]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(df.values)
imp_datos = imp.transform(df.values)
imp_datos

array([[ 1. ,  1. , 30. ],
       [ 2. ,  0.5, 32. ],
       [ 3. ,  0. , 31. ]])

## Manejo de datos categóricos

In [30]:
df = pd.DataFrame([
        ['M', 30, 'Amarillo','Clase 1'],
        ['P', 28 , 'Azul','Clase 2'],
        ['J', 21, 'Rojo', 'Clase 1']],               
)
df.columns = ['nombre','edad','color', 'etiqueta']
df

Unnamed: 0,nombre,edad,color,etiqueta
0,M,30,Amarillo,Clase 1
1,P,28,Azul,Clase 2
2,J,21,Rojo,Clase 1


In [34]:
from sklearn.preprocessing import LabelEncoder
le_clase = LabelEncoder()
y = le_clase.fit_transform(df.etiqueta)
print(y)

[0 1 0]


In [35]:
clase_inv = le_clase.inverse_transform(y)
print(clase_inv)

['Clase 1' 'Clase 2' 'Clase 1']


In [42]:
from sklearn.preprocessing import OneHotEncoder
le_color = LabelEncoder()
ohe_color = OneHotEncoder(categories='auto')
df['color_cod'] = le_color.fit_transform(df.color)
print(df)

  nombre  edad     color etiqueta  color_cod
0      M    30  Amarillo  Clase 1          0
1      P    28      Azul  Clase 2          1
2      J    21      Rojo  Clase 1          2


In [43]:
datos_ohe = ohe_color.fit_transform(df.color_cod.values.reshape(-1,1)).toarray()
dfOneHot = pd.DataFrame(datos_ohe, columns = ["Color_"+str(int(i)) for i in range(len(df.color))]) 
df = pd.concat([df, dfOneHot], axis=1)
print(df)

  nombre  edad     color etiqueta  color_cod  Color_0  Color_1  Color_2
0      M    30  Amarillo  Clase 1          0      1.0      0.0      0.0
1      P    28      Azul  Clase 2          1      0.0      1.0      0.0
2      J    21      Rojo  Clase 1          2      0.0      0.0      1.0


In [44]:
df= df.drop(['color', 'color_cod'], axis=1)

In [47]:
#this example didn't mention it
pd.get_dummies(data=dfAux, columns=["color"], drop_first=True)

KeyError: "None of [Index(['color'], dtype='object')] are in the [columns]"

## Escalamiento de características

In [48]:
def normalizar(columna):
     x_norma = (columna - columna.min()) / (columna.max() - columna.min()) 
     return x_norma

In [55]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
df = pd.read_csv("precios_casas.csv")
esc = MinMaxScaler()
X_ent = esc.fit_transform(df.price.values.reshape(-1,1))
print(X_ent)

[[0.01177134]
 [0.08965777]
 [0.01286198]
 ...
 [0.01567898]
 [0.00764949]
 [0.00829635]]


In [56]:
def estandarizar(columna):
    x_estandar = ( columna - columna.mean())  / columna.std()
    return x_estandar

In [57]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("precios_casas.csv")
est = StandardScaler()
X_ent = est.fit_transform(df.price.values.reshape(-1,1))
print(X_ent)

[[-0.42386353]
 [ 3.2495981 ]
 [-0.37242442]
 ...
 [-0.23956224]
 [-0.61826787]
 [-0.58775916]]
