Manejo de categorías --- 11:35
===

* 11:35 min | Última modificación: Octubre 14, 2021 | [YouTube](https://youtu.be/zl2Mq7k_lYc)

In [1]:
import numpy as np
import pandas as pd

Categorías inconsistentes
---

In [2]:
%%writefile /tmp/data.csv
personId,eventType
1,AA
2,A
3,AZ
4,AB
5,ZB
6,ZZ
7,BA
8,BB

Overwriting /tmp/data.csv


In [3]:
# Event types accepted as valid; anything else is treated as inconsistent.
valid_eventType = {'AA', 'AB', 'BA', 'BB'}

df = pd.read_csv('/tmp/data.csv')

#
# Inconsistent categories: values present in the column
# that are not part of the valid set
#
set(df['eventType']) - valid_eventType

{'A', 'AZ', 'ZB', 'ZZ'}

In [4]:
#
# Records whose category is not in the valid set
#
df.loc[~df['eventType'].isin(valid_eventType)]

Unnamed: 0,personId,eventType
1,2,A
2,3,AZ
4,5,ZB
5,6,ZZ


Posibles soluciones:

* Borrado del registro.

* Reemplazo de las categorías inconsistentes.

* Inferencia de la categoría a partir de otros campos.

In [5]:
#
# Drop the records with inconsistent categories,
# keeping only rows whose eventType is valid
#
df = df.loc[df['eventType'].isin(valid_eventType)]
df

Unnamed: 0,personId,eventType
0,1,AA
3,4,AB
6,7,BA
7,8,BB


Consistencia de valores en variables categóricas
---

In [6]:
%%writefile /tmp/data.csv
personId,status
1,divorced_male
2,single_male
3,SINGLE_MALE
4,single_female
5,divorced_female
6,MARRIED_MALE
7,single female
8,single_male
9,divorced male

Overwriting /tmp/data.csv


In [7]:
df = pd.read_csv('/tmp/data.csv')

#
# Look for inconsistent values by counting how often
# each distinct value occurs. This does not work well
# for columns with many different values.
#
df['status'].value_counts()

single_male        2
single_female      1
SINGLE_MALE        1
divorced_male      1
MARRIED_MALE       1
single female      1
divorced male      1
divorced_female    1
Name: status, dtype: int64

In [8]:
# Copy the status values into a standalone single-column frame
status_df = pd.DataFrame({'status': df['status'].tolist()})
status_df

Unnamed: 0,status
0,divorced_male
1,single_male
2,SINGLE_MALE
3,single_female
4,divorced_female
5,MARRIED_MALE
6,single female
7,single_male
8,divorced male


In [9]:
# Keep only the first occurrence of each distinct row
status_df = status_df.loc[~status_df.duplicated()]
status_df

Unnamed: 0,status
0,divorced_male
1,single_male
2,SINGLE_MALE
3,single_female
4,divorced_female
5,MARRIED_MALE
6,single female
8,divorced male


In [10]:
# Add a lower-cased copy of each status as a comparison key
status_df = status_df.assign(key=lambda frame: frame['status'].str.lower())
status_df

Unnamed: 0,status,key
0,divorced_male,divorced_male
1,single_male,single_male
2,SINGLE_MALE,single_male
3,single_female,single_female
4,divorced_female,divorced_female
5,MARRIED_MALE,married_male
6,single female,single female
8,divorced male,divorced male


In [11]:
# For each key, list the original spellings that map to it
status_df.groupby('key').aggregate(list)

Unnamed: 0_level_0,status
key,Unnamed: 1_level_1
divorced male,[divorced male]
divorced_female,[divorced_female]
divorced_male,[divorced_male]
married_male,[MARRIED_MALE]
single female,[single female]
single_female,[single_female]
single_male,"[single_male, SINGLE_MALE]"


In [12]:
#
# Fix by replacement: spaces become underscores,
# then every value is lower-cased
#
df['status'] = df['status'].str.replace(' ', '_').str.lower()
df['status'].value_counts()

single_male        3
single_female      2
divorced_male      2
divorced_female    1
married_male       1
Name: status, dtype: int64

Reemplazo de valores numéricos por categorías
--

In [13]:
%%writefile /tmp/data.csv
salary
100
90
86
25
53
48
23
10
45
94
32
67
25
12
99

Overwriting /tmp/data.csv


In [14]:
#
# Bin edges and the label assigned to each bin.
#
ranges = [0, 20, 40, 60, 80, np.inf]
names = ['0-19', '20-39', '40-59', '60-79', '80+']

df = pd.read_csv('/tmp/data.csv')

#
# Create the column.
#
# right=False makes every bin closed on the left and open on
# the right: [0, 20), [20, 40), ..., [80, inf). That is what the
# labels describe. With the default right=True a salary of 20
# would be labelled '0-19' and a salary of 0 would become NaN.
#
df['group'] = pd.cut(
    df.salary,
    bins=ranges,
    labels=names,
    right=False,
)
df

Unnamed: 0,salary,group
0,100,80+
1,90,80+
2,86,80+
3,25,20-39
4,53,40-59
5,48,40-59
6,23,20-39
7,10,0-19
8,45,40-59
9,94,80+


Reemplazo de strings por categorías
---

In [15]:
%%writefile /tmp/data.csv
vehicle
trailblazer
spark
bolt ev
trax
equinox
blazer
beat
joy
trailblazer
spark
bolt ev
traverse
bolt euv
tahoe
malibu
suburban
onix
trax
bolt ev
trax
equinox
blazer
equinox
blazer
beat
joy
traverse

Overwriting /tmp/data.csv


In [16]:
df = pd.read_csv("/tmp/data.csv")

#
# Categories: vehicle model -> vehicle category
#
mapping = {
    "beat": "Cars",
    "blazer": "SUVs",
    "bolt euv": "Electric",
    "bolt ev": "Electric",
    "equinox": "SUVs",
    "joy": "Cars",
    "malibu": "Cars",
    "onix": "Cars",
    "spark": "Cars",
    "suburban": "SUVs",
    "tahoe": "SUVs",
    "trailblazer": "SUVs",
    "traverse": "SUVs",
    "trax": "SUVs",
}

# Series.replace leaves any value that is not a key of `mapping`
# unchanged, so every model in the data needs an entry here.
df['category'] = df.vehicle.replace(mapping)
df

Unnamed: 0,vehicle,category
0,trailblazer,SUVs
1,spark,Cars
2,bolt ev,Electric
3,trax,SUVs
4,equinox,SUVs
5,blazer,SUVs
6,beat,Cars
7,joy,Cars
8,trailblazer,SUVs
9,spark,Cars
