In [1]:
# Importamos bibliotecas.
import pandas as pd
import numpy as np
import numpy.random as rd
import matplotlib.pyplot as plt

In [2]:
# Build a DataFrame of random data from a dict mapping column name -> values.
N = 1000
df = pd.DataFrame(
    # Columns are drawn in the order A, B, C so the sequence of RNG calls is fixed.
    data=dict(
        A=rd.normal(size=N),                   # normal draws (numpy defaults)
        B=rd.normal(size=N) * 2.5 + 1.5,       # normal draws scaled by 2.5, shifted by 1.5
        C=rd.uniform(low=5, high=32, size=N),  # uniform draws between 5 and 32
    ),
    # Row labels start at 43 rather than the default 0.
    index=range(43, N + 43),
)

# Preview the first rows.
df.head()

Unnamed: 0,A,B,C
43,0.607259,1.716413,28.908384
44,1.293496,2.608391,30.247845
45,-1.248252,3.12523,22.483253
46,-0.019091,0.012885,16.031863
47,0.348725,0.195801,10.299685


In [3]:
# Compute summary statistics for each column of df; as the last expression of
# the cell, the resulting table is what the notebook displays.
df.describe()

Unnamed: 0,A,B,C
count,1000.0,1000.0,1000.0
mean,0.027141,1.437893,18.531332
std,0.979608,2.577179,7.604665
min,-3.102841,-5.92097,5.01894
25%,-0.63105,-0.244894,12.027617
50%,0.024504,1.436835,18.555185
75%,0.722116,3.149602,25.243986
max,2.865679,10.975522,31.979228


In [4]:
# Build the actual dummy DataFrame: two categorical columns plus three numeric ones.

# Levels of the categorical variables.
gender = ['male', 'female']
income = ['low', 'mid', 'high']

# Number of rows in the DataFrame.
N = 1000

# Draw each categorical column with a single vectorized call (uniform over the
# levels) instead of appending one rd.choice() result per row in a Python loop.
# .tolist() keeps both columns as plain Python lists.
gender_data = rd.choice(gender, size=N).tolist()
income_data = rd.choice(income, size=N).tolist()

# Numeric columns: normal draws, scaled and shifted per column.
height = 160 + 30*rd.normal(size=N)
weight = 70 + 15*rd.normal(size=N)
# Ages are rounded to whole numbers; np.round keeps the float dtype.
age = np.round(30 + 5*rd.normal(size=N))

# Assemble the dataset.
df = pd.DataFrame(
    {
        'Gender': gender_data,
        'Income': income_data,
        'Height': height,
        'Weight': weight,
        'Age': age
    }
)

# Preview the first rows.
df.head()

Unnamed: 0,Gender,Income,Height,Weight,Age
0,male,high,125.708634,86.376284,28.0
1,female,low,127.027198,70.207254,30.0
2,male,low,192.551252,63.60987,37.0
3,male,mid,143.963281,73.92693,29.0
4,female,high,183.826497,78.787622,34.0


In [5]:
# Rows can be grouped by the value of a categorical column.
grouped_gender = df.groupby('Gender')

# Iterating the groupby yields (group key, sub-DataFrame) pairs; print the key
# and then its rows, each on its own line.
for names, groups in grouped_gender:
    print(names, groups, sep='\n')

female
     Gender Income      Height     Weight   Age
1    female    low  127.027198  70.207254  30.0
4    female   high  183.826497  78.787622  34.0
8    female    mid  186.663852  76.260748  36.0
10   female    low  127.543484  66.541766  27.0
11   female   high  154.411329  56.474493  39.0
..      ...    ...         ...        ...   ...
994  female    mid  169.190510  60.081403  33.0
995  female    mid  132.762332  63.320124  18.0
997  female    mid  238.521812  68.059332  33.0
998  female   high  187.387855  86.536402  30.0
999  female   high  163.950026  81.084548  32.0

[509 rows x 5 columns]
male
    Gender Income      Height     Weight   Age
0     male   high  125.708634  86.376284  28.0
2     male    low  192.551252  63.609870  37.0
3     male    mid  143.963281  73.926930  29.0
5     male    mid  148.762026  70.051017  32.0
6     male   high  203.841841  80.564916  33.0
..     ...    ...         ...        ...   ...
985   male   high  141.332700  92.484531  30.0
986   male  

In [6]:
# A single group can also be pulled out by its key; here, the rows whose
# Gender is 'female'.
grouped_gender.get_group('female')

Unnamed: 0,Gender,Income,Height,Weight,Age
1,female,low,127.027198,70.207254,30.0
4,female,high,183.826497,78.787622,34.0
8,female,mid,186.663852,76.260748,36.0
10,female,low,127.543484,66.541766,27.0
11,female,high,154.411329,56.474493,39.0
...,...,...,...,...,...
994,female,mid,169.190510,60.081403,33.0
995,female,mid,132.762332,63.320124,18.0
997,female,mid,238.521812,68.059332,33.0
998,female,high,187.387855,86.536402,30.0


In [7]:
# Grouping also works on two keys at once: each (Gender, Income) combination
# becomes its own group.
db_group = df.groupby(['Gender', 'Income'])

In [8]:
# Grouped data supports aggregate operations: print the per-group mean of the
# numeric columns, a blank separator, then the number of rows in each group.
print(db_group.mean(), '\n', db_group.size(), sep='\n')

                   Height     Weight        Age
Gender Income                                  
female high    159.069748  69.138138  29.994318
       low     157.007616  68.344235  29.772152
       mid     164.153749  69.617164  29.965714
male   high    159.868750  70.884941  29.918750
       low     161.719243  69.898752  29.556962
       mid     156.064617  69.898535  29.832370


Gender  Income
female  high      176
        low       158
        mid       175
male    high      160
        low       158
        mid       173
dtype: int64
