## Seleção condicional de dados

In [29]:
# importando bibliotecas
import pandas as pd
import numpy as np

# importando sub bibliotecas
from numpy import random as rd

In [30]:
# Fix the seed so the same pseudo-random numbers are generated on every run.
rd.seed(seed=101)

In [31]:
# 5x4 DataFrame holding 20 draws from a normal (Gaussian) distribution:
# rows are labelled A-E and columns are labelled W-Z.
row_labels = 'A B C D E'.split()
column_labels = 'W X Y Z'.split()
df = pd.DataFrame(rd.randn(5, 4), index=row_labels, columns=column_labels)

In [32]:
# Display the frame: rows labelled A-E, columns labelled W-Z.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [33]:
# Boolean DataFrame (True where a value is strictly positive), used as a
# mask to filter the numeric DataFrame.
bol = df.gt(0)

In [34]:
# Display the boolean mask; True marks the cells of df that are greater than 0.
bol

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [35]:
# Apply the mask: positions where bol is True keep their value,
# every other position becomes NaN.
df.where(bol)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [36]:
# Keep only the rows whose value in column 'W' is positive.
df.loc[df['W'].gt(0)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [37]:
# Combine the previous row condition with a column selection in a single
# .loc call (row mask first, column label second) instead of chaining two
# [] lookups, which would build an intermediate filtered frame first.
df.loc[df['W'] > 0, 'Y']

A    0.907969
B   -0.848077
D   -0.933237
E    2.605967
Name: Y, dtype: float64

In [38]:
# Same row filter on 'W', selecting several columns at once by passing a
# list of labels; a single .loc call avoids chained [] indexing.
df.loc[df['W'] > 0, ['Y', 'Z']]

Unnamed: 0,Y,Z
A,0.907969,0.503826
B,-0.848077,0.605965
D,-0.933237,0.955057
E,2.605967,0.683509


In [39]:
# To build logical expressions from Series values, `and` and `or` are not
# used, because those operators only work for comparisons between scalars.
# For comparisons between iterables, use & and | (pipe) instead.

df.loc[df['W'].gt(0) & df['Y'].gt(1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


### Redefinindo os índices de um DataFrame

Para redefinir os índices de um Dataframe para valores numéricos
padrão, utiliza-se o método reset_index:

In [40]:
# Returns a new frame with a 0-based numeric index; drop=False (the default)
# keeps the old labels as a regular column named 'index'.
df.reset_index(drop=False)

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


O método reset_index redefine os índices do dataframe para valores numéricos iniciados em 0 e transforma os índices antigos em uma nova coluna do DataFrame.

Seu efeito aparece apenas no valor retornado pelo método; ou seja, para
que a redefinição de índice ocorra no dataframe original, deve-se
reatribuir o resultado ao dataframe ou utilizar o parâmetro
```inplace = True```

In [41]:
# Even after calling reset_index, the index of df has not changed.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [42]:
# Keep a copy of the original frame (labelled index) for the later examples.
aux = df.copy(deep=True)

# Rebind df to the frame returned by reset_index so the new index persists.
df = df.reset_index(drop=False)

In [44]:
# The original DataFrame now has its index changed for good.
df

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [53]:
# Restore df from the backup copy so it has the labelled index again.
df = aux.copy(deep=True)

In [54]:
# Display df to confirm it is back to the labelled A-E index.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [55]:
# Using the inplace parameter: the index is reset directly on the original
# DataFrame, so no reassignment is needed.

df.reset_index(drop=False, inplace=True)

In [56]:
df

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


#### Alterando o índice do Dataframe para uma coluna já existente

In [57]:
# Values for the new column: one label per whitespace-separated token.
col = 'RS RJ SP AM SC'.split(sep=None)

In [58]:
# Display the list of labels.
col

['RS', 'RJ', 'SP', 'AM', 'SC']

In [59]:
# Add the list to the DataFrame as a new column named 'Estado'.

df['Estado'] = col

In [60]:
# Display the list again.
# NOTE(review): presumably this cell was meant to display df, to confirm the
# new 'Estado' column was added — TODO confirm.
col

['RS', 'RJ', 'SP', 'AM', 'SC']

In [61]:
# set_index returns a frame whose index is taken from the 'Estado' column.
df.set_index(keys='Estado')

Unnamed: 0_level_0,index,W,X,Y,Z
Estado,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RS,A,2.70685,0.628133,0.907969,0.503826
RJ,B,0.651118,-0.319318,-0.848077,0.605965
SP,C,-2.018168,0.740122,0.528813,-0.589001
AM,D,0.188695,-0.758872,-0.933237,0.955057
SC,E,0.190794,1.978757,2.605967,0.683509


In [63]:
# As with reset_index, set_index does not apply the change to the original
# DataFrame; for that, the parameter inplace=True is required.
df

Unnamed: 0,index,W,X,Y,Z,Estado
0,A,2.70685,0.628133,0.907969,0.503826,RS
1,B,0.651118,-0.319318,-0.848077,0.605965,RJ
2,C,-2.018168,0.740122,0.528813,-0.589001,SP
3,D,0.188695,-0.758872,-0.933237,0.955057,AM
4,E,0.190794,1.978757,2.605967,0.683509,SC


In [64]:
# With inplace=True, 'Estado' becomes the index of df itself.
df.set_index(keys='Estado', inplace=True)

In [65]:
# Display df: it is now indexed by 'Estado'.
df

Unnamed: 0_level_0,index,W,X,Y,Z
Estado,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RS,A,2.70685,0.628133,0.907969,0.503826
RJ,B,0.651118,-0.319318,-0.848077,0.605965
SP,C,-2.018168,0.740122,0.528813,-0.589001
AM,D,0.188695,-0.758872,-0.933237,0.955057
SC,E,0.190794,1.978757,2.605967,0.683509
