# Pandas

## Estrutura de Dados

### Series

In [1]:
# All third-party imports used by the notebook, gathered in the first cell so
# that a fresh "Restart & Run All" never depends on a later cell's import.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# A Series built without an explicit index gets the default integer
# labels 0..n-1.
obj = Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [2]:
print(obj.values)
print(obj.index) #obj.index.values

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [4]:
print(obj.index.values)

[0 1 2 3]


In [5]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

Diferentemente de um array NumPy, em uma Series você pode usar os rótulos do índice para selecionar um único valor ou um conjunto de valores:

In [6]:
obj2["a"]

np.int64(-5)

In [7]:
obj2["d"] = 6 #atribuição

In [8]:
obj2[["c", "a", "d"]] #uma lista de índices

c    3
a   -5
d    6
dtype: int64

In [9]:
import numpy as np
print(obj2[obj2 > 0])  # boolean mask: keeps only the positive values
print()
print(obj2 * 2)  # vectorized (element-wise) operation
print()
print(np.exp(obj2))  # NumPy functions apply element-wise and keep the index labels

d    6
b    7
c    3
dtype: int64

d    12
b    14
a   -10
c     6
dtype: int64

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [10]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

Podemos converter a série num dicionário novamente usando o método `to_dict`:

In [11]:
obj3.to_dict()

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [12]:
states = ['Oregon', 'Texas','California', 'Ohio']
obj4 = Series(sdata, index=states)
obj4

Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
dtype: float64

In [13]:
print(pd.isna(obj4))  # True where the value is missing (pd.isnull is an alias)
print(pd.notna(obj4))  # True where the value is present (pd.notnull is an alias)

Oregon        False
Texas         False
California     True
Ohio          False
dtype: bool
Oregon         True
Texas          True
California    False
Ohio           True
dtype: bool


In [14]:
print(obj3)
print()
print(obj4)
print()
obj3 + obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
dtype: float64



California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [15]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
Name: population, dtype: float64

In [16]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrame

In [17]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


Para DataFrames grandes, podemos usar os métodos `head()` e `tail()` para visualizar apenas uma parte dos dados:

In [18]:
df.head() #mostra as 5 primeiras linhas

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [19]:
df.tail(2) #mostra as 2 últimas linhas

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [20]:
df2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [21]:
df2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [22]:
print(df['state'])
print()
print(df.year)

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64


In [23]:
print(df2.loc['four'])  # .loc selects a row by its index label
print()
print(df.iloc[0])  # .iloc selects a row by its integer position

year       2001
state    Nevada
pop         2.4
debt        NaN
Name: four, dtype: object

state    Ohio
year     2000
pop       1.5
Name: 0, dtype: object


In [25]:
# 4x4 frame holding the integers 0..15 with labelled rows and columns; it is
# the frame used by the selection examples in the cells below.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data  # display the whole frame; the row selection is shown in the next cell

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
data.loc['Colorado'] #seleciona a linha cujo índice é Colorado

one      4
two      5
three    6
four     7
Name: Colorado, dtype: int64

In [27]:
data.loc[["Colorado", "New York"]] #seleciona as linhas Colorado e New York

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15


In [29]:
data.loc[["Colorado", "New York"], ["two", "three"]]  # rows Colorado and New York, columns two and three

Unnamed: 0,two,three
Colorado,5,6
New York,13,14


In [30]:
data.iloc[2]#linha 2

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [31]:
data.iloc[[2, 1]] #linhas 2 e 1, nessa ordem

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Colorado,4,5,6,7


In [33]:
data.iloc[2, [3, 0, 1]] #linha 2, colunas 3, 0 e 1, nessa ordem

four    11
one      8
two      9
Name: Utah, dtype: int64

In [34]:
data.iloc[[1, 2], [3, 0, 1]] #linhas 1 e 2, colunas 3,0 e 1, nessa ordem

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [35]:
# Rows whose value in column "three" is greater than 5, restricted to the first
# three columns. One .loc call with a row mask and a column selection replaces
# the chained iloc[...][mask] indexing.
data.loc[data["three"] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


In [36]:
data.loc[data.three >= 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
df2['debt'] = np.arange(len(df2))
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4


In [38]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df2['debt'] = val
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [39]:
df2['eastern'] = df2.state == 'Ohio'
df2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [40]:
del df2['eastern']
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [41]:
df2.values  # df2.to_numpy() is the equivalent method that the pandas docs recommend

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

In [42]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj.index.values

array(['a', 'b', 'c'], dtype=object)

In [43]:
print('state' in df2.columns)
print(0 in df.index)

True
True


## Eliminando entradas de um dos eixos

In [44]:
import numpy as np
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [45]:
new_obj = obj.drop("c")
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [46]:
obj.drop(["d", "c"])

a    0.0
b    1.0
e    4.0
dtype: float64

In [47]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [48]:
data.drop(index=["Colorado", "Ohio"]) #linhas

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [49]:
data.drop(columns=["two"]) #colunas

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [50]:
data.drop("two", axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [51]:
data.drop(["two", "four"], axis="columns")

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


## Aplicação de Função e Mapeamento

In [52]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [53]:
list('abc')

['a', 'b', 'c']

In [54]:
# 4x3 frame of samples from np.random.randn; no seed is set in this cell, so
# the values (and every output derived from them) change on each run.
df = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df)
print()
np.abs(df)  # element-wise absolute value

               b         d         e
Utah    1.407451 -2.825913 -0.046003
Ohio   -1.326172  0.961265 -0.171430
Texas  -0.053057  1.337334  2.237124
Oregon -0.803167  1.807554 -1.303199



Unnamed: 0,b,d,e
Utah,1.407451,2.825913,0.046003
Ohio,1.326172,0.961265,0.17143
Texas,0.053057,1.337334,2.237124
Oregon,0.803167,1.807554,1.303199


In [55]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    highest, lowest = x.max(), x.min()
    return highest - lowest
print(df.apply(f))
print()
print(df.apply(f, axis=1))

b    2.733623
d    4.633467
e    3.540324
dtype: float64

Utah      4.233364
Ohio      2.287437
Texas     2.290182
Oregon    3.110754
dtype: float64


In [56]:
def f2(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    labels = ['min', 'max']
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=labels)

df.apply(f2)

Unnamed: 0,b,d,e
min,-1.326172,-2.825913,-1.303199
max,1.407451,1.807554,2.237124


In [57]:
df

Unnamed: 0,b,d,e
Utah,1.407451,-2.825913,-0.046003
Ohio,-1.326172,0.961265,-0.17143
Texas,-0.053057,1.337334,2.237124
Oregon,-0.803167,1.807554,-1.303199


In [58]:
def format2(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x

# DataFrame.applymap is deprecated since pandas 2.1 and emits a FutureWarning;
# DataFrame.map is its element-wise replacement (requires pandas >= 2.1).
df.map(format2)

  df.applymap(format2)


Unnamed: 0,b,d,e
Utah,1.41,-2.83,-0.05
Ohio,-1.33,0.96,-0.17
Texas,-0.05,1.34,2.24
Oregon,-0.8,1.81,-1.3


## Ordenação e Ranking

In [59]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
df2 = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(obj)
print()
print(df2)
print()
print(obj.sort_index())
print()
print(df2.sort_index())
print()
print(df2.sort_index(axis=1))

d    0
a    1
b    2
c    3
dtype: int64

       d  a  b  c
three  0  1  2  3
one    4  5  6  7

a    1
b    2
c    3
d    0
dtype: int64

       d  a  b  c
one    4  5  6  7
three  0  1  2  3

       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [60]:
obj = Series([4, 7, -3, 2])
obj.sort_values(ascending=False)  # DataFrame.sort_values takes the same flag but also requires a `by` column

1    7
0    4
3    2
2   -3
dtype: int64

In [61]:
# Table of grades; the 'rank' column holds each grade's position counted from
# the highest grade down.
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
        'nota': [8, 7, 7.5, 10, 8]}
df4 = DataFrame(data)
print(df4)
print()
# `ascending` is a boolean flag, so pass False rather than 0. Tied grades share
# the average of their positions (the two 8.0 grades both get 2.5).
df4['rank'] = df4['nota'].rank(ascending=False)
df4.sort_values('rank')

    name  nota
0  Jason   8.0
1  Molly   7.0
2   Tina   7.5
3   Jake  10.0
4    Amy   8.0



Unnamed: 0,name,nota,rank
3,Jake,10.0,1.0
0,Jason,8.0,2.5
4,Amy,8.0,2.5
2,Tina,7.5,4.0
1,Molly,7.0,5.0


## Sumarização e Estatística Descritiva

In [62]:
df5 = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df5)
print()
print(df5.sum())
print()
print(df5.sum(axis=1))
print()
print(df5.count())

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

one    9.25
two   -5.80
dtype: float64

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

one    3
two    2
dtype: int64


In [63]:
df5.size

8

In [64]:
df5.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [65]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [66]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [67]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [68]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

## Manipulação de Valores Faltantes 

In [69]:
string_data = Series(['laranja', 'uva', np.nan, 'abacate'])
print(string_data)
print()
print(string_data.isnull())
string_data[0] = None  # None is also reported as missing by isnull()
print()
print(string_data.isnull())

0    laranja
1        uva
2        NaN
3    abacate
dtype: object

0    False
1    False
2     True
3    False
dtype: bool

0     True
1    False
2     True
3    False
dtype: bool


In [70]:
data = DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
print(data)
cleaned = data.dropna()  # default: drops every row that contains at least one NaN
print('\n',cleaned)
print()
data.dropna(how='all')  # drops only the rows in which every value is NaN

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0

      0    1    2
0  1.0  6.5  3.0



Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [71]:
print(data.fillna(0))  # replace every missing value with the constant 0
print()
print(data.fillna(data.mean()))  # replace missing values with the mean of their own column

     0    1    2
0  1.0  6.5  3.0
1  1.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  6.5  3.0

     0    1    2
0  1.0  6.5  3.0
1  1.0  6.5  3.0
2  1.0  6.5  3.0
3  1.0  6.5  3.0


## TODO Section

### Manipulação de DataFrame

        > Crie, a partir do dicionário abaixo, um DataFrame cujo índice sejam os valores da variável labels
        > Encontre a média dos valores da coluna age e preencha os valores faltantes dessa coluna com o valor da média
        > Crie uma nova coluna chamada 'rank' que classifique os animais pelo número de visitas recebidas (mais visitas primeiro)
        > Qual animal recebeu a maior quantidade de visitas? Use o método max()

In [72]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [73]:
# Answer: build the frame from `data`, using `labels` as the row index.
df2 = DataFrame(index=labels, data=data)
df2

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [77]:
df2.age.mean()

np.float64(3.4375)

In [79]:
# Replace the missing ages with the mean of the known ages.
df2["age"] = df2["age"].fillna(df2["age"].mean())
df2

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,3.4375,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,3.4375,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [81]:
# Rank the rows by number of visits, most-visited first. `ascending` is a
# boolean flag, so pass False rather than 0.
df2['rank'] = df2['visits'].rank(ascending=False)
df2

Unnamed: 0,animal,age,visits,priority,rank
a,cat,2.5,1,yes,8.5
b,cat,3.0,3,yes,2.0
c,snake,0.5,2,no,5.0
d,dog,3.4375,3,yes,2.0
e,dog,5.0,2,no,5.0
f,cat,2.0,3,no,2.0
g,snake,4.5,1,no,8.5
h,cat,3.4375,1,yes,8.5
i,dog,7.0,2,no,5.0
j,dog,3.0,1,no,8.5


In [87]:
# Highest visit count, computed with the Series.max() method the exercise asks
# for instead of the Python builtin max(), which iterates the Series in Python.
df2["visits"].max()

3

In [89]:
# Rows whose visit count equals the column maximum (all ties are kept).
# No value can exceed the maximum, so equality states the intent directly.
df2[df2["visits"] == df2["visits"].max()]

Unnamed: 0,animal,age,visits,priority,rank
b,cat,3.0,3,yes,2.0
d,dog,3.4375,3,yes,2.0
f,cat,2.0,3,no,2.0


## Carregamento e Armazenamento de Dados

### Arquivo CSV

In [None]:
import pandas as pd
poke = pd.read_csv('bases/Pokemon.csv')
poke.head(n=10)

## Combinação de Dados

In [None]:
import pandas as pd

In [None]:
# Two frames that share the column 'key'.
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df2 = pd.DataFrame({'key': ['a', 'b', 'd','b'],
                 'data2': range(4)})

# No `on=` is given, so merge joins on the column the two frames have in
# common ('key'); `how` defaults to 'inner', keeping only keys found in both.
pd.merge(df1,df2)

In [None]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

In [None]:
df3

In [None]:
df4

In [None]:
pd.merge(df3, df4, how='outer',left_on='lkey',right_on='rkey')

In [None]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

In [None]:
import numpy as np
arr = np.arange(12).reshape((3, 4))
print(arr)
print()
np.concatenate([arr, arr], axis=1)

In [None]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pd.concat([s1, s2, s3])

In [None]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), 
                index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), 
                index=['a', 'c'],
                columns=['three', 'four'])
print(df1)
print()
print(df2)
print()
pd.concat([df1, df2], axis=1)

## TODO Section

### Manipulação de Dados usando Pandas

Usando o dataset Pokemon.csv, faça:

    1) Verifique em qual(is) coluna(s) existem valores faltantes
    2) Preencha os valores faltantes da coluna Type 2 com os valores correspondentes da coluna Type 1
    3) Crie um DataFrame a partir dos dados originais contendo apenas pokemons lendários. Imprima os 5 primeiros
    4) Use apply/map (applymap foi descontinuado no pandas 2.1 em favor de DataFrame.map) para passar todos os valores das colunas Name, Type 1 e Type 2 para minúsculas

In [None]:
poke = pd.read_csv('bases/Pokemon.csv')
poke.head(n=10)

In [None]:
# Resposta 1

In [None]:
# Resposta 2

In [None]:
# Resposta 3

In [None]:
# Resposta 4