Basics of pandas

#### Series

In [21]:
import pandas as pd
from pandas import DataFrame, Series
import numpy as np

In [10]:
series = Series([0, 4, 10, 1.0])
series

0     0.0
1     4.0
2    10.0
3     1.0
dtype: float64

Index on the left - automatic

In [11]:
print(series.index)
print(series.values)

RangeIndex(start=0, stop=4, step=1)
[ 0.  4. 10.  1.]


In [15]:
print(Series(["a", "b", "c"]))
print(Series(["a", 1.0]))

0    a
1    b
2    c
dtype: object
0    a
1    1
dtype: object


In [20]:
series2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(series2)
print(series2['d'])
print()
print(series2**2)

d    4
b    7
a   -5
c    3
dtype: int64
4

d    16
b    49
a    25
c     9
dtype: int64


In [32]:
series3 = Series([1, 2, np.nan])
print(series3)
print()
print(series3.isnull())
print()
print(pd.isnull(series3))
print()
print(pd.notnull(series3))

0    1.0
1    2.0
2    NaN
dtype: float64

0    False
1    False
2     True
dtype: bool

0    False
1    False
2     True
dtype: bool

0     True
1     True
2    False
dtype: bool


In [35]:
series2.name = "col1"
series2.index.name = "names"
series2

names
d    4
b    7
a   -5
c    3
Name: col1, dtype: int64

Auto alignment- operacja series1+series2 automatycznie zrobi joina po indeksie, a nie po kolejności

#### DataFrame

In [38]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],        
        'year': [2000, 2001, 2002, 2001, 2002],        
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}

df = DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [39]:
print(df.state)
print()
print(df["state"])

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object


In [165]:
df.pop # brzydkie - dostajemy metodę pop zamiast kolumny

<bound method NDFrame.pop of     state  year  pop  new_col
0    Ohio  2000  1.5        0
1    Ohio  2001  1.7        1
2    Ohio  2002  3.6        2
3  Nevada  2001  2.4        3
4  Nevada  2002  2.9        4>

In [44]:
df["new_col"] = [i for i in range(5)]
df

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4


In [53]:
list(df.columns) # names(df)

['state', 'year', 'pop', 'new_col']

In [61]:
df[["state", "year"]] # select

Unnamed: 0,state,year
0,Ohio,2000
1,Ohio,2001
2,Ohio,2002
3,Nevada,2001
4,Nevada,2002


In [62]:
df.drop("year", axis=1) # select -col

Unnamed: 0,state,pop,new_col
0,Ohio,1.5,0
1,Ohio,1.7,1
2,Ohio,3.6,2
3,Nevada,2.4,3
4,Nevada,2.9,4


In [67]:
df.query("state == 'Ohio'")

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2


In [69]:
df.query("state == 'Ohio' & year == 2000" )

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0


In [71]:
df.rename(columns=
          {'new_col': 'column_one'})

Unnamed: 0,state,year,pop,column_one
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4


In [79]:
f = lambda x: x.min()-x.max()
df.drop(["state"], axis = 1).apply(f) # Dla każdej kolumny zastosowanie funkcji


year      -2.0
pop       -2.1
new_col   -4.0
dtype: float64

In [86]:
format = lambda x: str(x)+"aaa"
df.applymap(format) # zastosuj dla każdej wartości w komórce

Unnamed: 0,state,year,pop,new_col
0,Ohioaaa,2000aaa,1.5aaa,0aaa
1,Ohioaaa,2001aaa,1.7aaa,1aaa
2,Ohioaaa,2002aaa,3.6aaa,2aaa
3,Nevadaaaa,2001aaa,2.4aaa,3aaa
4,Nevadaaaa,2002aaa,2.9aaa,4aaa


In [88]:
df.sort_index() 

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4


In [90]:
df.sort_values("year")

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
3,Nevada,2001,2.4,3
2,Ohio,2002,3.6,2
4,Nevada,2002,2.9,4


Summarising

In [97]:
print(df.min())
print()
print(df.describe())
print()
print(df.state.describe())

state      Nevada
year         2000
pop           1.5
new_col         0
dtype: object

             year       pop   new_col
count     5.00000  5.000000  5.000000
mean   2001.20000  2.420000  2.000000
std       0.83666  0.864292  1.581139
min    2000.00000  1.500000  0.000000
25%    2001.00000  1.700000  1.000000
50%    2001.00000  2.400000  2.000000
75%    2002.00000  2.900000  3.000000
max    2002.00000  3.600000  4.000000

count        5
unique       2
top       Ohio
freq         3
Name: state, dtype: object


In [100]:
df.cov()
df.corr()

Unnamed: 0,year,pop,new_col
year,1.0,0.89197,0.755929
pop,0.89197,1.0,0.640292
new_col,0.755929,0.640292,1.0


In [107]:
df[[True, True, True, True, False]] # przekazanie maski indeksów

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3


In [111]:
mask = df.state.isin(["Ohio"])
df[mask]

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2


In [113]:
data = {'state': ['Ohio', np.nan, 'Ohio', 'Nevada', 'Nevada'],        
        'year': [2000, 2001, np.nan, 2001, 2002],        
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df2 = DataFrame(data)
df2

Unnamed: 0,state,year,pop
0,Ohio,2000.0,1.5
1,,2001.0,1.7
2,Ohio,,3.6
3,Nevada,2001.0,2.4
4,Nevada,2002.0,2.9


In [115]:
df2.dropna() # usuwa wiersze z nan

Unnamed: 0,state,year,pop
0,Ohio,2000.0,1.5
3,Nevada,2001.0,2.4
4,Nevada,2002.0,2.9


In [118]:
df2.dropna(how='all') # drop tylko jeżeli wszystkie w wierszu są na

Unnamed: 0,state,year,pop
0,Ohio,2000.0,1.5
1,,2001.0,1.7
2,Ohio,,3.6
3,Nevada,2001.0,2.4
4,Nevada,2002.0,2.9


In [121]:
df2.fillna(0) # uzupełnia wszędzie tym samym

Unnamed: 0,state,year,pop
0,Ohio,2000.0,1.5
1,0,2001.0,1.7
2,Ohio,0.0,3.6
3,Nevada,2001.0,2.4
4,Nevada,2002.0,2.9


In [124]:
df2.fillna({"state": "NaN", "year": 0})

Unnamed: 0,state,year,pop
0,Ohio,2000.0,1.5
1,,2001.0,1.7
2,Ohio,0.0,3.6
3,Nevada,2001.0,2.4
4,Nevada,2002.0,2.9


In [125]:
df2.fillna({"state": "NaN"})

Unnamed: 0,state,year,pop
0,Ohio,2000.0,1.5
1,,2001.0,1.7
2,Ohio,,3.6
3,Nevada,2001.0,2.4
4,Nevada,2002.0,2.9


Grouping

In [166]:
grouped = df["pop"].groupby([df["year"], df["state"]]) 
results = grouped.mean()
results

year  state 
2000  Ohio      1.5
2001  Nevada    2.4
      Ohio      1.7
2002  Nevada    2.9
      Ohio      3.6
Name: pop, dtype: float64

In [167]:
results.index

MultiIndex(levels=[[2000, 2001, 2002], ['Nevada', 'Ohio']],
           labels=[[0, 1, 1, 2, 2], [1, 0, 1, 0, 1]],
           names=['year', 'state'])

In [156]:
results.reset_index() #usuniecie hierarchical index, zostaje range

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Nevada,2.4
2,2001,Ohio,1.7
3,2002,Nevada,2.9
4,2002,Ohio,3.6


In [157]:
df

Unnamed: 0,state,year,pop,new_col
0,Ohio,2000,1.5,0
1,Ohio,2001,1.7,1
2,Ohio,2002,3.6,2
3,Nevada,2001,2.4,3
4,Nevada,2002,2.9,4


In [158]:
grouped = df[["pop", "new_col"]].groupby([df["year"], df["state"]]) 
results = grouped.mean()
results

Unnamed: 0_level_0,Unnamed: 1_level_0,pop,new_col
year,state,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,Ohio,1.5,0
2001,Nevada,2.4,3
2001,Ohio,1.7,1
2002,Nevada,2.9,4
2002,Ohio,3.6,2
