In [1]:
import pandas as pd
import numpy as np

**Creation**

In [2]:
# method 1: from a list
ser1 = pd.Series(data=[5, -7, 0])
ser1

0    5
1   -7
2    0
dtype: int64

In [3]:
# method 2: from a dict
mdata = {'Ohio': 35000, 'Texas': 70000, 'Utah': 5000}
ser2 = pd.Series(data=mdata)
ser2

Ohio     35000
Texas    70000
Utah      5000
dtype: int64

In [4]:
states = ['Califonia', 'Ohio', 'Texas', 'Utah']
ser3 = pd.Series(mdata, index=states)
ser3

Califonia        NaN
Ohio         35000.0
Texas        70000.0
Utah          5000.0
dtype: float64

**Indexing & selection**

In [5]:
%%time
ser2.loc['Ohio']

CPU times: user 1.69 ms, sys: 639 µs, total: 2.33 ms
Wall time: 11.3 ms


35000

In [6]:
%%time
ser2.iloc[0]

CPU times: user 314 µs, sys: 108 µs, total: 422 µs
Wall time: 2.29 ms


35000

In [7]:
%%time
ser2['Ohio']

CPU times: user 108 µs, sys: 10 µs, total: 118 µs
Wall time: 125 µs


35000

In [8]:
%%time
ser2[0]

CPU times: user 206 µs, sys: 217 µs, total: 423 µs
Wall time: 2.4 ms


35000

In [9]:
%%time
ser2[[0]]

CPU times: user 1e+03 µs, sys: 206 µs, total: 1.21 ms
Wall time: 3.01 ms


Ohio    35000
dtype: int64

In [10]:
%%time
ser2[['Ohio']]

CPU times: user 2.71 ms, sys: 1.31 ms, total: 4.02 ms
Wall time: 8.53 ms


Ohio    35000
dtype: int64

In [11]:
%%time
ser2[:2]

CPU times: user 971 µs, sys: 20 µs, total: 991 µs
Wall time: 1.02 ms


Ohio     35000
Texas    70000
dtype: int64

In [12]:
%%time
ser2['Ohio':'Texas']

CPU times: user 2.87 ms, sys: 1.1 ms, total: 3.97 ms
Wall time: 7.48 ms


Ohio     35000
Texas    70000
dtype: int64

**reindexing**

In [13]:
ser1

0    5
1   -7
2    0
dtype: int64

In [14]:
ser1.reindex([2, 0, 1])

2    0
0    5
1   -7
dtype: int64

**hierarchical indexing**

In [15]:
data = pd.Series(range(6), index=[['a', 'a', 'a', 'b', 'b', 'c'], [1, 2, 3, 1, 2, 3]])

In [16]:
data

a  1    0
   2    1
   3    2
b  1    3
   2    4
c  3    5
dtype: int64

In [17]:
data['b']

1    3
2    4
dtype: int64

In [18]:
data[[3]]

b  1    3
dtype: int64

In [19]:
data['b':'c']

b  1    3
   2    4
c  3    5
dtype: int64

In [20]:
data.loc[['a', 'c']]

a  1    0
   2    1
   3    2
c  3    5
dtype: int64

In [21]:
data.loc[:, 2]

a    1
b    4
dtype: int64

_`unstack()`_

In [22]:
data.unstack()

Unnamed: 0,1,2,3
a,0.0,1.0,2.0
b,3.0,4.0,
c,,,5.0


In [23]:
data.unstack().stack()

a  1    0.0
   2    1.0
   3    2.0
b  1    3.0
   2    4.0
c  3    5.0
dtype: float64

**arithmetic**

In [24]:
%%time
ser1[[0]] * 2

CPU times: user 4.71 ms, sys: 2.38 ms, total: 7.08 ms
Wall time: 7.41 ms


0    10
dtype: int64

In [25]:
%%time
ser1 * 2

CPU times: user 734 µs, sys: 1 µs, total: 735 µs
Wall time: 750 µs


0    10
1   -14
2     0
dtype: int64

In [26]:
%%time
ser2 + ser3

CPU times: user 2.17 ms, sys: 947 µs, total: 3.12 ms
Wall time: 11.3 ms


Califonia         NaN
Ohio          70000.0
Texas        140000.0
Utah          10000.0
dtype: float64

**arithmetic for different series**

In [27]:
s1 = pd.Series([3, -1, 4], index=['a', 'b', 'd'])
s2 = pd.Series([2, -5, 0], index=['b', 'c', 'd'])

In [28]:
s1 + s2

a    NaN
b    1.0
c    NaN
d    4.0
dtype: float64

**`in`** if the value is one of the series' index

In [29]:
%%time
1 in ser1

CPU times: user 25 µs, sys: 1 µs, total: 26 µs
Wall time: 29.8 µs


True

In [30]:
%%time
-7 in ser1

CPU times: user 33 µs, sys: 1 µs, total: 34 µs
Wall time: 41 µs


False

**Check `null`**

In [31]:
%%time
pd.isnull(ser3)

CPU times: user 351 µs, sys: 8 µs, total: 359 µs
Wall time: 380 µs


Califonia     True
Ohio         False
Texas        False
Utah         False
dtype: bool

**drop**

In [32]:
%%time
new_ser1 = ser1.drop(1)
new_ser1

CPU times: user 1.67 ms, sys: 230 µs, total: 1.9 ms
Wall time: 3.72 ms


0    5
2    0
dtype: int64

**sorting**

In [33]:
ser1

0    5
1   -7
2    0
dtype: int64

In [34]:
%%time
ser1.sort_values()

CPU times: user 2.75 ms, sys: 1.83 ms, total: 4.58 ms
Wall time: 5.12 ms


1   -7
2    0
0    5
dtype: int64

In [35]:
%%time
ser1.sort_index()

CPU times: user 300 µs, sys: 13 µs, total: 313 µs
Wall time: 321 µs


0    5
1   -7
2    0
dtype: int64

**ranking**

In [36]:
obj = pd.Series([2, 2, 4, 6, 1, 3])

In [37]:
obj

0    2
1    2
2    4
3    6
4    1
5    3
dtype: int64

In [38]:
# by default, rank breaks ties by assigning each group the mean rank
obj.rank()

0    2.5
1    2.5
2    5.0
3    6.0
4    1.0
5    4.0
dtype: float64

In [39]:
# assign according to the order in which they are observed in the data
obj.rank(method='first')

0    2.0
1    3.0
2    5.0
3    6.0
4    1.0
5    4.0
dtype: float64

In [40]:
# assign tie values the maximum rank in the group
obj.rank(method='max')

0    3.0
1    3.0
2    5.0
3    6.0
4    1.0
5    4.0
dtype: float64

**`is_unique`**

In [41]:
obj

0    2
1    2
2    4
3    6
4    1
5    3
dtype: int64

In [42]:
obj.index.is_unique

True

In [43]:
obj2 = pd.Series(range(5), index=['a', 'b', 'c', 'b', 'a'])
obj2

a    0
b    1
c    2
b    3
a    4
dtype: int64

In [44]:
obj2.index.is_unique

False

**`.isin()`**

In [45]:
obj

0    2
1    2
2    4
3    6
4    1
5    3
dtype: int64

In [46]:
obj.isin([1, 2])

0     True
1     True
2    False
3    False
4     True
5    False
dtype: bool

**computing descriptive statistics**

In [47]:
obj.sum()

18

In [48]:
obj.cumsum()

0     2
1     4
2     8
3    14
4    15
5    18
dtype: int64

In [49]:
obj.mean()

3.0

In [50]:
obj.idxmax()

3

In [51]:
obj.idxmin()

4

In [52]:
obj.describe()

count    6.000000
mean     3.000000
std      1.788854
min      1.000000
25%      2.000000
50%      2.500000
75%      3.750000
max      6.000000
dtype: float64

**unique values**

In [53]:
obj.unique()

array([2, 4, 6, 1, 3])

In [54]:
obj.nunique()

5

**`value_counts()`**

In [55]:
obj.value_counts()

2    2
6    1
4    1
3    1
1    1
dtype: int64

**`isnull()`**

In [56]:
obj = pd.Series(['a', 'b', 'c', np.nan,'d'])

In [57]:
obj

0      a
1      b
2      c
3    NaN
4      d
dtype: object

In [58]:
obj.isnull()

0    False
1    False
2    False
3     True
4    False
dtype: bool

**filtering out missing data**

In [59]:
obj.dropna()

0    a
1    b
2    c
4    d
dtype: object

In [60]:
obj[obj.notnull()]

0    a
1    b
2    c
4    d
dtype: object

**filling in missing data**

In [61]:
obj.fillna(method='ffill')

0    a
1    b
2    c
3    c
4    d
dtype: object

In [62]:
obj.fillna('m')

0    a
1    b
2    c
3    m
4    d
dtype: object

**removing duplicates**

In [63]:
obj = pd.Series([2, 2, 4, 6, 1, 3])

In [64]:
obj

0    2
1    2
2    4
3    6
4    1
5    3
dtype: int64

In [65]:
obj.duplicated()

0    False
1     True
2    False
3    False
4    False
5    False
dtype: bool

In [66]:
# keep first
obj.drop_duplicates()

0    2
2    4
3    6
4    1
5    3
dtype: int64

In [67]:
# keep last
obj.drop_duplicates(keep='last')

1    2
2    4
3    6
4    1
5    3
dtype: int64

**`map`**

In [68]:
digit_word_map = {
    1:'one',
    2:'two',
    3:'three',
    4:'four',
    5:'five',
    6:'six'
}

In [69]:
obj.map(digit_word_map)

0      two
1      two
2     four
3      six
4      one
5    three
dtype: object

**`replace`**

In [70]:
obj.replace(1, 7)

0    2
1    2
2    4
3    6
4    7
5    3
dtype: int64

In [71]:
obj.replace({4:5, 1:7})

0    2
1    2
2    5
3    6
4    7
5    3
dtype: int64

In [72]:
obj.replace([4, 1], [5, 7])

0    2
1    2
2    5
3    6
4    7
5    3
dtype: int64

**`concat`**

In [73]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [74]:
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [75]:
pd.concat([s1, s2, s3], axis='columns')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


**`numpy.where()`**

In [76]:
s1 = pd.Series([np.nan, 2, 1, 3, np.nan], index=['a', 'b', 'd', 'c', 'e'])
s2 = pd.Series([1, 3, np.nan, 2, np.nan], index=['f', 'd', 'a', 'c', 'b'])

In [77]:
s1

a    NaN
b    2.0
d    1.0
c    3.0
e    NaN
dtype: float64

In [78]:
s2

f    1.0
d    3.0
a    NaN
c    2.0
b    NaN
dtype: float64

In [79]:
np.where(pd.isnull(s1), s2, s1)

array([ 1.,  2.,  1.,  3., nan])