In [2]:
%matplotlib inline

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

In [3]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [5]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])

In [6]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [7]:
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64

In [8]:
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

In [9]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [10]:
# method='ffill' forward-fills: each new label takes the value of the last existing label before it
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [11]:
frame = DataFrame(
    np.arange(9).reshape(3, 3), index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California']
)

In [13]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [14]:
# Re-index row(index)
frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [15]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [16]:
# Re-index columns
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [22]:
# Re-index row and columns at the same time
frame.reindex(
    index=['a', 'b', 'c', 'd'], columns=states
)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [24]:
# Drop the specific axis item
obj = Series(np.arange(5), index=['a', 'b', 'c', 'd', 'e'])

In [25]:
obj

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [26]:
new_obj = obj.drop('c')

In [27]:
new_obj

a    0
b    1
d    3
e    4
dtype: int64

In [28]:
obj.drop(['d', 'c'])

a    0
b    1
e    4
dtype: int64

In [29]:
# Drop axis item in dataframe
data = DataFrame(
    np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four']
)

In [30]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [31]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [32]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [33]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [34]:
# Indexing, select and filter

In [35]:
obj = Series(np.arange(4), index=['a', 'b', 'c', 'd'])

In [36]:
obj

a    0
b    1
c    2
d    3
dtype: int64

In [37]:
obj['b']

1

In [39]:
# Select the second element by position. Plain obj[1] on a string-labelled
# Series relies on the deprecated "integer key falls back to position" rule.
obj.iloc[1]

1

In [38]:
obj[2:4]

c    2
d    3
dtype: int64

In [40]:
obj[['b', 'c', 'd']]

b    1
c    2
d    3
dtype: int64

In [41]:
# Select the 2nd and 4th elements by position; .iloc makes the positional
# intent explicit instead of relying on the deprecated integer fallback.
obj.iloc[[1, 3]]

b    1
d    3
dtype: int64

In [42]:
obj[obj < 2]

a    0
b    1
dtype: int64

In [43]:
# The right end is inclusive
obj['b': 'c']

b    1
c    2
dtype: int64

In [44]:
obj['b':'c'] = 5

In [45]:
obj

a    0
b    5
c    5
d    3
dtype: int64

In [46]:
data = DataFrame(
    np.arange(16).reshape((4, 4)), index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four']
)

In [47]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [48]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int64

In [49]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [50]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [51]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [52]:
data['three'] > 5

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool

In [53]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


In [54]:
data[data < 5] = 0

In [55]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [57]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [59]:
# .ix has been removed from pandas. Rows are chosen by label and columns by
# position, so translate the column positions into labels and use .loc.
data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [60]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [62]:
# .ix has been removed from pandas. Filter rows with the boolean mask and
# keep the first three columns (positions 0-2) via their labels.
data.loc[data['three'] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [63]:
# Arithmetic and data alignment

In [64]:
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])

In [65]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [66]:
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])

In [67]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [68]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [69]:
df1 = DataFrame(
    np.arange(9).reshape((3, 3)), columns=list('bcd'),
    index=['Ohio', 'Texas', 'Colorado']
)

In [70]:
df1

Unnamed: 0,b,c,d
Ohio,0,1,2
Texas,3,4,5
Colorado,6,7,8


In [71]:
df2 = DataFrame(
    np.arange(12).reshape((4, 3)), columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon']
)

In [72]:
df2

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [73]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [74]:
df3 = df1 + df2

In [76]:
df3 > 0

Unnamed: 0,b,c,d,e
Colorado,False,False,False,False
Ohio,True,False,True,False
Oregon,False,False,False,False
Texas,True,False,True,False
Utah,False,False,False,False


In [77]:
# Arithmetic methods with a fill value

In [78]:
df1 = DataFrame(
    np.arange(12).reshape((3, 4)),
    columns=list('abcd')
)

In [79]:
df2 = DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))

In [80]:
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [81]:
df2

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [82]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [83]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [84]:
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [85]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,0
1,4,5,6,7,0
2,8,9,10,11,0


In [86]:
# Operations between a DataFrame and a Series

In [87]:
arr = np.arange(12).reshape((3, 4))

In [88]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [89]:
arr[0]

array([0, 1, 2, 3])

In [90]:
# This is broadcast
arr - arr[0]

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [91]:
frame = DataFrame(
    np.arange(12).reshape((4, 3)), columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon']
)

In [92]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [95]:
series = frame.iloc[0]

In [96]:
series

b    0
d    1
e    2
Name: Utah, dtype: int64

In [97]:
frame - series

Unnamed: 0,b,d,e
Utah,0,0,0
Ohio,3,3,3
Texas,6,6,6
Oregon,9,9,9


In [101]:
series2 = Series(range(3), index=list('bef'))

In [102]:
series2

b    0
e    1
f    2
dtype: int64

In [104]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [105]:
series3 = frame['d']

In [106]:
series3

Utah       1
Ohio       4
Texas      7
Oregon    10
Name: d, dtype: int64

In [107]:
frame

Unnamed: 0,b,d,e
Utah,0,1,2
Ohio,3,4,5
Texas,6,7,8
Oregon,9,10,11


In [108]:
# Note axis=0: the Series is matched on the row index and subtracted from every column (frame.shape == (4, 3))
frame.sub(series3, axis=0)

Unnamed: 0,b,d,e
Utah,-1,0,1
Ohio,-1,0,1
Texas,-1,0,1
Oregon,-1,0,1


In [109]:
# ufuncs apply and mapping

In [110]:
frame = DataFrame(
    np.random.randn(4, 3), columns=list('bde'),
    index=['Utah', 'Ohio', 'Texas', 'Oregon']
)

In [111]:
frame

Unnamed: 0,b,d,e
Utah,1.09747,-0.812701,0.748401
Ohio,-0.519476,0.468088,-0.170816
Texas,0.694947,0.883209,0.26317
Oregon,0.493497,0.385259,-0.076391


In [112]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,1.09747,0.812701,0.748401
Ohio,0.519476,0.468088,0.170816
Texas,0.694947,0.883209,0.26317
Oregon,0.493497,0.385259,0.076391


In [113]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is anything exposing ``.max()`` and ``.min()``; when passed to
    ``DataFrame.apply`` it receives one column (or one row with ``axis=1``)
    at a time as a Series.
    """
    return x.max() - x.min()

In [114]:
frame.apply(f)

b    1.616946
d    1.695910
e    0.919217
dtype: float64

In [115]:
frame.apply(f, axis=1)

Utah      1.910171
Ohio      0.987563
Texas     0.620038
Oregon    0.569888
dtype: float64

In [118]:
def f(x):
    """Return the smallest and largest value of ``x`` as a two-element Series.

    The result is labelled ``'min'`` and ``'max'``, so applying this to a
    DataFrame yields one ``min`` row and one ``max`` row.
    """
    lowest, highest = x.min(), x.max()
    return Series([lowest, highest], index=['min', 'max'])

In [119]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.519476,-0.812701,-0.170816
max,1.09747,0.883209,0.748401


In [120]:
def format(x):
    """Render the number ``x`` as a string with exactly two decimal places.

    NOTE(review): this name shadows the built-in ``format`` for the rest of
    the session; it is kept because a later cell refers to it by this name.
    """
    return '%.2f' % x

In [121]:
# Apply the formatter to every element. DataFrame.applymap is deprecated in
# recent pandas; mapping each column with Series.map gives the same
# element-wise result and works on old and new versions alike.
frame.apply(lambda column: column.map(format))

Unnamed: 0,b,d,e
Utah,1.1,-0.81,0.75
Ohio,-0.52,0.47,-0.17
Texas,0.69,0.88,0.26
Oregon,0.49,0.39,-0.08


In [122]:
frame

Unnamed: 0,b,d,e
Utah,1.09747,-0.812701,0.748401
Ohio,-0.519476,0.468088,-0.170816
Texas,0.694947,0.883209,0.26317
Oregon,0.493497,0.385259,-0.076391


In [151]:
# Ranking and sorting

In [124]:
obj = Series(range(4), index=list('dabc'))

In [125]:
obj

d    0
a    1
b    2
c    3
dtype: int64

In [126]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [128]:
frame = DataFrame(
    np.arange(8).reshape((2, 4)), index=['three', 'one'],
    columns=list('dabc')
)

In [129]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [131]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [132]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [133]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [134]:
obj = Series([4, 7, -3, 2])

In [135]:
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [141]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [142]:
obj = Series([4, np.nan, 7, np.nan, -3, 2])

In [143]:
obj

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

In [144]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [145]:
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})

In [146]:
frame

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


In [148]:
# Sort rows by the values in column 'b'. sort_index no longer accepts `by`;
# sorting by column values is the job of sort_values.
frame.sort_values(by='b')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


In [149]:
# Sort rows by column 'a', breaking ties with column 'b'. sort_index no
# longer accepts `by`; use sort_values instead.
frame.sort_values(by=['a', 'b'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [150]:
# Sort rows by the values in column 'a'. sort_index no longer accepts `by`;
# use sort_values instead.
frame.sort_values(by=['a'])

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b
0,0,4
2,0,-3
1,1,7
3,1,2


In [153]:
# Ranking
obj = Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [154]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [156]:
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [159]:
frame = DataFrame({'b':[4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c': [-2, 5, 8, -2.5]})

In [160]:
frame

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [161]:
frame.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


In [165]:
# Axis index with duplicate value

In [167]:
obj = Series(range(5), index=list('aabbc'))

In [168]:
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [169]:
obj.index.is_unique

False

In [170]:
obj['a']

a    0
a    1
dtype: int64

In [171]:
obj['c']

4

In [172]:
df = DataFrame(np.random.randn(4, 3), index=list('aabb'))

In [174]:
df

Unnamed: 0,0,1,2
a,0.723827,0.059118,1.561316
a,-1.456527,2.912487,0.089878
b,1.584347,0.211172,-0.884129
b,-0.57997,-1.259476,-0.4926


In [175]:
df.loc['b']

Unnamed: 0,0,1,2
b,1.584347,0.211172,-0.884129
b,-0.57997,-1.259476,-0.4926


In [176]:
# Summarizing and computing descriptive statistics

In [4]:
df = DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], 
    index=list('abcd'), columns=['one', 'two']
)

In [5]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [6]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [7]:
df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [9]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [10]:
df.mean(axis=1, skipna=True)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [11]:
df.idxmax()

one    b
two    d
dtype: object

In [12]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [13]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [14]:
obj = Series(['a', 'a', 'b', 'c'] * 4)

In [15]:
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [18]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [1]:
# Correlation and covariance

In [13]:
# Unique values, value counts, and membership

In [15]:
obj = Series(list('cadaabbcc'))

In [17]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [20]:
uniques = obj.unique()

In [21]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [22]:
# Count occurrences of each distinct value without sorting by frequency.
# The top-level pd.value_counts is deprecated; call the Series method.
obj.value_counts(sort=False)

a    3
c    3
b    2
d    1
dtype: int64

In [24]:
mask = obj.isin(['b', 'c'])

In [25]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [26]:
data = DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4]
})

In [27]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [28]:
# Count the values in each column; the result is indexed by the union of the
# distinct values, with NaN where a value never occurs in a column.
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
result = data.apply(pd.Series.value_counts)

In [29]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [35]:
# Frequency of each value in column 'Qu1', most frequent first. Uses the
# Series method rather than the deprecated top-level pd.value_counts.
data['Qu1'].value_counts()

4    2
3    2
1    1
dtype: int64

In [36]:
# Per-column value counts, with 0 instead of NaN for values that never occur
# in a column. pd.Series.value_counts replaces the deprecated pd.value_counts.
result = data.apply(pd.Series.value_counts).fillna(0)

In [37]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [38]:
# Handling missing data

In [39]:
 string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [40]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [41]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [43]:
string_data[0] = None

In [44]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [45]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [46]:
# Filtering out missing data

In [47]:
from numpy import nan as NA

In [48]:
data = Series([1, NA, 3.5, NA, 7])

In [49]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [50]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [51]:
data = DataFrame([[1., 6.5, 3.], [1, NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])

In [52]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [53]:
cleaned = data.dropna()

In [54]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [55]:
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [58]:
data[4] = NA

In [59]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [60]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [61]:
df = DataFrame(np.random.randn(7, 3))

In [62]:
df

Unnamed: 0,0,1,2
0,1.214014,-1.353523,0.493099
1,-0.665362,-1.270445,-0.533369
2,-0.232121,-0.004027,-0.805359
3,-1.396411,-1.268317,-0.805046
4,1.38732,0.074943,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [64]:
df.loc[:4, 1] = NA

In [65]:
df

Unnamed: 0,0,1,2
0,1.214014,,0.493099
1,-0.665362,,-0.533369
2,-0.232121,,-0.805359
3,-1.396411,,-0.805046
4,1.38732,,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [66]:
df.loc[:2, 2] = NA

In [67]:
df

Unnamed: 0,0,1,2
0,1.214014,,
1,-0.665362,,
2,-0.232121,,
3,-1.396411,,-0.805046
4,1.38732,,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [70]:
df.dropna(thresh=3)

Unnamed: 0,0,1,2
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [71]:
# Filling in missing data

In [72]:
df.fillna(0)

Unnamed: 0,0,1,2
0,1.214014,0.0,0.0
1,-0.665362,0.0,0.0
2,-0.232121,0.0,0.0
3,-1.396411,0.0,-0.805046
4,1.38732,0.0,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [74]:
df.fillna({1:0.5, 3: -1})

Unnamed: 0,0,1,2
0,1.214014,0.5,
1,-0.665362,0.5,
2,-0.232121,0.5,
3,-1.396411,0.5,-0.805046
4,1.38732,0.5,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [75]:
df

Unnamed: 0,0,1,2
0,1.214014,,
1,-0.665362,,
2,-0.232121,,
3,-1.396411,,-0.805046
4,1.38732,,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [76]:
# Replace every NaN with 0 and rebind df to the filled frame. The in-place
# form returns None in current pandas, and assigning to `_` would also
# clobber IPython's "last output" variable.
df = df.fillna(0)

In [77]:
df

Unnamed: 0,0,1,2
0,1.214014,0.0,0.0
1,-0.665362,0.0,0.0
2,-0.232121,0.0,0.0
3,-1.396411,0.0,-0.805046
4,1.38732,0.0,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [78]:
# Display the filled frame explicitly instead of relying on `_`, which the
# earlier assignment may have left as None.
df

Unnamed: 0,0,1,2
0,1.214014,0.0,0.0
1,-0.665362,0.0,0.0
2,-0.232121,0.0,0.0
3,-1.396411,0.0,-0.805046
4,1.38732,0.0,-1.78793
5,-0.043041,1.053955,-0.104773
6,1.007633,0.708681,-0.345984


In [79]:
df = DataFrame(np.random.randn(6, 3))

In [80]:
df

Unnamed: 0,0,1,2
0,-1.498116,1.913388,-0.009023
1,0.097418,1.37348,0.504438
2,0.078416,1.295066,0.681302
3,-0.612028,0.16754,-1.252097
4,-0.342322,-0.45673,0.101355
5,-1.578334,0.067372,-1.540914


In [82]:
df.loc[2:, 1] = NA

In [83]:
df

Unnamed: 0,0,1,2
0,-1.498116,1.913388,-0.009023
1,0.097418,1.37348,0.504438
2,0.078416,,0.681302
3,-0.612028,,-1.252097
4,-0.342322,,0.101355
5,-1.578334,,-1.540914


In [84]:
df.loc[4:, 2] = NA

In [85]:
df

Unnamed: 0,0,1,2
0,-1.498116,1.913388,-0.009023
1,0.097418,1.37348,0.504438
2,0.078416,,0.681302
3,-0.612028,,-1.252097
4,-0.342322,,
5,-1.578334,,


In [86]:
# Forward fill: each NaN takes the last valid value above it in its column.
# fillna(method=...) is deprecated; ffill() is the direct equivalent.
df.ffill()

Unnamed: 0,0,1,2
0,-1.498116,1.913388,-0.009023
1,0.097418,1.37348,0.504438
2,0.078416,1.37348,0.681302
3,-0.612028,1.37348,-1.252097
4,-0.342322,1.37348,-1.252097
5,-1.578334,1.37348,-1.252097


In [87]:
# Forward fill, but propagate a value across at most 2 consecutive NaNs.
# fillna(method=...) is deprecated; ffill(limit=...) is the equivalent.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-1.498116,1.913388,-0.009023
1,0.097418,1.37348,0.504438
2,0.078416,1.37348,0.681302
3,-0.612028,1.37348,-1.252097
4,-0.342322,,-1.252097
5,-1.578334,,-1.252097


In [88]:
data = Series([1, NA, 3.5, NA, 7])

In [89]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [3]:
# Hierarchical indexing

In [7]:
data = Series(
    np.random.randn(10),
    index=[list('aaabbbccdd'), [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]
)

In [8]:
data

a  1    0.837142
   2    0.014810
   3    0.128634
b  1   -1.846075
   2   -1.051755
   3   -0.118460
c  1    0.568077
   2   -0.056253
d  2    0.842097
   3   -1.792067
dtype: float64

In [9]:
data.index

MultiIndex(levels=[[u'a', u'b', u'c', u'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [10]:
data['b']

1   -1.846075
2   -1.051755
3   -0.118460
dtype: float64

In [11]:
data['b':'c']

b  1   -1.846075
   2   -1.051755
   3   -0.118460
c  1    0.568077
   2   -0.056253
dtype: float64

In [12]:
data[:, 2]

a    0.014810
b   -1.051755
c   -0.056253
d    0.842097
dtype: float64

In [13]:
# Unstack series to dataframe
data.unstack()

Unnamed: 0,1,2,3
a,0.837142,0.01481,0.128634
b,-1.846075,-1.051755,-0.11846
c,0.568077,-0.056253,
d,,0.842097,-1.792067


In [14]:
# Stack back
data.unstack().stack()

a  1    0.837142
   2    0.014810
   3    0.128634
b  1   -1.846075
   2   -1.051755
   3   -0.118460
c  1    0.568077
   2   -0.056253
d  2    0.842097
   3   -1.792067
dtype: float64

In [16]:
frame = DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
)

In [17]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [18]:
frame.index.names = ['key1', 'key2']

In [19]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [20]:
frame.columns.names = ['state', 'color']

In [21]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [22]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [23]:
# Reordering and sorting levels

In [24]:
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [28]:
# Sort the columns (axis=1) by their hierarchical labels. The axis must be
# passed by keyword; a bare positional 1 is rejected by current pandas.
frame.sort_index(axis=1)

Unnamed: 0_level_0,state,Colorado,Ohio,Ohio
Unnamed: 0_level_1,color,Green,Green,Red
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,2,0,1
a,2,5,3,4
b,1,8,6,7
b,2,11,9,10


In [31]:
# Swap the two row-index levels, then sort the rows (axis=0) so the new
# outer level is in order. The axis must be passed by keyword.
frame.swaplevel(0, 1).sort_index(axis=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [32]:
# Summary statistics by level

In [33]:
# Sum the rows that share the same 'key2' index value. The `level` argument
# of sum() was removed in pandas 2.0; group on the index level instead.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [34]:
# Sum the columns that share the same 'color' label. sum(level=..., axis=1)
# was removed in pandas 2.0 and column-wise groupby is deprecated, so
# transpose, group the (now row) level, sum, and transpose back.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [35]:
# Using a DataFrame's columns

In [36]:
frame = DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'd': [0, 1, 2, 0, 1, 2, 3]
})

In [37]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [38]:
frame2 = frame.set_index(['c', 'd'])

In [39]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [40]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [41]:
# Move the index levels back into ordinary columns
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [42]:
# Other pandas topics

In [48]:
ser = Series(np.arange(3))

In [49]:
ser

0    0
1    1
2    2
dtype: int64

In [50]:
ser[1]

1

In [51]:
ser2 = Series(np.arange(3), index=list('abc'))

In [52]:
ser2

a    0
b    1
c    2
dtype: int64

In [54]:
# Last element by position. ser2 has string labels, so a bare ser2[-1]
# depends on the deprecated integer-to-position fallback; .iloc is explicit.
ser2.iloc[-1]

2

In [55]:
ser.loc[:1]

0    0
1    1
dtype: int64

In [56]:
ser3 = Series(range(3), index=[-5, 1, 3])

In [57]:
ser3

-5    0
 1    1
 3    2
dtype: int64

In [59]:
frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])

In [60]:
frame

Unnamed: 0,0,1
2,0,1
0,2,3
1,4,5


In [63]:
# DataFrame.irow is no longer supported; select a row by position with .iloc instead:
# frame.iloc[0]

In [67]:
# Panel data