# Pandas数据操作

In [1]:
import pandas as pd

##  索引

### Series索引

In [2]:
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
print(ser_obj.head())

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [3]:
# 行索引
print(ser_obj['a'])
print(ser_obj[0])

0
0


In [4]:
# 切片索引
print(ser_obj[1:3])
print(ser_obj['b':'d'])

b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64


In [5]:
# 不连续索引
print(ser_obj[[0, 2, 4]])
print(ser_obj[['a', 'e']])

a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64


In [6]:
# 布尔索引
ser_bool = ser_obj > 2
print(ser_bool)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64


### DataFrame索引

In [7]:
import numpy as np

df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0 -0.150325  0.571047 -0.405202  1.717307
1  1.112662 -1.146074 -0.899065 -0.841979
2 -0.716260  0.329100  0.596207  2.142039
3  0.646757 -0.931443  0.373743  0.545380
4 -0.827586  0.474916  2.017043 -0.957625


In [16]:
# 列索引
print('列索引')
print(df_obj['a']) # 返回Series类型
# print(type(df_obj[[0]])) # 返回DataFrame类型

# 不连续索引
print('不连续索引')
print(df_obj[['a','c']])
# print(df_obj[[1, 3]])

列索引
0   -0.150325
1    1.112662
2   -0.716260
3    0.646757
4   -0.827586
Name: a, dtype: float64
不连续索引
          a         c
0 -0.150325 -0.405202
1  1.112662 -0.899065
2 -0.716260  0.596207
3  0.646757  0.373743
4 -0.827586  2.017043


### 三种索引方式

In [17]:
# 标签索引 loc
# Series
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])

b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
0   -0.150325
1    1.112662
2   -0.716260
3    0.646757
4   -0.827586
Name: a, dtype: float64
0   -0.150325
1    1.112662
2   -0.716260
Name: a, dtype: float64


In [18]:
# 整型位置索引 iloc
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[0:2, 0]) # 注意和df_obj.loc[0:2, 'a']的区别

b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
0   -0.150325
1    1.112662
Name: a, dtype: float64


In [19]:
# 混合索引 ix
print(ser_obj.ix[1:3])
print(ser_obj.ix['b':'c'])

# DataFrame
print(df_obj.ix[0:2, 0]) # 先按标签索引尝试操作，然后再按位置索引尝试操作

b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
0   -0.150325
1    1.112662
2   -0.716260
Name: a, dtype: float64


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


## 运算与对齐

In [21]:
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))

print('s1: ' )
print(s1)

print('') 

print('s2: ')
print(s2)

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int64


In [22]:
# Series 对齐运算
s1 + s2

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [23]:
import numpy as np

df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])

print('df1: ')
print(df1)

print('') 
print('df2: ')
print(df2)

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [24]:
# DataFrame对齐操作
df1 + df2

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


In [25]:
# 填充未对齐的数据进行运算
print(s1)
print(s2)

s1.add(s2, fill_value = -1)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
0    20
1    21
2    22
3    23
4    24
dtype: int64


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [26]:
df1.sub(df2, fill_value = 2.)

Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


In [27]:
# 填充NaN
s3 = s1 + s2
print(s3)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [28]:
s3_filled = s3.fillna(-1)
print(s3_filled)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64


In [29]:
df3 = df1 + df2
print(df3)

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [30]:
df3.fillna(100, inplace = True)
print(df3)

       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


## 函数应用

In [32]:
# Numpy ufunc 函数
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df))

          0         1         2         3
0 -0.316140 -3.064905 -1.444454 -1.821545
1  0.807875 -0.480535 -2.578694 -2.261349
2 -1.120805 -2.416925 -0.586853 -0.569865
3 -1.857471 -1.238321 -1.179279  1.059874
4 -1.846543 -1.152695 -2.530968 -0.737452
          0         1         2         3
0  0.316140  3.064905  1.444454  1.821545
1  0.807875  0.480535  2.578694  2.261349
2  1.120805  2.416925  0.586853  0.569865
3  1.857471  1.238321  1.179279  1.059874
4  1.846543  1.152695  2.530968  0.737452


In [33]:
# 使用apply应用行或列数据
#f = lambda x : x.max()
print(df.apply(lambda x : x.max()))

0    0.807875
1   -0.480535
2   -0.586853
3    1.059874
dtype: float64


In [34]:
# 指定轴方向
print(df.apply(lambda x : x.max(), axis=1))

0   -0.316140
1    0.807875
2   -0.569865
3    1.059874
4   -0.737452
dtype: float64


In [35]:
# 使用applymap应用到每个数据
f2 = lambda x : '%.2f' % x
print(df.applymap(f2))

       0      1      2      3
0  -0.32  -3.06  -1.44  -1.82
1   0.81  -0.48  -2.58  -2.26
2  -1.12  -2.42  -0.59  -0.57
3  -1.86  -1.24  -1.18   1.06
4  -1.85  -1.15  -2.53  -0.74


## 排序

In [36]:
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5))
print(s4)

2    10
4    11
0    12
1    13
4    14
dtype: int64


In [37]:
# 索引排序
s4.sort_index()

0    12
1    13
2    10
4    11
4    14
dtype: int64

In [38]:
df4 = pd.DataFrame(np.random.randn(3, 4), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(4, size=4))
print(df4)

          2         3         2         3
2 -0.914610 -0.592229 -0.953043 -1.938191
1 -0.211296 -0.360658  0.664035 -0.689057
0 -1.506987  0.295612 -0.214609  2.253874


In [39]:
#df4.sort_index(ascending=False)
df4.sort_index(axis=1)

Unnamed: 0,2,2.1,3,3.1
2,-0.91461,-0.953043,-0.592229,-1.938191
1,-0.211296,0.664035,-0.360658,-0.689057
0,-1.506987,-0.214609,0.295612,2.253874


In [40]:
# 按值排序
df4.sort_values(by=1)

KeyError: 1

## 处理缺失数据

In [31]:
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                       [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data.head()

Unnamed: 0,0,1,2
0,1.619463,0.548047,-1.027003
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [32]:
# isnull
df_data.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [33]:
# dropna
df_data.dropna()
#df_data.dropna(axis=1)

Unnamed: 0,0,1,2
0,1.619463,0.548047,-1.027003


In [34]:
# fillna
df_data.fillna(-100.)

Unnamed: 0,0,1,2
0,1.619463,0.548047,-1.027003
1,1.0,-100.0,-100.0
2,4.0,-100.0,-100.0
3,1.0,-100.0,2.0
