In [1]:
import numpy as np
import pandas as pd

如何生成Series对象

In [2]:
ser = pd.Series([1,2,3,4])
ser

0    1
1    2
2    3
3    4
dtype: int64

In [3]:
ser = pd.Series([1,2,3,4],index=['one','two','three','four'])
ser

one      1
two      2
three    3
four     4
dtype: int64

In [4]:
dic = {"one":1,"two":2,"three":3,"four":4}
ser = pd.Series(dic)
ser

one      1
two      2
three    3
four     4
dtype: int64

Series的values和index属性

In [5]:
dic = {"one":1,"two":2,"three":3,"four":4}
ser = pd.Series(dic)
ser

one      1
two      2
three    3
four     4
dtype: int64

In [6]:
ser.values

array([1, 2, 3, 4], dtype=int64)

In [7]:
ser.index

Index(['one', 'two', 'three', 'four'], dtype='object')

Series对象的shape属性

In [8]:
ser.shape

(4,)

Series的索引方式

In [9]:
ser['one']

1

In [10]:
# NOTE: `ser` has string labels, so the integer key here falls back to a
# positional lookup. pandas 2.1+ deprecates this fallback (FutureWarning);
# prefer the explicit ser.iloc[0] in new code.
ser[0]

1

In [11]:
# 按index名称进行索引
ser.loc['one']

1

In [12]:
# 按index的位置进行索引
ser.iloc[0]

1

Series使用切片

In [13]:
# 包含右边的值
ser['one':'two']

one    1
two    2
dtype: int64

In [14]:
# 不包含右边的值
ser[0:1]

one    1
dtype: int64

In [15]:
# 包含右边的值
ser.loc['one':'two']

one    1
two    2
dtype: int64

In [16]:
# 不包含右边的值
ser.iloc[0:1]

one    1
dtype: int64

Series列表索引，类似于arr的神奇索引

In [17]:
ser[['two','one','four','three']]

two      2
one      1
four     4
three    3
dtype: int64

In [18]:
ser[['two','one','four']]

two     2
one     1
four    4
dtype: int64

Series使用布尔索引

In [19]:
ser[[True,False,False,True]]

one     1
four    4
dtype: int64

Series进行数学运算

In [20]:
ser

one      1
two      2
three    3
four     4
dtype: int64

In [21]:
ser+1

one      2
two      3
three    4
four     5
dtype: int64

In [22]:
np.exp(ser)

one       2.718282
two       7.389056
three    20.085537
four     54.598150
dtype: float64

判断某个值是否在Series的index里

In [23]:
'two' in ser

True

In [24]:
'two' in ser.index

True

判断某个值是否在Series的values里

In [25]:
3 in ser.values

True

按照指定的index顺序生成Series

In [26]:
dic = {"one":1,"two":2,"three":3,"four":4}
ser = pd.Series(dic,index=['two','one','four','three'])
ser

two      2
one      1
four     4
three    3
dtype: int64

In [27]:
# 如果指定的index值在原来的dic里不存在，那么值为NaN
ser = pd.Series(dic,index=['two','one','four','five'])
ser

two     2.0
one     1.0
four    4.0
five    NaN
dtype: float64

判断Series值是否为空

In [28]:
pd.isnull(ser)

two     False
one     False
four    False
five     True
dtype: bool

In [29]:
ser.isnull()

two     False
one     False
four    False
five     True
dtype: bool

判断Series值是否非空

In [30]:
pd.notnull(ser)

two      True
one      True
four     True
five    False
dtype: bool

In [31]:
ser.notnull()

two      True
one      True
four     True
five    False
dtype: bool

Series在进行数学运算时的自动对齐特性  
相同的index值才进行运算，不同的index运算结果为空

In [32]:
ser_2 = pd.Series({'one':3,"two":7,'four':5,'three':9})
ser_2

one      3
two      7
four     5
three    9
dtype: int64

In [33]:
ser

two     2.0
one     1.0
four    4.0
five    NaN
dtype: float64

In [34]:
ser + ser_2

five     NaN
four     9.0
one      4.0
three    NaN
two      9.0
dtype: float64

Series和Series.index的name属性

In [35]:
ser.name = 'test_ser'
ser

two     2.0
one     1.0
four    4.0
five    NaN
Name: test_ser, dtype: float64

In [36]:
ser.index.name = 'test_index'
ser

test_index
two     2.0
one     1.0
four    4.0
five    NaN
Name: test_ser, dtype: float64

修改Series的index值

In [37]:
ser.index

Index(['two', 'one', 'four', 'five'], dtype='object', name='test_index')

In [38]:
ser.index[0]

'two'

直接修改index某个值会报错

In [39]:
# Index objects are immutable: assigning to a single position raises TypeError.
# Catch and display the error so the cell actually demonstrates the failure
# while still letting "Restart & Run All" finish.
try:
    ser.index[0] = 'g'
except TypeError as err:
    print('TypeError:', err)

In [40]:
# 可以对index统一赋值
ser.index = ['five','six','seven','eight']
ser

five     2.0
six      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

如果index量很大的情况下，可以使用rename方法来赋值

In [41]:
ser

five     2.0
six      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [42]:
ser.rename({'six':'ten'})

five     2.0
ten      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [43]:
ser

five     2.0
six      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [44]:
ser.rename({'six':'ten'},inplace=True)
ser

five     2.0
ten      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

如何生成DataFrame对象

In [45]:
frame = pd.DataFrame([1,2,3,4],columns=['data'])
frame

Unnamed: 0,data
0,1
1,2
2,3
3,4


In [46]:
frame = pd.DataFrame([1,2,3,4],index=['one','two','three','four'],columns=['data'])
frame

Unnamed: 0,data
one,1
two,2
three,3
four,4


In [47]:
arr = np.random.randint(1,20,(4,2))
frame = pd.DataFrame(arr,columns=['age','score'])
frame

Unnamed: 0,age,score
0,7,19
1,13,1
2,11,6
3,4,5


In [48]:
dic = {'age':[1,8,10,11],'score':[14,17,16,9]}
pd.DataFrame(dic)

Unnamed: 0,age,score
0,1,14
1,8,17
2,10,16
3,11,9


DataFrame对象的values，index和columns属性

In [49]:
dic = {'age':[1,8,10,11],'score':[14,17,16,9]}
frame = pd.DataFrame(dic)
frame

Unnamed: 0,age,score
0,1,14
1,8,17
2,10,16
3,11,9


In [50]:
frame.values

array([[ 1, 14],
       [ 8, 17],
       [10, 16],
       [11,  9]], dtype=int64)

In [51]:
frame.index

RangeIndex(start=0, stop=4, step=1)

In [52]:
frame.columns

Index(['age', 'score'], dtype='object')

DataFrame的shape属性

In [53]:
frame.shape

(4, 2)

检查是否存在空值

In [54]:
pd.isnull(frame)

Unnamed: 0,age,score
0,False,False
1,False,False
2,False,False
3,False,False


检查值是否非空

In [55]:
pd.notnull(frame)

Unnamed: 0,age,score
0,True,True
1,True,True
2,True,True
3,True,True


In [56]:
frame.notnull()

Unnamed: 0,age,score
0,True,True
1,True,True
2,True,True
3,True,True


使用等长度列表来生成DataFrame

In [57]:
list_1 = [[1,2,3,4],[5,6,7,8]]
frame = pd.DataFrame(list_1)
frame

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8


In [58]:
dic_1 = {'a':[1,2,3,4],'b':[5,6,7,8]}
frame_1 = pd.DataFrame(dic_1)
frame_1

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [59]:
dic_2 = {'a':np.random.randn(4),'b':np.random.randn(4)}
frame_2 = pd.DataFrame(dic_2)
frame_2

Unnamed: 0,a,b
0,-0.51156,0.331531
1,2.008483,0.420467
2,0.378062,0.920995
3,-2.006633,-0.799111


获取前5行和后5行

In [60]:
frame = pd.DataFrame(np.random.rand(7,8))
frame

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.780695,0.16043,0.128794,0.207109,0.324967,0.96481,0.214098,0.099305
1,0.458221,0.816917,0.786789,0.612123,0.599637,0.668304,0.456155,0.873705
2,0.734828,0.246814,0.199406,0.428855,0.804236,0.93708,0.56146,0.868191
3,0.250815,0.741093,0.536278,0.151153,0.185502,0.290004,0.873802,0.730799
4,0.379186,0.351904,0.987891,0.538711,0.93471,0.72286,0.107829,0.201007
5,0.714547,0.264654,0.452628,0.530251,0.955197,0.325079,0.392739,0.469366
6,0.230171,0.397826,0.984536,0.052446,0.281449,0.451333,0.630868,0.11187


In [61]:
frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.780695,0.16043,0.128794,0.207109,0.324967,0.96481,0.214098,0.099305
1,0.458221,0.816917,0.786789,0.612123,0.599637,0.668304,0.456155,0.873705
2,0.734828,0.246814,0.199406,0.428855,0.804236,0.93708,0.56146,0.868191
3,0.250815,0.741093,0.536278,0.151153,0.185502,0.290004,0.873802,0.730799
4,0.379186,0.351904,0.987891,0.538711,0.93471,0.72286,0.107829,0.201007


In [62]:
frame.tail()

Unnamed: 0,0,1,2,3,4,5,6,7
2,0.734828,0.246814,0.199406,0.428855,0.804236,0.93708,0.56146,0.868191
3,0.250815,0.741093,0.536278,0.151153,0.185502,0.290004,0.873802,0.730799
4,0.379186,0.351904,0.987891,0.538711,0.93471,0.72286,0.107829,0.201007
5,0.714547,0.264654,0.452628,0.530251,0.955197,0.325079,0.392739,0.469366
6,0.230171,0.397826,0.984536,0.052446,0.281449,0.451333,0.630868,0.11187


当使用字典作为数值参数来生成DataFrame时，通过columns参数指定列的顺序。

In [63]:
dic_1

{'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}

In [64]:
pd.DataFrame(dic_1)

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [65]:
pd.DataFrame(dic_1,columns=['b','a'])

Unnamed: 0,b,a
0,5,1
1,6,2
2,7,3
3,8,4


In [66]:
pd.DataFrame(dic_1,columns=['b'])

Unnamed: 0,b
0,5
1,6
2,7
3,8


In [67]:
# 对于列，传递不存在的值，会生成空值
pd.DataFrame(dic_1,columns=['b','d'])

Unnamed: 0,b,d
0,5,
1,6,
2,7,
3,8,


指定index值时的规则

In [68]:
frame = pd.DataFrame(dic_1,columns=['b','d'],index=['one','two','three','four'])
frame

Unnamed: 0,b,d
one,5,
two,6,
three,7,
four,8,


In [69]:
# 对于index，传递多的或少的值，会报错
# pd.DataFrame(dic_1,columns=['b','d'],index=['one','two','three'])

DataFrame对列的索引，生成Series

In [70]:
frame = pd.DataFrame(dic_1,columns=['a','b'],index=['one','two','three','four'])
frame                                                    

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [71]:
frame['a']

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [72]:
frame.a

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [73]:
frame.loc[:,'a']

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [74]:
frame.iloc[:,0]

one      1
two      2
three    3
four     4
Name: a, dtype: int64

DataFrame对列的索引，生成DataFrame

In [75]:
frame[['a']]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [76]:
# 不能使用切片的写法
# frame[['a':'b']]

In [77]:
frame[['a','b']]

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [78]:
frame.loc[:,['a']]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [79]:
frame.loc[:,['a','b']]

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [80]:
frame.loc[:,'a':'b']

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [81]:
frame.iloc[:,[0]]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [82]:
frame.iloc[:,0:1]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [83]:
frame.iloc[:,[0,1]]

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


DataFrame对行的索引

In [84]:
# 生成Series
frame.loc['one']

a    1
b    5
Name: one, dtype: int64

In [85]:
# 生成DataFrame
frame.loc['one':'two']

Unnamed: 0,a,b
one,1,5
two,2,6


In [86]:
# 生成DataFrame
frame.loc[['one']]

Unnamed: 0,a,b
one,1,5


In [87]:
frame.loc[['one','three']]

Unnamed: 0,a,b
one,1,5
three,3,7


In [88]:
frame.iloc[0]

a    1
b    5
Name: one, dtype: int64

In [89]:
frame.iloc[0:2]

Unnamed: 0,a,b
one,1,5
two,2,6


In [90]:
# 使用iloc获取指定的行和列的值
frame.iloc[0,1]

5

In [91]:
# 使用iloc获取指定的行
frame.iloc[[0,2],]

Unnamed: 0,a,b
one,1,5
three,3,7


In [92]:
frame.iloc[[0,2]]

Unnamed: 0,a,b
one,1,5
three,3,7


In [93]:
# 使用索引序号，直接获取行，必须要使用切片
frame[0:1]

Unnamed: 0,a,b
one,1,5


In [94]:
# 使用索引标签，直接获取行，必须要使用切片
frame['one':'two']

Unnamed: 0,a,b
one,1,5
two,2,6


In [95]:
# 使用布尔索引获取行
frame[frame['a'] > 2]

Unnamed: 0,a,b
three,3,7
four,4,8


如何修改frame列的值

In [96]:
frame['a']

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [97]:
# 直接给列赋新值
frame['a'] = [10,11,12,13]
frame['a']

one      10
two      11
three    12
four     13
Name: a, dtype: int64

In [98]:
# Use applymap to process the frame element by element (here: format each
# value as a string with two decimals). Returns a new frame; `frame` is unchanged.
# NOTE: DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
frame.applymap(lambda x:'{:.2f}'.format(x))

Unnamed: 0,a,b
one,10.0,5.0
two,11.0,6.0
three,12.0,7.0
four,13.0,8.0


In [99]:
frame

Unnamed: 0,a,b
one,10,5
two,11,6
three,12,7
four,13,8


In [100]:
# Use map to process a single column (a Series) of the frame.
# DataFrame itself has no map method before pandas 2.1 (it was added in 2.1
# as the replacement for applymap).
frame['a'].map(lambda x:'{:.2f}'.format(x))

one      10.00
two      11.00
three    12.00
four     13.00
Name: a, dtype: object

对索引进行排序，使用sort_index()方法

In [101]:
frame = pd.DataFrame({'a':[10,11,12,13],'b':[5,6,7,8]},index=['one','two','three','four'])
frame

Unnamed: 0,a,b
one,10,5
two,11,6
three,12,7
four,13,8


In [102]:
frame.sort_index()

Unnamed: 0,a,b
four,13,8
one,10,5
three,12,7
two,11,6


In [103]:
frame

Unnamed: 0,a,b
one,10,5
two,11,6
three,12,7
four,13,8


In [104]:
frame.sort_index(inplace=True)
frame

Unnamed: 0,a,b
four,13,8
one,10,5
three,12,7
two,11,6


In [105]:
frame = pd.DataFrame({'b':[10,11,12,13],'a':[5,6,7,8]},index=['one','two','three','four'])
frame

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


In [106]:
# 对columns进行排序
frame.sort_index(axis=1)

Unnamed: 0,a,b
one,5,10
two,6,11
three,7,12
four,8,13


In [107]:
frame

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


In [108]:
# 降序排列
frame.sort_index(ascending=False)

Unnamed: 0,b,a
two,11,6
three,12,7
one,10,5
four,13,8


对Series的值进行排序

In [109]:
ser

five     2.0
ten      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [110]:
ser.sort_values()

ten      1.0
five     2.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [111]:
ser

five     2.0
ten      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [112]:
ser.sort_values(inplace=True)

In [113]:
ser

ten      1.0
five     2.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

对DataFrame的某一列的值进行排序

In [114]:
frame.sort_values('a')

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


In [115]:
frame

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


In [116]:
frame.sort_values(by=['a'])

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


In [117]:
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


对Series或DataFrame进行排名

In [118]:
frame['a']

one      5
two      6
three    7
four     8
Name: a, dtype: int64

In [119]:
frame['a'].rank()

one      1.0
two      2.0
three    3.0
four     4.0
Name: a, dtype: float64

In [120]:
frame['a'].rank(ascending=False)

one      4.0
two      3.0
three    2.0
four     1.0
Name: a, dtype: float64

In [121]:
# Fixed sample with deliberate ties (2 appears twice, 4 three times) so the
# rank() cells that follow reproduce on every run. The original drew unseeded
# random integers, which change per run and may contain no ties at all.
# dtype is pinned to int32 to match the recorded output.
ser = pd.Series([2, 4, 3, 4, 2, 4], index=list('abcdef'), dtype='int32')
ser

a    2
b    4
c    3
d    4
e    2
f    4
dtype: int32

In [122]:
# 对于有重复值的情况，默认取重复值排名的平均值作为最终排名
ser.rank()

a    1.5
b    5.0
c    3.0
d    5.0
e    1.5
f    5.0
dtype: float64

In [123]:
# 使用重复值出现的观测顺序来打破相同排名
ser.rank(method='first')

a    1.0
b    4.0
c    3.0
d    5.0
e    2.0
f    6.0
dtype: float64

In [124]:
# 使用重复值的最大排名作为最终的排名
ser.rank(method='max')

a    2.0
b    6.0
c    3.0
d    6.0
e    2.0
f    6.0
dtype: float64

In [125]:
# 使用重复值的最小排名作为最终排名
ser.rank(method='min')

a    1.0
b    4.0
c    3.0
d    4.0
e    1.0
f    4.0
dtype: float64

判断索引是否有重复值

In [126]:
ser.index.is_unique

True

In [127]:
# Fixed values instead of unseeded random draws so the displayed output is
# reproducible; the point of this cell is the duplicated label 'c' in the index.
# dtype is pinned to int32 to match the recorded output.
ser = pd.Series([1, 4, 3, 4, 3, 3], index=list('abcdcf'), dtype='int32')
ser

a    1
b    4
c    3
d    4
c    3
f    3
dtype: int32

In [128]:
ser.index.is_unique

False

In [129]:
frame

Unnamed: 0,b,a
one,10,5
two,11,6
three,12,7
four,13,8


如果有重复的索引，那么选中该重复索引的时候，会把重复索引都选上

In [130]:
frame = pd.DataFrame({'a':[5,6,7,8],'b':[10,11,12,13]},index=['one','two','three','three'])
frame

Unnamed: 0,a,b
one,5,10
two,6,11
three,7,12
three,8,13


In [131]:
frame.loc['one']

a     5
b    10
Name: one, dtype: int64

In [132]:
frame.loc['three']

Unnamed: 0,a,b
three,7,12
three,8,13


常用的描述性统计方法，作用于frame的每一列。

In [133]:
frame = pd.DataFrame({'a':[5,6,7,8],'b':[10,11,12,13]},index=['one','two','three','four'])
frame

Unnamed: 0,a,b
one,5,10
two,6,11
three,7,12
four,8,13


In [134]:
frame.sum()

a    26
b    46
dtype: int64

In [135]:
frame.mean()

a     6.5
b    11.5
dtype: float64

In [136]:
frame.idxmax()

a    four
b    four
dtype: object

In [137]:
frame.cumsum()

Unnamed: 0,a,b
one,5,10
two,11,21
three,18,33
four,26,46


In [138]:
frame.describe()

Unnamed: 0,a,b
count,4.0,4.0
mean,6.5,11.5
std,1.290994,1.290994
min,5.0,10.0
25%,5.75,10.75
50%,6.5,11.5
75%,7.25,12.25
max,8.0,13.0


获取Series的唯一值,frame没有unique()方法

In [139]:
frame['a'].unique()

array([5, 6, 7, 8], dtype=int64)

统计某一列中每个值出现的次数

In [140]:
# frame没有unique()方法
# frame.unique()

In [141]:
frame['a'].value_counts()

7    1
6    1
5    1
8    1
Name: a, dtype: int64

检查Series和frame的值是否在列表中

In [142]:
list1 = [5,8,10,12]
print('list1:{}'.format(list1))
print("frame['a']:{}".format(frame['a']))
frame['a'].isin(list1)

list1:[5, 8, 10, 12]
frame['a']:one      5
two      6
three    7
four     8
Name: a, dtype: int64


one       True
two      False
three    False
four      True
Name: a, dtype: bool

In [143]:
frame.isin(list1)

Unnamed: 0,a,b
one,True,True
two,False,False
three,False,True
four,True,False


获取一个series的值在另一个ser中的位置

In [145]:
ser1 = pd.Series([1,2,3,4],index=['one','two','three','four'])
ser1

one      1
two      2
three    3
four     4
dtype: int64

In [146]:
ser2 = pd.Series([2,4,5,6],index=['five','six','seven','eight'])
ser2

five     2
six      4
seven    5
eight    6
dtype: int64

In [147]:
# 返回一个索引，值为ser的值
pd.Index(ser1)

Int64Index([1, 2, 3, 4], dtype='int64')

In [148]:
pd.Index(ser2)

Int64Index([2, 4, 5, 6], dtype='int64')

In [149]:
# 返回ser2的值在ser1中索引的位置，-1表示不存在
pd.Index(ser1).get_indexer(ser2)

array([ 1,  3, -1, -1], dtype=int32)

In [150]:
ser1.index

Index(['one', 'two', 'three', 'four'], dtype='object')

计算frame的直方图

In [151]:
# Per-column value counts (a histogram of the frame); values absent from a
# column show up as NaN. pd.Series.value_counts is used because the top-level
# pd.value_counts function is deprecated since pandas 2.1.
frame.apply(pd.Series.value_counts)

Unnamed: 0,a,b
5,1.0,
6,1.0,
7,1.0,
8,1.0,
10,,1.0
11,,1.0
12,,1.0
13,,1.0


In [152]:
# Per-column value counts with missing combinations filled with 0 instead of NaN.
# pd.Series.value_counts replaces the top-level pd.value_counts, which is
# deprecated since pandas 2.1.
frame.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,a,b
5,1.0,0.0
6,1.0,0.0
7,1.0,0.0
8,1.0,0.0
10,0.0,1.0
11,0.0,1.0
12,0.0,1.0
13,0.0,1.0


In [153]:
# Same per-column counts again, without fillna: the NaNs are back because
# fillna(0) returned a new frame rather than modifying anything in place.
# pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
frame.apply(pd.Series.value_counts)

Unnamed: 0,a,b
5,1.0,
6,1.0,
7,1.0,
8,1.0,
10,,1.0
11,,1.0
12,,1.0
13,,1.0


识别重复值

In [154]:
frame.duplicated()

one      False
two      False
three    False
four     False
dtype: bool

In [155]:
frame['a'].duplicated()

one      False
two      False
three    False
four     False
Name: a, dtype: bool

In [156]:
frame = pd.DataFrame({'a':[5,7,7,8],'b':[10,12,12,13]},index=['one','two','three','four'])
frame

Unnamed: 0,a,b
one,5,10
two,7,12
three,7,12
four,8,13


In [157]:
frame.duplicated()

one      False
two      False
three     True
four     False
dtype: bool

In [158]:
frame.drop_duplicates()

Unnamed: 0,a,b
one,5,10
two,7,12
four,8,13


In [159]:
frame

Unnamed: 0,a,b
one,5,10
two,7,12
three,7,12
four,8,13


In [160]:
#  根据某一列的值来删除重复值
frame.drop_duplicates('a')

Unnamed: 0,a,b
one,5,10
two,7,12
four,8,13


In [161]:
frame

Unnamed: 0,a,b
one,5,10
two,7,12
three,7,12
four,8,13
