In [32]:
import numpy as np
import pandas as pd

如何生成Series对象

In [33]:
# Build a Series from a plain list; no index is given, so pandas supplies 0..3.
ser = pd.Series(data=[1, 2, 3, 4])
ser

0    1
1    2
2    3
3    4
dtype: int64

In [34]:
# Same values, but with explicit string labels instead of the default integer index.
ser = pd.Series(
    data=[1, 2, 3, 4],
    index=['one', 'two', 'three', 'four'],
)
ser

one      1
two      2
three    3
four     4
dtype: int64

In [35]:
# A dict also works as input: keys become the index labels, values the data.
dic = dict(one=1, two=2, three=3, four=4)
ser = pd.Series(data=dic)
ser

one      1
two      2
three    3
four     4
dtype: int64

Series的values和index属性

In [36]:
# Recreate the labelled Series from a dict.
dic = {
    "one": 1,
    "two": 2,
    "three": 3,
    "four": 4,
}
ser = pd.Series(data=dic)
ser

one      1
two      2
three    3
four     4
dtype: int64

In [37]:
# .values exposes the underlying data as a NumPy array (the labels are not included).
ser.values

array([1, 2, 3, 4], dtype=int64)

In [38]:
# .index holds the labels as a pandas Index object.
ser.index

Index(['one', 'two', 'three', 'four'], dtype='object')

Series对象的shape属性

In [39]:
# .shape is a one-element tuple holding the number of entries in the Series.
ser.shape

(4,)

Series的索引方式

In [40]:
# Bracket lookup with a label returns the scalar stored under that label.
ser['one']

1

In [41]:
# Positional access: use .iloc rather than ser[0]. On a Series with string labels,
# plain [] with an integer relied on a positional fallback that pandas has deprecated.
ser.iloc[0]

1

In [42]:
# Label-based lookup: .loc selects by index name
ser.loc['one']

1

In [43]:
# Position-based lookup: .iloc selects by integer position in the index
ser.iloc[0]

1

Series使用切片

In [44]:
# Label slice with []: the right endpoint is included
ser['one':'two']

one    1
two    2
dtype: int64

In [45]:
# Integer slice with []: the right endpoint is excluded
ser[0:1]

one    1
dtype: int64

In [46]:
# .loc label slice: the right endpoint is included
ser.loc['one':'two']

one    1
two    2
dtype: int64

In [47]:
# .iloc position slice: the right endpoint is excluded
ser.iloc[0:1]

one    1
dtype: int64

Series列表索引，类似于ndarray的花式索引（fancy indexing）

In [57]:
# Passing a list of labels returns those entries in the order requested.
ser[['two','one','four','three']]

two      2
one      1
four     4
three    3
dtype: int64

In [58]:
# The label list may also name only a subset of the index.
ser[['two','one','four']]

two     2
one     1
four    4
dtype: int64

Series使用布尔索引

In [60]:
# A boolean list acts as a mask: only the positions marked True are kept.
ser[[True,False,False,True]]

one     1
four    4
dtype: int64

Series进行数学运算

In [61]:
# Show the Series before applying arithmetic to it.
ser

one      1
two      2
three    3
four     4
dtype: int64

In [62]:
# Scalar arithmetic is applied to every element; the labels are unchanged.
ser+1

one      2
two      3
three    4
four     5
dtype: int64

In [64]:
# A NumPy ufunc applied to a Series returns a Series with the same labels.
np.exp(ser)

one       2.718282
two       7.389056
three    20.085537
four     54.598150
dtype: float64

判断某个值是否在Series的index里

In [65]:
# The `in` operator on a Series tests membership among the index labels.
'two' in ser

True

In [67]:
# Explicit form of the same check: test membership on the Index itself.
'two' in ser.index

True

判断某个值是否在Series的values里

In [66]:
# To search the data rather than the labels, test against .values.
3 in ser.values

True

按照指定的index顺序生成Series

In [68]:
# Passing index= alongside a dict picks the entries by label, in the order listed.
dic = dict(one=1, two=2, three=3, four=4)
ser = pd.Series(data=dic, index=['two', 'one', 'four', 'three'])
ser

two      2
one      1
four     4
three    3
dtype: int64

In [69]:
# A label requested via index= that is absent from dic comes back as NaN
# (the result is float64 rather than int64 as a consequence).
ser = pd.Series(data=dic, index=['two', 'one', 'four', 'five'])
ser

two     2.0
one     1.0
four    4.0
five    NaN
dtype: float64

判断Series值是否为空

In [70]:
# Element-wise missing-value check via the top-level function; True marks NaN.
pd.isnull(ser)

two     False
one     False
four    False
five     True
dtype: bool

In [71]:
# Same missing-value check, written as a Series method.
ser.isnull()

two     False
one     False
four    False
five     True
dtype: bool

判断Series值是否非空

In [74]:
# Inverse check via the top-level function: True marks entries that are not missing.
pd.notnull(ser)

two      True
one      True
four     True
five    False
dtype: bool

In [75]:
# Same non-missing check, written as a Series method.
ser.notnull()

two      True
one      True
four     True
five    False
dtype: bool

Series在进行数学运算时的自动对齐特性  
相同的index值才进行运算，不同的index运算结果为空

In [82]:
# Second operand for the alignment demo, built from a dict with its own label order.
ser_2 = pd.Series(data=dict(one=3, two=7, four=5, three=9))
ser_2

one      3
two      7
four     5
three    9
dtype: int64

In [80]:
# Show ser again for comparison with ser_2.
ser

two     2.0
one     1.0
four    4.0
five    NaN
dtype: float64

In [83]:
# Addition aligns on labels: a label missing from either operand, or holding NaN, yields NaN.
ser + ser_2

five     NaN
four     9.0
one      4.0
three    NaN
two      9.0
dtype: float64

Series和Series.index的name属性

In [86]:
# Give the Series itself a name; it is shown as "Name:" in the repr.
ser.name = 'test_ser'
ser

two     2.0
one     1.0
four    4.0
five    NaN
Name: test_ser, dtype: float64

In [87]:
# The index can carry its own name, shown above the labels in the repr.
ser.index.name = 'test_index'
ser

test_index
two     2.0
one     1.0
four    4.0
five    NaN
Name: test_ser, dtype: float64

修改Series的index值

In [88]:
# Inspect the current Index, including its name.
ser.index

Index(['two', 'one', 'four', 'five'], dtype='object', name='test_index')

In [89]:
# Individual labels can be read by position.
ser.index[0]

'two'

In [91]:
# Index objects are immutable: assigning to a single position raises TypeError.
# The error is caught and printed so the demonstration does not abort a full run.
try:
    ser.index[0] = 'six'
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: Index does not support mutable operations

In [93]:
# Replacing the whole index in a single assignment is allowed
ser.index = ['five','six','seven','eight']
ser

five     2.0
six      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

当index数量很大时，可以使用rename方法只修改其中的部分标签

In [94]:
# Show the Series before renaming.
ser

five     2.0
six      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [96]:
# rename() with a mapping relabels only the listed entries and returns a new Series.
ser.rename({'six':'ten'})

five     2.0
ten      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [97]:
# The original Series still carries the old label.
ser

five     2.0
six      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

In [98]:
# Rebind the name to the renamed copy so the new label sticks. This gives the same
# visible result as rename(..., inplace=True) while keeping the call chainable.
ser = ser.rename({'six': 'ten'})
ser

five     2.0
ten      1.0
seven    4.0
eight    NaN
Name: test_ser, dtype: float64

如何生成DataFrame对象

In [48]:
# A flat list becomes a single column; columns= supplies that column's name.
frame = pd.DataFrame(data=[1, 2, 3, 4], columns=['data'])
frame

Unnamed: 0,data
0,1
1,2
2,3
3,4


In [49]:
# Same single-column frame, this time with explicit row labels.
frame = pd.DataFrame(
    data=[1, 2, 3, 4],
    index=['one', 'two', 'three', 'four'],
    columns=['data'],
)
frame

Unnamed: 0,data
one,1
two,2
three,3
four,4


In [50]:
# Random integer demo data in [1, 20) with shape (4, 2). A seeded Generator keeps
# the table identical on every run instead of changing each time the cell executes.
rng = np.random.default_rng(0)
arr = rng.integers(1, 20, size=(4, 2))
frame = pd.DataFrame(arr, columns=['age', 'score'])
frame

Unnamed: 0,age,score
0,16,14
1,16,11
2,13,7
3,19,10


In [51]:
# A dict of equal-length lists: each key becomes a column.
dic = {
    'age': [1, 8, 10, 11],
    'score': [14, 17, 16, 9],
}
pd.DataFrame(data=dic)

Unnamed: 0,age,score
0,1,14
1,8,17
2,10,16
3,11,9


DataFrame对象的values，index和columns属性

In [52]:
# Build the two-column frame whose attributes are inspected next.
dic = {
    'age': [1, 8, 10, 11],
    'score': [14, 17, 16, 9],
}
frame = pd.DataFrame(data=dic)
frame

Unnamed: 0,age,score
0,1,14
1,8,17
2,10,16
3,11,9


In [53]:
# .values returns the data as a 2-D NumPy array, one inner row per DataFrame row.
frame.values

array([[ 1, 14],
       [ 8, 17],
       [10, 16],
       [11,  9]], dtype=int64)

In [54]:
# Row labels; no index was supplied, so this is the default RangeIndex.
frame.index

RangeIndex(start=0, stop=4, step=1)

In [55]:
# Column labels, held in an Index object.
frame.columns

Index(['age', 'score'], dtype='object')

DataFrame的shape属性

In [56]:
# .shape is (number of rows, number of columns).
frame.shape

(4, 2)

检查是否存在空值

In [72]:
# Element-wise missing-value check; returns a boolean DataFrame with the same labels.
pd.isnull(frame)

Unnamed: 0,age,score
0,False,False
1,False,False
2,False,False
3,False,False


检查值是否非空

In [76]:
# Element-wise non-missing check via the top-level function.
pd.notnull(frame)

Unnamed: 0,age,score
0,True,True
1,True,True
2,True,True
3,True,True


In [77]:
# Same non-missing check, written as a DataFrame method.
frame.notnull()

Unnamed: 0,age,score
0,True,True
1,True,True
2,True,True
3,True,True


使用等长度列表来生成DataFrame

In [100]:
# A list of equal-length lists: each inner list becomes one row,
# and both axes receive default integer labels.
list_1 = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
]
frame = pd.DataFrame(data=list_1)
frame

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,5,6,7,8


In [101]:
# A dict of equal-length lists: each key becomes a column, each list its values.
dic_1 = {
    'a': [1, 2, 3, 4],
    'b': [5, 6, 7, 8],
}
frame_1 = pd.DataFrame(data=dic_1)
frame_1

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [102]:
# Two columns of standard-normal samples, four values each. A seeded Generator
# makes the values reproducible from run to run.
rng = np.random.default_rng(0)
dic_2 = {'a': rng.standard_normal(4), 'b': rng.standard_normal(4)}
frame_2 = pd.DataFrame(dic_2)
frame_2

Unnamed: 0,a,b
0,0.734728,-2.092889
1,-0.209118,1.013932
2,1.33909,-0.388475
3,-0.239243,-0.538564


获取前5行和后5行

In [103]:
# 7x8 table of uniform samples in [0, 1). Seeded so that repeated runs
# produce the same table.
rng = np.random.default_rng(0)
frame = pd.DataFrame(rng.random((7, 8)))
frame

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.675063,0.937895,0.605752,0.106039,0.556261,0.78596,0.017867,0.107635
1,0.652966,0.87427,0.49526,0.661086,0.764298,0.48056,0.047513,0.841197
2,0.030859,0.295707,0.12611,0.592902,0.324621,0.509874,0.035734,0.442179
3,0.570024,0.679402,0.820676,0.527923,0.218526,0.635337,0.950778,0.060316
4,0.40471,0.005134,0.587389,0.637513,0.925393,0.14172,0.818558,0.173224
5,0.936124,0.706591,0.269316,0.231274,0.924326,0.283007,0.54452,0.955634
6,0.919811,0.852866,0.905864,0.400672,0.317105,0.768616,0.250232,0.051243


In [104]:
# head() with no argument returns the first 5 rows.
frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.675063,0.937895,0.605752,0.106039,0.556261,0.78596,0.017867,0.107635
1,0.652966,0.87427,0.49526,0.661086,0.764298,0.48056,0.047513,0.841197
2,0.030859,0.295707,0.12611,0.592902,0.324621,0.509874,0.035734,0.442179
3,0.570024,0.679402,0.820676,0.527923,0.218526,0.635337,0.950778,0.060316
4,0.40471,0.005134,0.587389,0.637513,0.925393,0.14172,0.818558,0.173224


In [105]:
# tail() with no argument returns the last 5 rows.
frame.tail()

Unnamed: 0,0,1,2,3,4,5,6,7
2,0.030859,0.295707,0.12611,0.592902,0.324621,0.509874,0.035734,0.442179
3,0.570024,0.679402,0.820676,0.527923,0.218526,0.635337,0.950778,0.060316
4,0.40471,0.005134,0.587389,0.637513,0.925393,0.14172,0.818558,0.173224
5,0.936124,0.706591,0.269316,0.231274,0.924326,0.283007,0.54452,0.955634
6,0.919811,0.852866,0.905864,0.400672,0.317105,0.768616,0.250232,0.051243


当使用字典作为数值参数来生成DataFrame时，通过columns参数指定列的顺序。

In [106]:
# Recall the dict used as input in the following cells.
dic_1

{'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}

In [107]:
# Without columns=, the columns follow the dict's key order.
pd.DataFrame(dic_1)

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7
3,4,8


In [108]:
# columns= sets the column order.
pd.DataFrame(dic_1,columns=['b','a'])

Unnamed: 0,b,a
0,5,1
1,6,2
2,7,3
3,8,4


In [109]:
# columns= can also select just a subset of the dict's keys.
pd.DataFrame(dic_1,columns=['b'])

Unnamed: 0,b
0,5
1,6
2,7
3,8


In [114]:
# A column name that is not a key of the dict produces a column of missing values
pd.DataFrame(dic_1,columns=['b','d'])

Unnamed: 0,b,d
0,5,
1,6,
2,7,
3,8,


指定index值时的规则

In [125]:
# Row labels are supplied with index=; here there are four labels for four data rows.
frame = pd.DataFrame(
    data=dic_1,
    columns=['b', 'd'],
    index=['one', 'two', 'three', 'four'],
)
frame

Unnamed: 0,b,d
one,5,
two,6,
three,7,
four,8,


In [119]:
# The number of index labels must match the length of the data: passing too many
# or too few raises ValueError. The error is caught and printed so the demonstration
# does not abort a full run.
try:
    pd.DataFrame(dic_1, columns=['b', 'd'], index=['one', 'two', 'three'])
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: Shape of passed values is (2, 4), indices imply (2, 3)

DataFrame对列的索引，生成Series

In [128]:
# Four labelled rows with columns a and b.
frame = pd.DataFrame(
    data=dic_1,
    columns=['a', 'b'],
    index=['one', 'two', 'three', 'four'],
)
frame

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [130]:
# Selecting one column by name with [] returns a Series.
frame['a']

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [131]:
# Attribute access returns the same column as frame['a'].
frame.a

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [141]:
# .loc form: all rows, the column labelled 'a'.
frame.loc[:,'a']

one      1
two      2
three    3
four     4
Name: a, dtype: int64

In [142]:
# .iloc form: all rows, the column at position 0.
frame.iloc[:,0]

one      1
two      2
three    3
four     4
Name: a, dtype: int64

DataFrame对列的索引，生成DataFrame

In [132]:
# A one-element list of column names returns a single-column DataFrame instead of a Series.
frame[['a']]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [134]:
# Slice syntax is not allowed inside a list literal, so frame[['a':'b']] is rejected
# by the parser itself and cannot be caught in place. Passing the expression to
# compile() shows the SyntaxError without aborting a full run of the notebook.
try:
    compile("frame[['a':'b']]", "<slice-in-list>", "eval")
except SyntaxError as exc:
    print(f"{type(exc).__name__}: {exc}")

SyntaxError: invalid syntax (<ipython-input-134-636991682c59>, line 2)

In [135]:
# List every wanted column name explicitly to get a DataFrame.
frame[['a','b']]

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [137]:
# .loc with a list of column labels returns a DataFrame, even for a single column.
frame.loc[:,['a']]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [139]:
# .loc with several column labels in a list.
frame.loc[:,['a','b']]

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [140]:
# .loc accepts a label slice for columns; both endpoints are included.
frame.loc[:,'a':'b']

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


In [145]:
# .iloc with a list of column positions returns a DataFrame.
frame.iloc[:,[0]]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [146]:
# .iloc position slice; the end position is excluded, so only column 0 is returned.
frame.iloc[:,0:1]

Unnamed: 0,a
one,1
two,2
three,3
four,4


In [148]:
# .iloc with several column positions in a list.
frame.iloc[:,[0,1]]

Unnamed: 0,a,b
one,1,5
two,2,6
three,3,7
four,4,8


DataFrame对行的索引

In [149]:
# A single row label returns that row as a Series
frame.loc['one']

a    1
b    5
Name: one, dtype: int64

In [150]:
# A label slice of rows returns a DataFrame (the right endpoint is included)
frame.loc['one':'two']

Unnamed: 0,a,b
one,1,5
two,2,6


In [151]:
# A one-element list of row labels returns a DataFrame
frame.loc[['one']]

Unnamed: 0,a,b
one,1,5


In [152]:
# Several row labels in a list return the matching rows as a DataFrame.
frame.loc[['one','three']]

Unnamed: 0,a,b
one,1,5
three,3,7
