In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Sample objects reused by later cells: an eight-day DatetimeIndex, a
# five-element Series labelled 'a'..'e', and an 8x3 DataFrame of random normals.
index = pd.date_range('1/1/2020', periods=8)
s = pd.Series(np.random.randn(5), index=list('abcde'))
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=list('ABC'),
)
df

Unnamed: 0,A,B,C
2020-01-01,-1.837389,-0.502021,-0.51111
2020-01-02,-0.251983,0.174675,0.025821
2020-01-03,0.453611,0.320801,0.658286
2020-01-04,1.066931,0.87784,-0.427331
2020-01-05,1.141333,0.090507,-0.032995
2020-01-06,0.425389,-1.636299,-1.454113
2020-01-07,0.863827,0.43884,-1.800895
2020-01-08,-3.199249,0.532554,0.977505


### Head与Tail
- 用于快速浏览`Series`与`DataFrame`

In [3]:
# A 1000-element Series is too long to read in full; head() previews the
# first five entries.
long_series = pd.Series(data=np.random.randn(1000))
long_series.head(n=5)

0    0.264154
1    1.976132
2   -1.214458
3   -0.338936
4    1.766516
dtype: float64

In [4]:
# tail() is the counterpart of head(): it previews the last five entries.
long_series.tail(n=5)

995    1.203551
996   -1.406152
997    0.062369
998   -0.461022
999   -0.546006
dtype: float64

### 属性与底层数据
- shape 输出对象的轴维度
- Series：`index`（唯一的轴标签）
- DataFrame：`index`（行标签）与 `columns`（列标签）

In [5]:
# Show the axis labels of both objects side by side: the Series' default
# integer index and the DataFrame's date index.
(long_series.index, df.index)

(RangeIndex(start=0, stop=1000, step=1),
 DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
                '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08'],
               dtype='datetime64[ns]', freq='D'))

In [6]:
# Extract the underlying NumPy array: Series.to_numpy() and np.asarray() give
# the same values, so the element-wise comparison is True for every entry.
np.asarray(s) == s.to_numpy()

array([ True,  True,  True,  True,  True])

### DataFrame提取数据
- 1. 当所有列的数据类型都一样时，`DataFrame.to_numpy()` 直接返回底层数据
- 2. 对于同构型数据，即所有列的数据类型一样，可以就地修改返回的 ndarray，修改会反映到原数据结构中
- 3. 对于异构型数据，即各列的数据类型不一样，输出数组的 dtype 会选择能容纳所有数据的类型：含字符串时为 object，只有浮点数和整数时为浮点型

In [7]:
# Every column of df holds floats, so the extracted array has one numeric dtype.
df.to_numpy(copy=False).dtype

dtype('float64')

In [8]:
# A column mixing an int and a str cannot be stored as a numeric dtype, so the
# extracted array falls back to object.
df_t1 = pd.DataFrame(data=[1, '12'])
df_t1.to_numpy().dtype

dtype('O')

### 加速操作

In [20]:
# With the optional numexpr and bottleneck libraries, pandas can accelerate
# certain binary (two-operand) numeric and boolean operations; switch both on.
pd.options.compute.use_bottleneck = True
pd.options.compute.use_numexpr = True

In [21]:
import cProfile

# Two wide frames of random normals (100 rows x 100000 columns each).
# NOTE(review): presumably operands for timing the accelerated operations;
# no cell in view uses them before df1/df2 are reassigned -- confirm.
df1, df2 = (pd.DataFrame(np.random.randn(100, 100000)) for _ in range(2))

In [22]:
# Read the option back to confirm whether the numexpr code path is enabled.
pd.options.compute.use_numexpr

True

### 二元操作

In [26]:
# 1.多维与低维对象之间的广播机制
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

Unnamed: 0,one,two,three
a,-1.384536,-0.819317,
b,-0.737517,-0.389764,-0.704417
c,-0.813361,-0.901278,1.321916
d,,-0.405067,-0.707503


In [32]:
# Take the second row and the 'two' column as the lower-dimensional operands
# for the broadcasting examples.
row, colum = df.iloc[1], df['two']
(row, colum)

(one     -0.737517
 two     -0.389764
 three   -0.704417
 Name: b, dtype: float64,
 a   -0.819317
 b   -0.389764
 c   -0.901278
 d   -0.405067
 Name: two, dtype: float64)

In [38]:
# Match `row` against the column labels and subtract it from every row of df
# (axis='columns' is the spelled-out form of axis=1).
df.sub(row, axis='columns')

Unnamed: 0,one,two,three
a,-0.647019,-0.429554,
b,0.0,0.0,0.0
c,-0.075844,-0.511514,2.026333
d,,-0.015304,-0.003086


In [40]:
# 缺失值与填充缺失值操作
# 缺失值相互运算仍然是缺失值
df2 = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd'])})
df + df2

Unnamed: 0,one,two,three
a,-1.024335,-2.873954,
b,-0.138485,-1.42698,-1.770998
c,-2.557931,-1.900887,0.534913
d,,-1.067135,-2.370399


In [41]:
# 但是当其中一个为缺失值时设置一个填充的值来进行运算 当都为缺失值时 不做填充
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,-1.024335,-2.873954,0.80972
b,-0.138485,-1.42698,-1.770998
c,-2.557931,-1.900887,0.534913
d,,-1.067135,-2.370399


### 比较操作
- `Series`,`DataFrame`支持`eq`,`ne`,`lt`,`gt`,`le`,`ge`形式的比较操作
- 比较操作支持广播

In [43]:
# Element-wise "greater than" through the flexible comparison method.
df.gt(other=df2)

Unnamed: 0,one,two,three
a,False,True,False
b,False,True,True
c,True,True,True
d,False,True,True


### 布尔简化
- empty、any、all、bool可以把数据汇总简化至单个布尔值

In [45]:
# Reduce per column: is every value in the column greater than zero?
df.gt(0).all()

one      False
two      False
three    False
dtype: bool

- pandas 中的空值（NaN）彼此不相等，因此含有 NaN 的列逐元素比较后 `all()` 的结果为 False

In [46]:
# Compare df + df with df * 2 element by element, then reduce per column.
# Columns that contain NaN come out False because NaN never equals NaN.
df.add(df).eq(df.mul(2)).all()

one      False
two       True
three    False
dtype: bool

- 可以用对象方法`equals`来验证数据是否等效

In [48]:
# equals() compares the two frames as a whole and treats NaNs in the same
# position as equal, so this check succeeds.
df.add(df).equals(df.mul(2))

True

- 比较长度不同的 `Series` 或 `Index` 时，pandas 会抛出 `ValueError`；NumPy 则会先尝试广播，旧版本在无法广播时返回 False

### 合并重叠数据集

In [49]:
# Two partially overlapping frames: df1 has five rows, df2 has six, and each
# contains NaN holes in columns 'A' and 'B'.
df1 = pd.DataFrame(dict(
    A=[1., np.nan, 3., 5., np.nan],
    B=[np.nan, 2., 3., np.nan, 6.],
))
df2 = pd.DataFrame(dict(
    A=[5., 2., 4., np.nan, 3., 7.],
    B=[np.nan, np.nan, 3., 4., 6., 8.],
))
(df1, df2)
    

(     A    B
 0  1.0  NaN
 1  NaN  2.0
 2  3.0  3.0
 3  5.0  NaN
 4  NaN  6.0,
      A    B
 0  5.0  NaN
 1  2.0  NaN
 2  4.0  3.0
 3  NaN  4.0
 4  3.0  6.0
 5  7.0  8.0)

In [53]:
# Prefer df1's value wherever it is present, fall back to df2's value
# otherwise; a cell that is missing in both frames stays NaN.
df1.combine_first(other=df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


In [None]:
# DataFrame.combine needs both the other frame and a column-wise combiner
# function; passing only a lambda raises TypeError. The combiner receives the
# aligned columns of the two frames and returns the merged column: here it
# keeps df1's value where present and takes df2's value where df1 is NaN,
# which mirrors df1.combine_first(df2).
df1.combine(df2, lambda x, y: np.where(pd.isna(x), y, x))