In [3]:
import pandas as pd
import numpy as np

### Series
Series的字符串表现形式为：索引在左边，值在右边。
    
由于我们没有为数据指定索引，于是会自动创建一个0到N-1（N为长度）的整数型索引。

In [4]:
# Build a Series from a plain list without passing an index, so pandas
# assigns the default integer labels 0..5. One entry is NaN (missing value).
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
# Printing shows the index labels on the left and the values on the right.
print(s)

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64


### DataFrame
DataFrame是一个表格型的数据结构，它包含有一组有序的列，每列可以是不同的值类型（数值，字符串，布尔值等）。
        
DataFrame既有行索引也有列索引，它可以被看做由Series组成的大字典（各列共用同一个行索引）。

In [10]:
# Six consecutive daily timestamps starting at 2019-07-14; used as the row index.
dates = pd.date_range('20190714', periods=6)
print(dates)
# 6 rows x 4 columns of standard-normal samples, labelled a..d.
# The samples come from a dedicated, seeded RandomState so the frame is
# identical on every "Restart Kernel -> Run All"; an unseeded
# np.random.randn call yields different numbers on each run. The values
# therefore differ from any output recorded from an earlier unseeded run.
df = pd.DataFrame(
    np.random.RandomState(0).randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

DatetimeIndex(['2019-07-14', '2019-07-15', '2019-07-16', '2019-07-17',
               '2019-07-18', '2019-07-19'],
              dtype='datetime64[ns]', freq='D')
                   a         b         c         d
2019-07-14 -0.183047  0.129060 -0.178748 -0.048714
2019-07-15  0.445466  0.239436 -2.474375 -0.609477
2019-07-16 -0.671901 -1.057606 -0.334523 -0.110975
2019-07-17 -0.229298  0.027984 -0.828452  0.404956
2019-07-18 -0.481123  0.392407  0.801153 -0.587273
2019-07-19  0.626338 -1.371531 -0.320659 -1.753163


In [11]:
# Pull out the single column labelled 'b' (all rows) and print it as a Series.
print(df.loc[:, 'b'])

2019-07-14    0.129060
2019-07-15    0.239436
2019-07-16   -1.057606
2019-07-17    0.027984
2019-07-18    0.392407
2019-07-19   -1.371531
Freq: D, Name: b, dtype: float64


In [12]:
# The integers 0..11 arranged as 3 rows x 4 columns. No index or column
# labels are given, so both axes get default integer labels.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [13]:
# Build a frame from a dict: each key becomes a column label. The scalar
# entries (A, B, F) are repeated for every row, while C, D and E each
# supply four values and each column keeps its own value type.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train"] * 2),
        'F': 'foo',
    }
)

print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo


In [14]:
# Show the value type of every column of df2, keyed by column label.
print(df2.dtypes)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [15]:
# Show the row labels (index) of df2.
print(df2.index)

Int64Index([0, 1, 2, 3], dtype='int64')


In [16]:
# Show the column labels of df2.
print(df2.columns)

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')


In [17]:
# Show only the cell values of df2 as a 2-D array, one row per frame row;
# the row index and column labels are not part of it.
print(df2.values)

[[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']]


In [18]:
df2.describe()
# The result automatically leaves out the non-numeric columns

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [19]:
# Transpose df2: column labels A..F become the rows, row labels 0..3 the columns.
print(df2.transpose())

                     0                    1                    2  \
A                    1                    1                    1   
B  2013-01-02 00:00:00  2013-01-02 00:00:00  2013-01-02 00:00:00   
C                    1                    1                    1   
D                    3                    3                    3   
E                 test                train                 test   
F                  foo                  foo                  foo   

                     3  
A                    1  
B  2013-01-02 00:00:00  
C                    1  
D                    3  
E                train  
F                  foo  


In [20]:
# axis='columns' (same as axis=1) orders by column label;
# ascending=False reverses the order, so the columns come out F..A.
print(df2.sort_index(axis='columns', ascending=False))

     F      E  D    C          B    A
0  foo   test  3  1.0 2013-01-02  1.0
1  foo  train  3  1.0 2013-01-02  1.0
2  foo   test  3  1.0 2013-01-02  1.0
3  foo  train  3  1.0 2013-01-02  1.0


In [24]:
# axis='index' (same as axis=0) orders by row label;
# ascending=False reverses the order, so the rows come out 3..0.
print(df2.sort_index(axis='index', ascending=False))

     A          B    C  D      E    F
3  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
0  1.0 2013-01-02  1.0  3   test  foo


In [25]:
# Order the rows by the values in column 'B' (ascending by default).
# Every row of df2 holds the same timestamp in 'B'.
print(df2.sort_values('B'))

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
