# Pandas基础

In [1]:
import pandas as pd
import numpy as np

In [2]:
# A Series built from a plain list; the NaN entry makes the dtype float64
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
s

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

In [3]:
# Six consecutive calendar days starting at 2016-01-01
dates = pd.date_range(start='20160101', periods=6)
dates

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

# DataFrame的使用

In [4]:
# Build a 6x4 DataFrame of standard-normal samples, using `dates` as the row
# index and 'a'..'d' as the column labels.
# The samples come from a locally seeded RandomState so the frame is identical
# on every Restart & Run All, without altering NumPy's global random state.
df = pd.DataFrame(
    np.random.RandomState(42).randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,a,b,c,d
2016-01-01,-0.598624,-0.730161,0.486057,0.163667
2016-01-02,-2.517562,-0.275055,1.314815,2.658326
2016-01-03,1.534812,1.022975,-0.413207,0.708282
2016-01-04,0.24518,0.302458,-0.608806,-1.352564
2016-01-05,0.593149,0.505962,-0.058204,0.516191
2016-01-06,1.016878,-1.077706,-0.767148,-0.116165


In [5]:
# When no index/columns are given, both axes are labelled 0, 1, 2, 3, ...
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
df1

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [6]:
# Build a DataFrame from a dict mapping column name -> column values.
# The scalar entries ('A', 'B', 'F') are repeated on every row.
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.full(4, 3, dtype='int32'),
    'E': pd.Categorical(["test", "train"] * 2),
    'F': 'foo',
})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# Data type of each column
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [8]:
# Row index (the row labels) of the frame
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [9]:
# Column labels of the frame
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [10]:
# The frame's contents as a NumPy array (object dtype here, since the columns have mixed types)
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max); only the numeric columns A, C, D are reported
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [12]:
# Reorder the columns by their labels, in descending order (F ... A)
df2.sort_index(axis='columns', ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1.0
1,foo,train,3,1.0,2013-01-02,1.0
2,foo,test,3,1.0,2013-01-02,1.0
3,foo,train,3,1.0,2013-01-02,1.0


In [13]:
# Reorder the rows by their index labels, in descending order (3 ... 0)
df2.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
0,1.0,2013-01-02,1.0,3,test,foo


In [14]:
# Sort the rows by the values in column 'E'
df2.sort_values('E')

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
3,1.0,2013-01-02,1.0,3,train,foo
