# DataFrame的简介
## DataFrame的创建
DataFrame 是一种表格型的数据结构，同时带有行索引和列索引；可以把它看作带标签的二维 ndarray，但不同的列可以有不同的数据类型。

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a DataFrame from a dict: every key ('a', 'b', 'c') becomes a column label.
# The three values here are a plain list, a Series and an ndarray, each of length 4.
data = pd.DataFrame(
    {
        'a': [1, 2, 3, 4],
        'b': pd.Series(1, index=list(range(4)), dtype='float32'),
        'c': np.full(4, 3, dtype='int32'),
    }
)


In [5]:
# Display the DataFrame built from the dict of columns.
data

Unnamed: 0,a,b,c
0,1,1.0,3
1,2,1.0,3
2,3,1.0,3
3,4,1.0,3


In [13]:
# Row labels are supplied through `index`, column labels through `columns`.
xuhao = ['one', 'two', 'three', 'four', 'five', 'six']
# Draw a 6x4 block of samples from the standard normal distribution.
# No seed is set in this cell, so the values differ from run to run.
samples = np.random.standard_normal((6, 4))
df = pd.DataFrame(samples, index=xuhao, columns=['a', 'b', 'c', 'd'])

In [14]:
# Display df: six labelled rows by four columns of random values.
df

Unnamed: 0,a,b,c,d
one,0.193501,-1.042922,0.665838,-1.760604
two,-1.903137,0.750669,-0.729288,-0.698876
three,0.951065,-0.151784,-1.319253,0.225157
four,-1.071918,1.174068,1.098785,0.23715
five,0.026197,-0.77303,-0.390808,-0.637051
six,0.305107,-1.125128,0.269343,-1.824455


In [15]:
# Inspect the labels of the DataFrame; `columns` holds the column labels.
df.columns

Index(['a', 'b', 'c', 'd'], dtype='object')

In [16]:
# `index` holds the row labels.
df.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

### 选取DataFrame的一列

In [17]:
# Attribute access selects column 'a' and returns it as a Series.
df.a
# The row index is preserved in the result.

one      0.193501
two     -1.903137
three    0.951065
four    -1.071918
five     0.026197
six      0.305107
Name: a, dtype: float64

In [19]:
# Bracket access with a single column label (a string, not a list)
# returns the same Series as df.a.
df['a']

one      0.193501
two     -1.903137
three    0.951065
four    -1.071918
five     0.026197
six      0.305107
Name: a, dtype: float64

In [21]:
# To select several columns, the labels must be passed as a list;
# the result is a DataFrame.
df[['a', 'b']]
# The row index is preserved.

Unnamed: 0,a,b
one,0.193501,-1.042922
two,-1.903137,0.750669
three,0.951065,-0.151784
four,-1.071918,1.174068
five,0.026197,-0.77303
six,0.305107,-1.125128


In [24]:
# Select rows: a slice inside [] works on row positions (end excluded),
# so 0:1 returns only the first row.
df[0:1]

Unnamed: 0,a,b,c,d
one,0.193501,-1.042922,0.665838,-1.760604


### 精确选取
使用 `loc` 索引器（用方括号而不是圆括号调用），按照行、列的标签（名称）取值

In [25]:
# With a single label, .loc selects a row by default and returns it as a Series.
df.loc['one']

a    0.193501
b   -1.042922
c    0.665838
d   -1.760604
Name: one, dtype: float64

In [27]:
# Select several rows by passing a list of row labels.
df.loc[['one', 'two']]

Unnamed: 0,a,b,c,d
one,0.193501,-1.042922,0.665838,-1.760604
two,-1.903137,0.750669,-0.729288,-0.698876


In [29]:
# Select rows and columns together: row labels first, then column labels.
df.loc[['one', 'two'],['a','d']]

Unnamed: 0,a,d
one,0.193501,-1.760604
two,-1.903137,-0.698876


In [32]:
# Use ':' in the row position to take every row, limited to columns 'a' and 'b'.
df.loc[: , ['a', 'b']]

Unnamed: 0,a,b
one,0.193501,-1.042922
two,-1.903137,0.750669
three,0.951065,-0.151784
four,-1.071918,1.174068
five,0.026197,-0.77303
six,0.305107,-1.125128


### 按照顺序去取值
使用 `iloc` 索引器，按照整数位置（从 0 开始）取值

In [34]:
# .iloc selects by integer position; position 3 is the fourth row ('four'),
# returned as a Series.
df.iloc[3]

a   -1.071918
b    1.174068
c    1.098785
d    0.237150
Name: four, dtype: float64

In [36]:
# Rows at positions 1 and 2 (slice end excluded), column at position 2 ('c').
df.iloc[1:3, 2]

two     -0.729288
three   -1.319253
Name: c, dtype: float64

In [37]:
# Row at position 1 with every column; a row slice keeps the result a DataFrame.
df.iloc[1:2, :]

Unnamed: 0,a,b,c,d
two,-1.903137,0.750669,-0.729288,-0.698876


In [38]:
# Select a single value: row position 2, column position 3.
df.iloc[2,3]

0.2251572718323901

## 按条件筛选（布尔索引）

In [39]:
# Keep the values greater than 0; every other cell becomes NaN
# (the shape of the frame is unchanged).
df[df>0]

Unnamed: 0,a,b,c,d
one,0.193501,,0.665838,
two,,0.750669,,
three,0.951065,,,0.225157
four,,1.174068,1.098785,0.23715
five,0.026197,,,
six,0.305107,,0.269343,


In [40]:
# Keep only the rows whose value in the first column ('a') is less than 0.
df[df.a<0]

Unnamed: 0,a,b,c,d
two,-1.903137,0.750669,-0.729288,-0.698876
four,-1.071918,1.174068,1.098785,0.23715


## 增加一列
通过赋值增加一列

In [43]:
# Assigning a scalar to a new column label adds column 'e' with 3 in every row.
df['e'] = 3

In [44]:
# Display df to check the new column 'e'.
df

Unnamed: 0,a,b,c,d,e
one,0.193501,-1.042922,0.665838,-1.760604,3
two,-1.903137,0.750669,-0.729288,-0.698876,3
three,0.951065,-0.151784,-1.319253,0.225157,3
four,-1.071918,1.174068,1.098785,0.23715,3
five,0.026197,-0.77303,-0.390808,-0.637051,3
six,0.305107,-1.125128,0.269343,-1.824455,3


In [45]:
# Assign an array with one value per row (0..5) as the new column 'f'.
df['f'] = np.arange(6)

In [46]:
# Display df to check the new column 'f'.
df

Unnamed: 0,a,b,c,d,e,f
one,0.193501,-1.042922,0.665838,-1.760604,3,0
two,-1.903137,0.750669,-0.729288,-0.698876,3,1
three,0.951065,-0.151784,-1.319253,0.225157,3,2
four,-1.071918,1.174068,1.098785,0.23715,3,3
five,0.026197,-0.77303,-0.390808,-0.637051,3,4
six,0.305107,-1.125128,0.269343,-1.824455,3,5


In [48]:
# Add a column from a pd.Series: values are matched to rows by index label,
# so the rows without a matching label ('two' and 'four') end up as NaN.
df['g'] = pd.Series(np.arange(4), index = ['one','three','five','six'])

In [49]:
# Display df to check the new column 'g'.
df

Unnamed: 0,a,b,c,d,e,f,g
one,0.193501,-1.042922,0.665838,-1.760604,3,0,0.0
two,-1.903137,0.750669,-0.729288,-0.698876,3,1,
three,0.951065,-0.151784,-1.319253,0.225157,3,2,1.0
four,-1.071918,1.174068,1.098785,0.23715,3,3,
five,0.026197,-0.77303,-0.390808,-0.637051,3,4,2.0
six,0.305107,-1.125128,0.269343,-1.824455,3,5,3.0


In [50]:
# Nested dict used to build a DataFrame: the outer keys ('one', 'two') label the
# columns and the inner keys ('a', 'b', 'e') label the rows.
# NOTE(review): 'a' holds ints while 'b' and 'e' hold strings - confirm the mix is intended.
zd = dict(
    one=dict(a=12, b='13'),
    two=dict(a=34, e='35'),
)

In [52]:
# Build a DataFrame from the nested dict zd.
# Note: this rebinds the name `data`, which held a different DataFrame earlier in the notebook.
data = pd.DataFrame(zd)

In [53]:
# Display the DataFrame built from the nested dict; missing (row, column) pairs show as NaN.
data

Unnamed: 0,one,two
a,12.0,34.0
b,13.0,
e,,35.0
