# DataFrame
- 二维数组DataFrame：是一个表格型的数据结构，包含一组有序的列，每一列的值类型可以不同（数值、字符串、布尔值等）
- DataFrame中的数据以一个或多个二维块存放，而不是以列表、字典或其他一维数据结构存放

In [3]:
import pandas as pd
import numpy as np
# A DataFrame carries row labels (index) and column labels (columns).
#   .index   -> the row labels
#   .columns -> the column labels
#   .values  -> the underlying data, returned as a numpy ndarray
data = dict(
    name=['a', 'b', 'c'],
    age=[1, 2, 3],
    gender=['m', 'm', 'w'],
)
frame = pd.DataFrame(data=data)

# Show the table first, then its Python type on the following line.
print(frame, type(frame), sep='\n')

# For each attribute, print its value followed by the attribute's type.
print(frame.index, '\n该数据类型为：', type(frame.index))
print(frame.columns, '\n该数据类型为', type(frame.columns))
print(frame.values, '\n该数据类型为', type(frame.values))

  name  age gender
0    a    1      m
1    b    2      m
2    c    3      w
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=3, step=1) 
该数据类型为： <class 'pandas.core.indexes.range.RangeIndex'>
Index(['name', 'age', 'gender'], dtype='object') 
该数据类型为 <class 'pandas.core.indexes.base.Index'>
[['a' 1 'm']
 ['b' 2 'm']
 ['c' 3 'w']] 
该数据类型为 <class 'numpy.ndarray'>


# DataFrame 创建

In [6]:
# Construction, approach 1: a dict whose values are arrays / lists.
# Constructor: pandas.DataFrame()

data1 = dict(a=[1, 2, 3], b=[4, 5, 6])
data2 = {label: np.random.rand(3) for label in ('one', 'two')}

print(data1, data2, sep='\n')

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)
print(df1, df2, sep='\n')

# `columns` parameter: sets the column order (passed as a list); a name that
# is absent from the data produces a column of NaN.
# The list may also name fewer columns than the data holds.
df1 = pd.DataFrame(data1, columns=['b', 'c', 'a', 'b'])
print(df1)
df1 = pd.DataFrame(data1, columns=['b', 'c'])
print(df1)

# `index` parameter: replaces the row labels (passed as a list); its length
# must match the number of rows.
df2 = pd.DataFrame(data2, index=['f1', 'f2', 'f3'])
print(df2)

{'a': [1, 2, 3], 'b': [4, 5, 6]}
{'one': array([0.51410767, 0.83528956, 0.74199338]), 'two': array([0.00354538, 0.96651467, 0.89534564])}
   a  b
0  1  4
1  2  5
2  3  6
        one       two
0  0.514108  0.003545
1  0.835290  0.966515
2  0.741993  0.895346
   b    c  a  b
0  4  NaN  1  4
1  5  NaN  2  5
2  6  NaN  3  6
   b    c
0  4  NaN
1  5  NaN
2  6  NaN
         one       two
f1  0.514108  0.003545
f2  0.835290  0.966515
f3  0.741993  0.895346


In [4]:
# Construction, approach 2: a dict of Series.
# The dict keys become the columns and the Series labels become the index;
# a Series built without an explicit index gets the default integer labels.
# When the Series differ in length, the missing positions are filled with NaN.
data1 = dict(
    one=pd.Series(np.random.rand(2)),
    two=pd.Series(np.random.rand(3)),
)
data2 = dict(
    one=pd.Series(np.random.rand(2), index=list('ab')),
    two=pd.Series(np.random.rand(3), index=list('abc')),
)
print(data1, data2, sep='\n')

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)
print(df1, df2, sep='\n')

{'one': 0    0.253376
1    0.694209
dtype: float64, 'two': 0    0.243005
1    0.252494
2    0.618899
dtype: float64}
{'one': a    0.902048
b    0.425521
dtype: float64, 'two': a    0.969927
b    0.848560
c    0.361410
dtype: float64}
        one       two
0  0.253376  0.243005
1  0.694209  0.252494
2       NaN  0.618899
        one       two
a  0.902048  0.969927
b  0.425521  0.848560
c       NaN  0.361410


In [5]:
# Construction, approach 3: directly from a two-dimensional array.
# Without explicit index / columns, both default to integer labels.
# Explicit index / columns must match the array's dimensions in length.
ar = np.random.rand(9).reshape((3, 3))
print(ar)

df1 = pd.DataFrame(ar)
df2 = pd.DataFrame(
    ar,
    index=list('abc'),
    columns=['one', 'two', 'three'],
)
print(df1, df2, sep='\n')

[[0.97620446 0.99309431 0.36477411]
 [0.46306879 0.62926255 0.80124974]
 [0.87572901 0.58514006 0.98546744]]
          0         1         2
0  0.976204  0.993094  0.364774
1  0.463069  0.629263  0.801250
2  0.875729  0.585140  0.985467
        one       two     three
a  0.976204  0.993094  0.364774
b  0.463069  0.629263  0.801250
c  0.875729  0.585140  0.985467


In [7]:
# Construction, approach 4: a list of dicts.
# The dict keys become the columns; when no index is given, the row labels
# are the positions of the dicts within the list.
data = [
    dict(one=1, two=2),
    dict(one=5, two=10, three=20),
]
df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index=list('ab'))
# Only two columns are requested here, so any extra dict keys are dropped.
df3 = pd.DataFrame(data, columns=['one', 'two'])
print(df1, df2, df3, sep='\n')

   one  three  two
0    1    NaN    2
1    5   20.0   10
   one  three  two
a    1    NaN    2
b    5   20.0   10
   one  two
0    1    2
1    5   10


In [15]:
# Construction, approach 5: a dict of dicts.
# The outer keys become the columns; the inner keys become the index.
data = {
    'Jack': dict(math=90, english=89, art=78),
    'Marry': dict(math=82, english=95, art=92),
    'Tom': dict(math=78, english=67),
}
df1 = pd.DataFrame(data)
print(df1)

# Restrict / reorder the columns; 'Bob' is not a key of `data`.
df2 = pd.DataFrame(data, columns=['Jack', 'Tom', 'Bob'])
print(df2)

         Jack  Marry   Tom
art        78     92   NaN
english    89     95  67.0
math       90     82  78.0
         Jack   Tom  Bob
art        78   NaN  NaN
english    89  67.0  NaN
math       90  78.0  NaN


# DataFrame方法

In [17]:
# What selecting rows and columns returns.
df = pd.DataFrame(
    np.random.rand(12).reshape((3, 4)) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df)

# Selecting by column name: a single name returns a Series,
# a list of names returns a DataFrame.
data1, data2 = df['a'], df[['a', 'c']]
print(data1, type(data1))
print(data2, type(data2))

# Selecting by index label with .loc: a single label returns a Series,
# a list of labels returns a DataFrame.
data3, data4 = df.loc['one'], df.loc[['one', 'two']]
print(data3, type(data3))
print(data4, type(data4))

               a          b          c          d
one    38.409709  82.563439  51.273076  56.517761
two    31.474211  42.202104  24.396734  15.594921
three  73.257897  53.145605  54.743648   0.191099
one      38.409709
two      31.474211
three    73.257897
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
               a          c
one    38.409709  51.273076
two    31.474211  24.396734
three  73.257897  54.743648 <class 'pandas.core.frame.DataFrame'>
a    38.409709
b    82.563439
c    51.273076
d    56.517761
Name: one, dtype: float64 <class 'pandas.core.series.Series'>
             a          b          c          d
one  38.409709  82.563439  51.273076  56.517761
two  31.474211  42.202104  24.396734  15.594921 <class 'pandas.core.frame.DataFrame'>


In [None]:
# df[] - 选择列
# df = pd.DataFrame(np.random.rand(12).reshaoer)