# Dataframe
- 二维数组DataFrame：是一个表格型的数据结构，包含一组有序的列，每列的值类型可以不同（数值、字符串、布尔值等）
- DataFrame中的数据以一个或多个二维块存放，而不是以列表、字典或其他一维数组结构存放

In [2]:
import pandas as pd
import numpy as np
# A DataFrame carries row labels (index) and column labels (columns).
#   .index   -> the row labels
#   .columns -> the column labels
#   .values  -> the cell values, returned as a numpy ndarray
data = {
    'name': ['a', 'b', 'c'],
    'age': [1, 2, 3],
    'gender': ['m', 'm', 'w'],
}
frame = pd.DataFrame(data)

# Show the table itself, then its Python type.
print(frame)
print(type(frame))

# For each of the three attributes, show its content followed by its type.
row_labels = frame.index
print(row_labels, '\n该数据类型为：', type(row_labels))
col_labels = frame.columns
print(col_labels, '\n该数据类型为', type(col_labels))
cell_values = frame.values
print(cell_values, '\n该数据类型为', type(cell_values))

  name  age gender
0    a    1      m
1    b    2      m
2    c    3      w
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=3, step=1) 
该数据类型为： <class 'pandas.core.indexes.range.RangeIndex'>
Index(['name', 'age', 'gender'], dtype='object') 
该数据类型为 <class 'pandas.core.indexes.base.Index'>
[['a' 1 'm']
 ['b' 2 'm']
 ['c' 3 'w']] 
该数据类型为 <class 'numpy.ndarray'>


# Dataframe 创建

In [6]:
# Construction, approach 1: a dict whose values are arrays / lists.
# Constructor: pandas.DataFrame()

data1 = dict(a=[1, 2, 3], b=[4, 5, 6])
data2 = {key: np.random.rand(3) for key in ('one', 'two')}

# Show the raw dicts first, then the frames built from them.
for raw_dict in (data1, data2):
    print(raw_dict)

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
for built in (df1, df2):
    print(built)

# columns=: a list that chooses and orders the columns. A name that the
# data does not contain becomes an all-NaN column, and a name may appear
# more than once.
df1 = pd.DataFrame(data1, columns=['b', 'c', 'a', 'b'])
print(df1)
# The list may also name fewer columns than the data holds.
df1 = pd.DataFrame(data1, columns=['b', 'c'])
print(df1)

# index=: a list of row labels; its length has to match the row count.
row_names = ['f1', 'f2', 'f3']
df2 = pd.DataFrame(data2, index=row_names)
print(df2)

{'a': [1, 2, 3], 'b': [4, 5, 6]}
{'one': array([0.51410767, 0.83528956, 0.74199338]), 'two': array([0.00354538, 0.96651467, 0.89534564])}
   a  b
0  1  4
1  2  5
2  3  6
        one       two
0  0.514108  0.003545
1  0.835290  0.966515
2  0.741993  0.895346
   b    c  a  b
0  4  NaN  1  4
1  5  NaN  2  5
2  6  NaN  3  6
   b    c
0  4  NaN
1  5  NaN
2  6  NaN
         one       two
f1  0.514108  0.003545
f2  0.835290  0.966515
f3  0.741993  0.895346


In [4]:
# Construction, approach 2: a dict whose values are Series.
# The dict keys become the columns and the Series labels become the index;
# a Series built without an explicit index gets the default integer labels.
# When the Series differ in length, the missing cells are filled with NaN.
data1 = {
    'one': pd.Series(np.random.rand(2)),
    'two': pd.Series(np.random.rand(3)),
}

letter_labels = ['a', 'b', 'c']
data2 = {
    'one': pd.Series(np.random.rand(2), index=letter_labels[:2]),
    'two': pd.Series(np.random.rand(3), index=letter_labels),
}

# Raw dicts first, then the frames built from them.
print(data1)
print(data2)

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)
print(df1)
print(df2)

{'one': 0    0.253376
1    0.694209
dtype: float64, 'two': 0    0.243005
1    0.252494
2    0.618899
dtype: float64}
{'one': a    0.902048
b    0.425521
dtype: float64, 'two': a    0.969927
b    0.848560
c    0.361410
dtype: float64}
        one       two
0  0.253376  0.243005
1  0.694209  0.252494
2       NaN  0.618899
        one       two
a  0.902048  0.969927
b  0.425521  0.848560
c       NaN  0.361410


In [5]:
# Construction, approach 3: directly from a two-dimensional array.
# Without index= / columns= both axes get integer labels.
# When given, their lengths have to match the array's dimensions.
ar = np.random.rand(3, 3)
print(ar)

df1 = pd.DataFrame(ar)
df2 = pd.DataFrame(
    ar,
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
)
print(df1)
print(df2)

[[0.97620446 0.99309431 0.36477411]
 [0.46306879 0.62926255 0.80124974]
 [0.87572901 0.58514006 0.98546744]]
          0         1         2
0  0.976204  0.993094  0.364774
1  0.463069  0.629263  0.801250
2  0.875729  0.585140  0.985467
        one       two     three
a  0.976204  0.993094  0.364774
b  0.463069  0.629263  0.801250
c  0.875729  0.585140  0.985467


In [7]:
# Construction, approach 4: a list of dicts (one dict per row).
# The dict keys become the columns; without index= the rows are labelled
# by their position in the list.
data = [
    {'one': 1, 'two': 2},
    {'one': 5, 'two': 10, 'three': 20},
]

df1 = pd.DataFrame(data)
df2 = pd.DataFrame(data, index=['a', 'b'])
# Only two columns are requested here, so the extra key is left out.
df3 = pd.DataFrame(data, columns=['one', 'two'])

print(df1)
print(df2)
print(df3)

   one  three  two
0    1    NaN    2
1    5   20.0   10
   one  three  two
a    1    NaN    2
b    5   20.0   10
   one  two
0    1    2
1    5   10


In [15]:
# Construction, approach 5: a dict of dicts.
# Outer keys become the columns, inner keys become the index.
data = {
    'Jack': {'math': 90, 'english': 89, 'art': 78},
    'Marry': {'math': 82, 'english': 95, 'art': 92},
    'Tom': {'math': 78, 'english': 67},
}
df1 = pd.DataFrame(data)
print(df1)

# columns= picks a subset of the outer keys; 'Bob' is not in the data, so
# that column comes back filled with NaN.
wanted = ['Jack', 'Tom', 'Bob']
df2 = pd.DataFrame(data, columns=wanted)
print(df2)

         Jack  Marry   Tom
art        78     92   NaN
english    89     95  67.0
math       90     82  78.0
         Jack   Tom  Bob
art        78   NaN  NaN
english    89  67.0  NaN
math       90  78.0  NaN


# DataFrame方法

In [17]:
# What selecting rows and columns returns.
df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df)

# Columns by name: one name yields a Series, a list of names a DataFrame.
data1 = df['a']
data2 = df[['a', 'c']]
print(data1, type(data1))
print(data2, type(data2))

# Rows by index label: one label yields a Series, a list of labels a
# DataFrame.
data3 = df.loc['one']
data4 = df.loc[['one', 'two']]
print(data3, type(data3))
print(data4, type(data4))

               a          b          c          d
one    38.409709  82.563439  51.273076  56.517761
two    31.474211  42.202104  24.396734  15.594921
three  73.257897  53.145605  54.743648   0.191099
one      38.409709
two      31.474211
three    73.257897
Name: a, dtype: float64 <class 'pandas.core.series.Series'>
               a          c
one    38.409709  51.273076
two    31.474211  24.396734
three  73.257897  54.743648 <class 'pandas.core.frame.DataFrame'>
a    38.409709
b    82.563439
c    51.273076
d    56.517761
Name: one, dtype: float64 <class 'pandas.core.series.Series'>
             a          b          c          d
one  38.409709  82.563439  51.273076  56.517761
two  31.474211  42.202104  24.396734  15.594921 <class 'pandas.core.frame.DataFrame'>


In [3]:
# df[] -- column selection
df = pd.DataFrame(
    np.random.rand(3, 4) * 100,
    index=['one', 'two', 'three'],
    columns=list('abcd'),
)
print(df)

# df[] selects columns by default; put the column name(s) inside.
data1 = df['a']          # a single name -> Series
data2 = df[['b', 'c']]   # a list of names -> DataFrame
print(data1)
print(data2)

# With a numeric slice inside, df[] selects rows instead. Only slices work
# that way: a lone position such as df[0] is not a row lookup, and neither
# is a row label such as df['one'].
data3 = df[:1]
print(data3, type(data3))

               a          b          c          d
one    89.337397  34.899602  84.852295  67.275818
two    95.127221  31.345027  39.463871  87.889717
three  86.001285  89.032645  48.574998   0.957820
one      89.337397
two      95.127221
three    86.001285
Name: a, dtype: float64
               b          c
one    34.899602  84.852295
two    31.345027  39.463871
three  89.032645  48.574998
             a          b          c          d
one  89.337397  34.899602  84.852295  67.275818 <class 'pandas.core.frame.DataFrame'>


In [5]:
# df.loc[] -- select rows by index label
abcd = ['a', 'b', 'c', 'd']
df1 = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=abcd,
)
df2 = pd.DataFrame(np.random.rand(4, 4) * 100, columns=abcd)
print(df1)
print(df2)

# df1 was given string labels, so the lookup uses one of those labels.
data1 = df1.loc['one']
# df2 kept the default index, so its labels are the integers themselves.
data2 = df2.loc[1]
print(data1)
print(data2)

# Several rows at once: a list of labels, or a label slice.
# NOTE(review): the original note said a missing label yields a NaN row;
# recent pandas releases raise KeyError instead -- confirm for the
# installed version.
data3 = df1.loc[['one', 'two']]
data4 = df2.loc[1:2]
print(data3)
print(data4)

# Label slices are accepted too; unlike positional slices, the end label
# is part of the result.
data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)

               a          b          c          d
one    71.552037  80.265062  20.951259  72.541999
two    32.854276  66.830525  13.915621  11.994220
three   4.748599  38.872146  55.199464  80.208177
four   51.202797  97.633333  86.391924  24.158100
           a          b          c          d
0   3.726300  66.112340  48.220489  93.760402
1  39.120391  25.638545  92.493584  44.929521
2  67.998094  32.908821  23.708935   0.419288
3  82.357372  53.480613  14.556641  79.355039
a    71.552037
b    80.265062
c    20.951259
d    72.541999
Name: one, dtype: float64
a    39.120391
b    25.638545
c    92.493584
d    44.929521
Name: 1, dtype: float64
             a          b          c          d
one  71.552037  80.265062  20.951259  72.541999
two  32.854276  66.830525  13.915621  11.994220
           a          b          c          d
1  39.120391  25.638545  92.493584  44.929521
2  67.998094  32.908821  23.708935   0.419288
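               a          b          c          d
one    71.552037  80.265062  20.951259  72.541999
two    32.854276  66.830525  13.915621  11.994220
three   4.748599  38.872146  55.199464  80.208177
           a          b          c          d
1  39.120391  25.638545  92.493584  44.929521
2  67.998094  32.908821  23.708935   0.419288
3  82.357372  53.480613  14.556641  79.355039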

In [7]:
# df.iloc[] -- select rows by integer position (0 .. length-1 along the axis)
# Works like list indexing: positions follow the row order and start at 0.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)

print(df)

# A single position has to stay within the number of rows; negative
# positions count from the end.
for position in (0, -1):
    print(df.iloc[position])

# Several rows at once, using positional slices.
for window in (slice(1, 3), slice(None, None, 2)):
    print(df.iloc[window])

               a          b          c          d
one     5.790416  15.907801  15.346601  41.570169
two    62.328093  43.988665  28.834321  68.477525
three  89.860261  57.388847  23.542659  98.224468
four   51.774764  35.003939  70.082198  73.725810
a     5.790416
b    15.907801
c    15.346601
d    41.570169
Name: one, dtype: float64
a    51.774764
b    35.003939
c    70.082198
d    73.725810
Name: four, dtype: float64
               a          b          c          d
two    62.328093  43.988665  28.834321  68.477525
three  89.860261  57.388847  23.542659  98.224468
               a          b          c          d
one     5.790416  15.907801  15.346601  41.570169
three  89.860261  57.388847  23.542659  98.224468


In [9]:
# Boolean indexing -- same principle as with a Series.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df)

rule = '-' * 5

# Condition over the whole frame: every cell is tested; cells where the
# test is True keep their value, the others become NaN.
print(rule + '无限定判断' + rule)
print(df[df < 20])

# Condition on a single column. What is printed here is the boolean mask
# itself (one True/False per row); handing that mask to df[...] would keep
# the complete rows -- all columns -- where it is True.
print(rule + '单列判断' + rule)
print(df['a'] > 20)

# Condition on several columns: columns outside the tested ones come back
# as NaN.
print(rule + '多列判断' + rule)
print(df[df[['a', 'b']] > 20])

# Condition on several rows: rows outside the tested ones come back as NaN.
print(rule + '多行判断' + rule)
print(df[df.loc[['one', 'three']] < 50])

               a          b          c          d
one    26.308656  29.852513  43.681006  78.972862
two    39.015117  83.554890  41.946063  62.653479
three  15.172235  42.314596  93.891098  80.769701
four   52.062534  92.902764  84.203408  19.350741
-----无限定判断-----
               a   b   c          d
one          NaN NaN NaN        NaN
two          NaN NaN NaN        NaN
three  15.172235 NaN NaN        NaN
four         NaN NaN NaN  19.350741
-----单列判断-----
one       True
two       True
three    False
four      True
Name: a, dtype: bool
-----多列判断-----
               a          b   c   d
one    26.308656  29.852513 NaN NaN
two    39.015117  83.554890 NaN NaN
three        NaN  42.314596 NaN NaN
four   52.062534  92.902764 NaN NaN
-----多行判断-----
               a          b          c   d
one    26.308656  29.852513  43.681006 NaN
two          NaN        NaN        NaN NaN
three  15.172235  42.314596        NaN NaN
four         NaN        NaN        NaN NaN


In [10]:
# Chained selection: pick columns and rows in one expression.
# Columns first, rows second -- i.e. choose the fields of interest, then
# narrow down how many records to look at.
df = pd.DataFrame(
    np.random.rand(4, 4) * 100,
    index=['one', 'two', 'three', 'four'],
    columns=list('abcd'),
)
print(df)

# Column 'a', then its rows labelled 'one' and 'three'.
print(df['a'].loc[['one', 'three']])
# Several columns, then every second row.
print(df[['b', 'c', 'd']].iloc[::2])
# Rows passing the condition on 'a', then the first two of them.
print(df[df['a'] < 50].iloc[:2])

               a          b          c          d
one    65.317498  80.150041  98.362942  38.044392
two    94.181856   1.347843  59.874516  73.530484
three  88.830297  20.396915  81.360220  85.787044
four   60.314734   1.348655  60.943412  13.315916
one      65.317498
three    88.830297
Name: a, dtype: float64
               b          c          d
one    80.150041  98.362942  38.044392
three  20.396915  81.360220  85.787044
Empty DataFrame
Columns: [a, b, c, d]
Index: []


In [11]:
# Inspecting data and transposing
df = pd.DataFrame(np.random.rand(8, 2) * 100, columns=['a', 'b'])
print(df)

# .head() shows the first rows, .tail() the last ones.
# Both show 5 rows unless a count is passed.
first_rows = df.head(2)
last_rows = df.tail()
print(first_rows)
print(last_rows)

# Transpose: rows and columns swap places.
print(df.T)

           a          b
0  93.259424  98.421852
1  69.010361  83.181006
2  28.685085  62.507307
3  52.947526  75.253460
4  97.742462  40.813481
5  15.389640  18.020944
6  58.487632  72.754595
7  88.341625  63.626585
           a          b
0  93.259424  98.421852
1  69.010361  83.181006
           a          b
3  52.947526  75.253460
4  97.742462  40.813481
5  15.389640  18.020944
6  58.487632  72.754595
7  88.341625  63.626585
           0          1          2          3          4          5  \
a  93.259424  69.010361  28.685085  52.947526  97.742462  15.389640   
b  98.421852  83.181006  62.507307  75.253460  40.813481  18.020944   

           6          7  
a  58.487632  88.341625  
b  72.754595  63.626585  


In [12]:
# Adding and modifying
df = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
print(df)

# Assigning to a column name / row label that does not exist yet creates
# it, with the scalar broadcast to every cell.
df['e'] = 10     # new column
df.loc[4] = 20   # new row
print(df)

# Assigning to an existing column overwrites the whole column ...
df['e'] = 20
# ... and a list of names overwrites several columns at once.
df[['a', 'c']] = 100
print(df)

           a          b          c          d
0  58.193139  50.961910  94.482855  57.737026
1  76.388479  10.338230  98.808006  77.556905
2  18.266112  28.249430  40.840480   6.534588
3  69.399512  25.392848  92.047405  41.550446
           a          b          c          d   e
0  58.193139  50.961910  94.482855  57.737026  10
1  76.388479  10.338230  98.808006  77.556905  10
2  18.266112  28.249430  40.840480   6.534588  10
3  69.399512  25.392848  92.047405  41.550446  10
4  20.000000  20.000000  20.000000  20.000000  20
     a          b    c          d   e
0  100  50.961910  100  57.737026  20
1  100  10.338230  100  77.556905  20
2  100  28.249430  100   6.534588  20
3  100  25.392848  100  41.550446  20
4  100  20.000000  100  20.000000  20


In [14]:
# Deleting: del / drop()

df = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))

print(df)

# Column removal with del: df itself is changed.
del df['a']
print(df)

separator = '-' * 10

# Column removal with drop(..., axis=1): with the default inplace=False a
# new frame is returned and df stays as it was.
print(df.drop(['b'], axis=1))
print(df)
print(separator)

# Row removal with drop(label or list of labels): again inplace=False by
# default, so a new frame is returned and df is unchanged.
print(df.drop(0))
print(df.drop([1, 2]))
print(df)
print(separator)

           a          b          c          d
0  15.257645  57.188657  96.503310  27.811684
1  40.799058  16.626775  89.227109  67.728371
2  78.871745  65.838457  53.140350  92.599459
3  31.403500  81.353036  94.205857  12.800509
           b          c          d
0  57.188657  96.503310  27.811684
1  16.626775  89.227109  67.728371
2  65.838457  53.140350  92.599459
3  81.353036  94.205857  12.800509
           c          d
0  96.503310  27.811684
1  89.227109  67.728371
2  53.140350  92.599459
3  94.205857  12.800509
           b          c          d
0  57.188657  96.503310  27.811684
1  16.626775  89.227109  67.728371
2  65.838457  53.140350  92.599459
3  81.353036  94.205857  12.800509
----------
           b          c          d
1  16.626775  89.227109  67.728371
2  65.838457  53.140350  92.599459
3  81.353036  94.205857  12.800509
           b          c          d
0  57.188657  96.503310  27.811684
3  81.353036  94.205857  12.800509
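           b          c          d
0  57.188657  96.503310  27.811684
1  16.626775  89.227109  67.728371
2  65.838457  53.140350  92.599459
3  81.353036  94.205857  12.800509
----------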

# 排序

In [16]:
# Sorting by value: .sort_values
# ascending= chooses ascending or descending order; ascending is the default.
df1 = pd.DataFrame(np.random.rand(4, 4) * 100, columns=list('abcd'))
print(df1)

divider = '-' * 10

print(divider)
# Column 'a' in ascending order first, then in descending order.
for rising in (True, False):
    print(df1.sort_values(['a'], ascending=rising))

# Sorting on several columns: rows are ordered by the first listed column,
# and rows that tie there are ordered by the second one.
print(divider)
df2 = pd.DataFrame({
    'a': [1] * 4 + [2] * 4,
    'b': list(range(8)),
    'c': list(range(8, 0, -1)),
})
print(df2)
print(df2.sort_values(['a', 'c']))

           a          b          c          d
0   5.369000  87.414935  84.423568  24.115428
1  71.393246  83.025402  20.238562  14.836448
2  40.952781  74.388208  56.824483  82.124819
3  56.333080  62.993463  46.205342  19.433410
----------
           a          b          c          d
0   5.369000  87.414935  84.423568  24.115428
2  40.952781  74.388208  56.824483  82.124819
3  56.333080  62.993463  46.205342  19.433410
1  71.393246  83.025402  20.238562  14.836448
           a          b          c          d
1  71.393246  83.025402  20.238562  14.836448
3  56.333080  62.993463  46.205342  19.433410
2  40.952781  74.388208  56.824483  82.124819
0   5.369000  87.414935  84.423568  24.115428
----------
   a  b  c
0  1  0  8
1  1  1  7
2  1  2  6
3  1  3  5
4  2  4  4
5  2  5  3
6  2  6  2
7  2  7  1
   a  b  c
3  1  3  5
2  1  2  6
1  1  1  7
0  1  0  8
7  2  7  1
6  2  6  2
5  2  5  3
4  2  4  4


In [18]:
# Sorting by index: .sort_index
# Two independent random frames that share the same descending row labels.
df1, df2 = (
    pd.DataFrame(
        np.random.rand(4, 4) * 100,
        index=[5, 4, 3, 2],
        columns=list('abcd'),
    )
    for _ in range(2)
)

# Order the rows by their index label.
# Defaults: ascending=True, inplace=False (a new frame is returned).
for table in (df1, df2):
    print(table)
    print(table.sort_index())

           a          b          c          d
5  70.861323  77.387828  16.508888  39.783136
4  40.950611  53.598744  80.824587  13.809055
3  72.488587  41.106073   8.523795  24.478620
2  87.017575  71.132648  86.882009   5.114085
           a          b          c          d
2  87.017575  71.132648  86.882009   5.114085
3  72.488587  41.106073   8.523795  24.478620
4  40.950611  53.598744  80.824587  13.809055
5  70.861323  77.387828  16.508888  39.783136
           a          b          c          d
5  45.900896  66.706248   6.677665   8.861845
4  76.593940  65.411763  55.919493  24.744247
3  52.938624  42.516864  36.724546  37.207165
2  92.684244  60.776301  25.454571   4.832269
           a          b          c          d
2  92.684244  60.776301  25.454571   4.832269
3  52.938624  42.516864  36.724546  37.207165
4  76.593940  65.411763  55.919493  24.744247
5  45.900896  66.706248   6.677665   8.861845
