In [1]:
import pandas as pd

In [2]:
# Demo frame: three groups ('a', 'b', 'c'), each with three integer values.
data = pd.DataFrame(
    {
        'group': list('aaabbbccc'),
        'data': [4, 3, 2, 1, 12, 3, 4, 5, 7],
    }
)
print(data)

  group  data
0     a     4
1     a     3
2     a     2
3     b     1
4     b    12
5     b     3
6     c     4
7     c     5
8     c     7


In [4]:
# Sort on several keys, each with its own direction:
# `by` lists the sort columns and `ascending` gives the matching directions
# (here 'group' descending, then 'data' ascending inside each group).
# The sorted frame is rebound to `data` rather than sorted with inplace=True,
# so the call stays chainable and does not mutate the frame behind the name.
data = data.sort_values(by=['group', 'data'], ascending=[False, True])
print(data)

  group  data
6     c     4
7     c     5
8     c     7
3     b     1
5     b     3
4     b    12
2     a     2
1     a     3
0     a     4


In [3]:
# Build the DataFrame used by the sorting and de-duplication examples.
data = pd.DataFrame(
    {
        'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        'k2': [3, 2, 1, 3, 3, 4, 4],
    }
)
print(data)

    k1  k2
0  one   3
1  one   2
2  one   1
3  two   3
4  two   3
5  two   4
6  two   4


In [4]:
# Sort by the 'k2' column in ascending order; the result is only printed,
# `data` is not reassigned.
print(data.sort_values('k2', ascending=True))

    k1  k2
2  one   1
1  one   2
0  one   3
3  two   3
4  two   3
5  two   4
6  two   4


In [5]:
# Remove duplicate rows. With no subset given, a row is dropped only when
# both columns match an earlier row; the first occurrence is kept.
dt = data.drop_duplicates(subset=None, keep='first')
print(dt)

    k1  k2
0  one   3
1  one   2
2  one   1
3  two   3
5  two   4


In [6]:
# De-duplicate on 'k1' alone: any row whose 'k1' value was already seen is
# dropped, whatever its 'k2' value is.
dt = data.drop_duplicates(subset=['k1'], keep='first')
print(dt)

    k1  k2
0  one   3
3  two   3


In [8]:
# apply(): data aggregation helper that makes it easy to run built-in or user-defined operations over groups of data

In [9]:
# Frame of food codes, each paired with an integer value from 1 to 7.
data = pd.DataFrame(
    {
        'food': ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2'],
        'data': list(range(1, 8)),
    }
)
print(data)

  food  data
0   A1     1
1   A2     2
2   B1     3
3   B2     4
4   B3     5
5   C1     6
6   C2     7


In [10]:
# Collapse the detailed food codes into their category letter (A1, A2 -> A, ...).
def food_map(series):
    """Return the category letter for the food code stored under ``series['food']``.

    Args:
        series: Row-like object supporting ``series['food']`` (for example a
            DataFrame row passed in by ``DataFrame.apply``).

    Returns:
        'A', 'B' or 'C' for the seven known codes; None for anything else.
    """
    code_to_category = (
        ('A1', 'A'),
        ('A2', 'A'),
        ('B1', 'B'),
        ('B2', 'B'),
        ('B3', 'B'),
        ('C1', 'C'),
        ('C2', 'C'),
    )
    for code, category in code_to_category:
        if series['food'] == code:
            return category
    # Unknown codes fall through and yield None.
    return None


# Add a 'food_map' column: apply() with axis=1 hands each row to food_map
# and collects the returned category letters.
data['food_map'] = data.apply(food_map, axis=1)
print(data)

  food  data food_map
0   A1     1        A
1   A2     2        A
2   B1     3        B
3   B2     4        B
4   B3     5        B
5   C1     6        C
6   C2     7        C


In [11]:
# Lookup table from food code to category letter; every code maps to its first character.
food2Upper = {code: code[0] for code in ('A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2')}
# Series.map() looks each 'food' value up in the dict to build the 'upper' column.
data['upper'] = data['food'].map(food2Upper)

In [12]:
# Show the frame, which now carries the 'upper' column added by the map() call.
print(data)

  food  data food_map upper
0   A1     1        A     A
1   A2     2        A     A
2   B1     3        B     B
3   B2     4        B     B
4   B3     5        B     B
5   C1     6        C     C
6   C2     7        C     C


In [13]:
import numpy as np
# Two columns of five standard-normal draws each (no seed is set, so the
# values change from run to run).
df = pd.DataFrame({column: np.random.randn(5) for column in ('data1', 'data2')})
# assign() builds a new frame with the extra 'ration' column (data1 / data2).
df2 = df.assign(ration=lambda frame: frame['data1'] / frame['data2'])
print(df2)

      data1     data2    ration
0  0.981560  1.356974  0.723345
1  0.554868  0.470992  1.178082
2 -1.263160 -0.766644  1.647650
3  1.495241  0.276406  5.409577
4 -1.310699  0.338455 -3.872592


In [14]:
# Remove the derived 'ration' column again. drop() returns a new frame, so the
# result is rebound to df2 instead of mutating it with inplace=True.
df2 = df2.drop(columns='ration')
print(df2)

      data1     data2
0  0.981560  1.356974
1  0.554868  0.470992
2 -1.263160 -0.766644
3  1.495241  0.276406
4 -1.310699  0.338455


In [15]:
# replace(): substitute one value for another.
# Here the value 9 becomes NaN, so the integer Series is printed as float64.
# The replacement is chained onto the constructor and bound once, instead of
# building the Series and then mutating it with inplace=True.
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9]).replace(9, np.nan)
print(data)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
7    8.0
8    NaN
dtype: float64


In [16]:
# cut() groups values into intervals; `bins` holds the interval edges.
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 40, 80]
# Split the ages at those edges, giving the intervals (10, 40] and (40, 80].
bins_res = pd.cut(x=ages, bins=bins)
print(bins_res)

[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
Categories (2, interval[int64]): [(10, 40] < (40, 80]]


In [17]:
# value_counts(): count how many values fell into each bin.
# The top-level pd.value_counts() helper is deprecated, so the categorical is
# wrapped in a Series and counted with Series.value_counts() instead.
counts = pd.Series(bins_res).value_counts()
print(counts)

(10, 40]    6
(40, 80]    4
dtype: int64


In [18]:
# Same ages, now split at inline edges into (10, 30], (30, 50] and (50, 80].
print(pd.cut(x=ages, bins=[10, 30, 50, 80]))

[(10, 30], (10, 30], (10, 30], (10, 30], (10, 30], (30, 50], (30, 50], (50, 80], (50, 80], (50, 80]]
Categories (3, interval[int64]): [(10, 30] < (30, 50] < (50, 80]]


In [20]:
# Replace the raw intervals with human-readable labels.
# NOTE(review): 'Yonth' looks like a typo for 'Youth'; it is kept unchanged
# because the label is part of the printed result.
group_names = ['Yonth', 'Middle', 'Old']
# Series.value_counts() is used because the top-level pd.value_counts() is deprecated.
counts = pd.Series(
    pd.cut(ages, [10, 20, 50, 80], labels=group_names)
).value_counts()
print(counts)

Middle    4
Old       3
Yonth     3
dtype: int64


In [21]:
# isnull() marks every missing entry. Chaining any() condenses that mask:
# by default it yields one flag per column, with axis=1 one flag per row.
df = pd.DataFrame(
    [
        range(3),
        [0, np.nan, 0],
        [0, 0, np.nan],
        range(3),
    ]
)
print(df)
# Boolean mask: True wherever a value is missing.
print(df.isna())

   0    1    2
0  0  1.0  2.0
1  0  NaN  0.0
2  0  0.0  NaN
3  0  1.0  2.0
       0      1      2
0  False  False  False
1  False   True  False
2  False  False   True
3  False  False  False


In [22]:
# One flag per column: True if that column holds at least one missing value.
print(df.isnull().any(axis=0))

0    False
1     True
2     True
dtype: bool


In [23]:
# One flag per row: True if that row holds at least one missing value.
print(df.isnull().any(axis='columns'))

0    False
1     True
2     True
3    False
dtype: bool


In [27]:
# fillna() fills missing entries with the value passed as its argument (5 here).
fillna = df.fillna(value=5)
print(fillna)

   0    1    2
0  0  1.0  2.0
1  0  5.0  0.0
2  0  0.0  5.0
3  0  1.0  2.0


In [24]:
# Rebuild the sample frame, then keep only the rows with at least one missing value.
df = pd.DataFrame(
    [
        range(3),
        [0, np.nan, 0],
        [0, 0, np.nan],
        range(3),
    ]
)
dt = df.loc[df.isna().any(axis=1)]
print(dt)

   0    1    2
1  0  NaN  0.0
2  0  0.0  NaN
