## Series的apply方法

In [1]:
# Data preparation: a small two-column DataFrame used by the Series.apply examples below.
import pandas as pd
df = pd.DataFrame({'a':[10,20,30],'b':[20,30,40]})


In [2]:
# 创建一个自定义函数
def my_sq(x):
    """Return ``x`` raised to the second power.

    Args:
        x: Any value that supports the ``**`` operator (a number, or a
            pandas Series, in which case the result is element-wise).

    Returns:
        The result of ``x ** 2``.
    """
    squared = x ** 2
    return squared


In [3]:
# Series.apply takes a `func` argument and applies that function to every element of the Series.
sq = df['a'].apply(my_sq)
# Note: pass the function object itself (my_sq), without parentheses; apply is what calls it.


In [4]:
# Display the Series produced by applying my_sq to column 'a'.
sq

0    100
1    400
2    900
Name: a, dtype: int64

In [5]:
#apply 传入 需要多个参数的函数
def my_exp(x, e):
    """Return ``x`` raised to the power ``e``.

    Args:
        x: The base; any value that supports the ``**`` operator.
        e: The exponent.

    Returns:
        The result of ``x ** e``.
    """
    powered = x ** e
    return powered


In [6]:
# Sanity check with plain scalars: 2 ** 3.
my_exp(2,3)

8

In [7]:
# Each element of the Series is passed as `x`; the extra keyword argument e=2
# is forwarded by apply to my_exp on every call.
ex = df['a'].apply(my_exp,e=2)


In [8]:
# Display the Series produced by applying my_exp with e=2 to column 'a'.
ex

0    100
1    400
2    900
Name: a, dtype: int64

## apply 使用案例

In [2]:
# Use the titanic dataset to demonstrate apply on a DataFrame.
# Load the data, then call info() to see each column's non-null count and dtype.
titanic = pd.read_csv('data/titanic.csv')
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null object
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null object
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), float64(2), int64(4), object(7)
memory usage: 92.4+ KB


In [None]:
# 该数据集有891行,15列, 其中age、embarked、deck 和 embark_town 四列中包含缺失值
# 可以使用apply计算数据中有多少null 或 NaN值


In [3]:
# 缺失值数目
import numpy as np
def count_missing(vec):
    """Count the missing (null / NaN) entries in ``vec``.

    Args:
        vec: The values to inspect, e.g. one column or one row of a DataFrame.

    Returns:
        The number of entries that ``pd.isnull`` flags as missing.
    """
    # pd.isnull yields True for each missing entry; summing counts the Trues
    # because True is treated as 1.
    return np.sum(pd.isnull(vec))


In [4]:
# 缺失值占比
def prop_missing(vec):
    """Return the fraction of entries in ``vec`` that are missing.

    Args:
        vec: The values to inspect; must expose a ``size`` attribute
            (e.g. a pandas Series).

    Returns:
        Missing-entry count divided by the total number of entries.
    """
    # The denominator is the full element count, missing entries included.
    total = vec.size
    # Reuse count_missing for the numerator.
    missing = count_missing(vec)
    return missing / total


In [5]:
# 非缺失值占比
def prop_complete(vec):
    """Return the fraction of entries in ``vec`` that are NOT missing.

    Args:
        vec: The values to inspect, passed straight through to prop_missing.

    Returns:
        One minus the missing fraction reported by prop_missing.
    """
    missing_share = prop_missing(vec)
    return 1 - missing_share


In [6]:
# Apply the functions defined above to each column of the data
# (no axis argument is given, so apply works column by column).
titanic.apply(count_missing)


survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
# Fraction of missing values in each column.
titanic.apply(prop_missing)


survived       0.000000
pclass         0.000000
sex            0.000000
age            0.198653
sibsp          0.000000
parch          0.000000
fare           0.000000
embarked       0.002245
class          0.000000
who            0.000000
adult_male     0.000000
deck           0.772166
embark_town    0.002245
alive          0.000000
alone          0.000000
dtype: float64

In [8]:
# Fraction of non-missing values in each column.
titanic.apply(prop_complete)


survived       1.000000
pclass         1.000000
sex            1.000000
age            0.801347
sibsp          1.000000
parch          1.000000
fare           1.000000
embarked       0.997755
class          1.000000
who            1.000000
adult_male     1.000000
deck           0.227834
embark_town    0.997755
alive          1.000000
alone          1.000000
dtype: float64

In [9]:
# Apply the functions defined above to each row of the data (axis=1):
# here, the number of missing values per row.
titanic.apply(count_missing ,axis = 1)


0      1
1      0
2      1
3      0
4      1
      ..
886    1
887    0
888    2
889    0
890    1
Length: 891, dtype: int64

In [10]:
# Fraction of missing values in each row (axis=1).
titanic.apply(prop_missing ,axis = 1)


0      0.066667
1      0.000000
2      0.066667
3      0.000000
4      0.066667
         ...   
886    0.066667
887    0.000000
888    0.133333
889    0.000000
890    0.066667
Length: 891, dtype: float64

In [11]:
# Fraction of non-missing values in each row (axis=1).
titanic.apply(prop_complete ,axis = 1)


0      0.933333
1      1.000000
2      0.933333
3      1.000000
4      0.933333
         ...   
886    0.933333
887    1.000000
888    0.866667
889    1.000000
890    0.933333
Length: 891, dtype: float64

In [12]:

# Tally how many rows have 0, 1, 2, ... missing values.
titanic.apply(count_missing,axis = 1).value_counts()


1    549
0    182
2    160
dtype: int64

## 向量化函数

In [13]:
# Create a small DataFrame for the vectorization examples.
df = pd.DataFrame({'a':[10,20,30],'b':[20,30,40]})


In [15]:
# 创建函数
def avg_2(x, y):
    """Return the arithmetic mean of ``x`` and ``y``.

    Only ``+`` and ``/`` are used, so the function also works element-wise
    when both arguments are pandas Series.
    """
    total = x + y
    return total / 2
# Passing whole columns works here: + and / operate element-wise on Series.
avg_2(df['a'],df['b'])



0    15.0
1    25.0
2    35.0
dtype: float64

In [16]:
# 修改函数
def avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    This function is intentionally scalar-only: the ``if`` needs a single
    True/False value, so calling it directly with pandas Series raises
    ``ValueError`` (the truth value of a Series is ambiguous). Wrap it with
    ``np.vectorize`` to apply it element by element.

    Args:
        x: First scalar value.
        y: Second scalar value.

    Returns:
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2
# Deliberate failure demo: avg_2_mod evaluates `x == 20` inside an `if`, so
# passing whole Series makes the condition ambiguous and a ValueError is raised.
# Catch it and print the message so the notebook still runs top to bottom.
try:
    avg_2_mod(df['a'], df['b'])
except ValueError as err:
    print("ValueError:", err)


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [17]:
# Use np.vectorize to turn the scalar function into one that is called
# element by element; the result comes back as a NumPy array.
avg_2_mod_vec = np.vectorize(avg_2_mod)
avg_2_mod_vec(df['a'],df['b'])


array([15., nan, 35.])

In [18]:
# 使用装饰器
@np.vectorize
def vec_avg_2_mod(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The body is written for scalars; the ``np.vectorize`` decorator makes the
    function callable on array-like inputs by invoking it element by element.

    Args:
        x: First value (one element of the first input).
        y: Second value (the matching element of the second input).

    Returns:
        ``np.nan`` if ``x == 20``, otherwise ``(x + y) / 2``.
    """
    if x == 20:
        # np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return (x + y) / 2
# The decorated function accepts whole columns and returns a NumPy array.
vec_avg_2_mod(df['a'],df['b'])

array([15., nan, 35.])

## lambda函数

In [19]:
# A lambda works for one-off logic: apply passes each column in as x, and x+1 adds 1 to every value.
df.apply(lambda x: x+1)

Unnamed: 0,a,b
0,11,21
1,21,31
2,31,41
