# 客户营销响应预测

## 背景

某企业业务部门准备基于以往产品营销活动的经验,有选择性地对现有客户开展营销活动。受营销活动预算的限制,业务部门希望通过以前类似的营销活动,找出可能响应此次营销活动的客户名单及其响应概率,从而在有限的成本内获得较高的客户响应率,提高此次营销活动的效果。

## 目的

通过数据预测在下一次营销活动时，响应活动会员的具体名单和响应概率，以此来制定针对性的营销策略。

## 数据

- 训练数据:39999条
- 测试数据:8843条

字段名|字段含义|变量类型
:-|:-|:-
age|年龄|数值	
total_pageviews|总页面浏览数|数值
edu|教育程度|分类[1,10]	
edu_ages|受教育年限|数值
user_level|会员等级|分类[1,7]
industry|用户行业|分类[1,15]	
value_level|用户价值|分类[1,6]
act_level|用户活跃度|分类[1,5]	
sex|性别|分类[0,1] 1表示男性	
blue_money|历史蓝色优惠券使用金额|数值
red_money|历史红色优惠券使用金额|数值
work_hours|在线时长|数值
region|地区|分类[1,41]
response|是否响应|分类[0,1] 1表示响应

## 1. 数据加载

In [1]:
import numpy as np  
import pandas as pd  
from sklearn.preprocessing import OneHotEncoder  
from sklearn.model_selection import StratifiedKFold, cross_val_score 
from sklearn.feature_selection import SelectPercentile, f_classif  
from sklearn.ensemble import AdaBoostClassifier  
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Both data sets live in one workbook: sheet 0 is read as the training data,
# sheet 1 as the test data. The keyword is `sheet_name`; the older spelling
# `sheetname` was deprecated and later removed from pandas.
train = pd.read_excel('order.xlsx', sheet_name=0)
test = pd.read_excel('order.xlsx', sheet_name=1)

In [3]:
# Preview the first rows of the training data to check column names and value formats.
train.head()

Unnamed: 0,age,total_pageviews,edu,edu_ages,user_level,industry,value_level,act_level,sex,blue_money,red_money,work_hours,region,response
0,39.0,77516.0,1.0,13.0,1.0,1.0,1,1.0,1.0,2174,0.0,40,1.0,0
1,50.0,83311.0,1.0,13.0,2.0,2.0,2,1.0,1.0,0,0.0,13,1.0,0
2,38.0,215646.0,2.0,9.0,3.0,3.0,1,1.0,1.0,0,0.0,40,1.0,0
3,53.0,234721.0,2.0,7.0,2.0,3.0,2,2.0,1.0,0,0.0,40,1.0,0
4,28.0,338409.0,1.0,13.0,2.0,4.0,3,2.0,0.0,0,0.0,40,2.0,0


In [4]:
# Preview the last rows of the training data (also reveals the final index label).
train.tail()

Unnamed: 0,age,total_pageviews,edu,edu_ages,user_level,industry,value_level,act_level,sex,blue_money,red_money,work_hours,region,response
39994,24.0,194102.0,1.0,13.0,1.0,2.0,4,1.0,1.0,0,0.0,40,1.0,0
39995,35.0,295127.0,2.0,10.0,3.0,9.0,5,1.0,1.0,0,0.0,50,1.0,0
39996,60.0,102310.0,5.0,12.0,3.0,12.0,1,1.0,0.0,0,0.0,45,11.0,0
39997,48.0,240175.0,2.0,7.0,5.0,5.0,5,2.0,1.0,0,0.0,22,1.0,0
39998,41.0,145441.0,2.0,9.0,2.0,10.0,2,1.0,1.0,0,0.0,40,1.0,1


In [5]:
# Column dtypes and non-null counts for the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39999 entries, 0 to 39998
Data columns (total 14 columns):
age                39998 non-null float64
total_pageviews    39998 non-null float64
edu                39998 non-null float64
edu_ages           39998 non-null float64
user_level         39998 non-null float64
industry           39997 non-null float64
value_level        39999 non-null int64
act_level          39998 non-null float64
sex                39998 non-null float64
blue_money         39999 non-null int64
red_money          39998 non-null float64
work_hours         39999 non-null int64
region             39997 non-null float64
response           39999 non-null int64
dtypes: float64(10), int64(4)
memory usage: 4.3 MB


In [6]:
# Column dtypes and non-null counts for the test data.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8843 entries, 0 to 8842
Data columns (total 14 columns):
age                8843 non-null int64
total_pageviews    8843 non-null int64
edu                8843 non-null int64
edu_ages           8843 non-null int64
user_level         8841 non-null float64
industry           8841 non-null float64
value_level        8843 non-null int64
act_level          8843 non-null int64
sex                8843 non-null int64
blue_money         8843 non-null int64
red_money          8843 non-null int64
work_hours         8843 non-null int64
region             8838 non-null float64
final_response     8843 non-null int64
dtypes: float64(3), int64(11)
memory usage: 967.3 KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every training column.
train.describe()

Unnamed: 0,age,total_pageviews,edu,edu_ages,user_level,industry,value_level,act_level,sex,blue_money,red_money,work_hours,region,response
count,39998.0,39998.0,39998.0,39998.0,39998.0,39997.0,39999.0,39998.0,39998.0,39999.0,39998.0,39999.0,39997.0,39999.0
mean,38.589654,189513.6,2.511626,10.076754,2.087004,5.677126,2.546289,1.221036,0.668083,1089.142529,87.379394,40.442486,2.251519,0.239606
std,13.66349,105310.9,1.63811,2.573384,1.260992,3.395948,1.44321,0.626618,0.470907,7491.275548,402.93035,12.376033,4.913482,0.426848
min,17.0,12285.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
25%,28.0,117528.2,2.0,9.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,40.0,1.0,0.0
50%,37.0,178341.0,2.0,10.0,2.0,5.0,2.0,1.0,1.0,0.0,0.0,40.0,1.0,0.0
75%,48.0,237268.5,2.0,12.0,2.0,8.0,4.0,1.0,1.0,0.0,0.0,45.0,1.0,0.0
max,90.0,1484705.0,10.0,16.0,7.0,15.0,6.0,5.0,1.0,99999.0,4356.0,99.0,41.0,1.0


## 2. 数据预处理

### 去重

In [9]:
# Drop fully duplicated rows from the training data, modifying `train` in place.
train.drop_duplicates(inplace=True)

### 缺失值处理

In [27]:
# Number of missing values per training column, largest count first.
train.isnull().sum().sort_values(ascending=False)

response           0
region             0
work_hours         0
red_money          0
blue_money         0
sex                0
act_level          0
value_level        0
industry           0
user_level         0
edu_ages           0
edu                0
total_pageviews    0
age                0
dtype: int64

In [11]:
# Number of missing values per test column, largest count first.
test.isnull().sum().sort_values(ascending=False)

region             5
industry           2
user_level         2
final_response     0
work_hours         0
red_money          0
blue_money         0
sex                0
act_level          0
value_level        0
edu_ages           0
edu                0
total_pageviews    0
age                0
dtype: int64

In [12]:
def na_replace(df):
    """Return a copy of ``df`` with missing values filled column by column.

    ``age``, ``total_pageviews`` and ``red_money`` are filled with the column
    mean; ``edu``, ``edu_ages``, ``user_level``, ``industry``, ``act_level``,
    ``sex`` and ``region`` are filled with the column median. Every fill value
    is computed from the frame that is passed in, and each column uses its
    own statistic. Columns not listed here are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all ten columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the ten columns is missing from ``df``.
    """
    mean_cols = ('age', 'total_pageviews', 'red_money')
    median_cols = ('edu', 'edu_ages', 'user_level', 'industry',
                   'act_level', 'sex', 'region')

    # Build the per-column fill values from the column lists so that a column
    # can never be paired with another column's statistic by accident.
    na_rules = {col: df[col].mean() for col in mean_cols}
    na_rules.update({col: df[col].median() for col in median_cols})
    return df.fillna(na_rules)

In [13]:
# Impute missing values; na_replace derives the fill values from the frame it
# receives, so train and test are each filled from their own statistics.
train = na_replace(train)  
test = na_replace(test)

### 独热编码

In [14]:
def symbol_con(df, enc_object=None, train=True):
    """One-hot encode the categorical columns and append the numeric columns.

    The categorical columns ``edu``, ``user_level``, ``industry``,
    ``value_level``, ``act_level``, ``sex`` and ``region`` are one-hot
    encoded; the numeric columns ``age``, ``total_pageviews``, ``edu_ages``,
    ``blue_money``, ``red_money`` and ``work_hours`` are appended unchanged
    to the right of the encoded block.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all thirteen columns named above.
    enc_object : object, optional
        Already-fitted encoder exposing ``transform``; only used (and then
        required) when ``train`` is false.
    train : bool, default True
        When true, a new ``OneHotEncoder`` is fitted on ``df`` and returned
        together with the matrix. When false, ``enc_object`` is applied.

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, OneHotEncoder)
        ``(matrix, encoder)`` when ``train`` is true, otherwise ``matrix``.

    Raises
    ------
    ValueError
        If ``train`` is false and no ``enc_object`` is supplied.
    """
    convert_cols = ['edu', 'user_level', 'industry', 'value_level',
                    'act_level', 'sex', 'region']
    numeric_cols = ['age', 'total_pageviews', 'edu_ages', 'blue_money',
                    'red_money', 'work_hours']
    categorical_part = df[convert_cols]
    numeric_part = df[numeric_cols].values

    if train:
        enc = OneHotEncoder()
        # The encoder output is densified so it can be stacked with the
        # plain numeric array.
        encoded = enc.fit_transform(categorical_part).toarray()
        return np.hstack((encoded, numeric_part)), enc

    if enc_object is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("enc_object must be a fitted encoder when train is False")

    encoded = enc_object.transform(categorical_part).toarray()
    return np.hstack((encoded, numeric_part))


In [15]:
# Fit the one-hot encoder on the training data and build the training feature
# matrix; `enc` is kept so the same encoding can be applied to the test data.
X_train, enc = symbol_con(train, enc_object=None, train=True) 
# Training target: the `response` column.
y_train = train['response']

In [16]:
# Encode the test data with the encoder fitted on the training data.
X_test = symbol_con(test, enc_object=enc, train=False)
# Test target: in the test sheet the label column is named `final_response`.
y_test = test['final_response']

## 3. 建模

### 参数优化选择

In [17]:
# Univariate feature selector: keeps the top 50 percent of features as scored by f_classif.
transform = SelectPercentile(f_classif, percentile=50)  

In [18]:
# AdaBoost classifier created with library defaults; it is the estimator
# placed in the pipeline that the grid search tunes.
model_adaboost = AdaBoostClassifier() 

In [19]:
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve

# Feature selection and the classifier are wrapped in one pipeline so that the
# selector is refitted inside every cross-validation fold.
pipe = Pipeline([('select', transform),
                 ('model_adaboost', model_adaboost)])

# Grid over the AdaBoost hyperparameters; keys use the `<step>__<param>` form.
param_test = {'model_adaboost__n_estimators': [20, 50, 100],
              'model_adaboost__learning_rate': [0.5, 1]
              }
gsearch = GridSearchCV(estimator=pipe, param_grid=param_test, scoring='accuracy', cv=5)
gsearch.fit(X_train, y_train)

# `grid_scores_` was removed from GridSearchCV; the per-candidate mean and
# standard deviation of the test score are read from `cv_results_` instead.
cv_results = gsearch.cv_results_
for mean_score, std_score, params in zip(cv_results['mean_test_score'],
                                         cv_results['std_test_score'],
                                         cv_results['params']):
    print('mean: %.5f, std: %.5f, params: %s' % (mean_score, std_score, params))
print('-'*30)
print(gsearch.best_params_, gsearch.best_score_)

[mean: 0.85406, std: 0.00282, params: {'model_adaboost__learning_rate': 0.5, 'model_adaboost__n_estimators': 20}, mean: 0.85834, std: 0.00353, params: {'model_adaboost__learning_rate': 0.5, 'model_adaboost__n_estimators': 50}, mean: 0.86090, std: 0.00346, params: {'model_adaboost__learning_rate': 0.5, 'model_adaboost__n_estimators': 100}, mean: 0.85369, std: 0.00463, params: {'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 20}, mean: 0.85982, std: 0.00404, params: {'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 50}, mean: 0.86335, std: 0.00406, params: {'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 100}]
------------------------------
{'model_adaboost__learning_rate': 1, 'model_adaboost__n_estimators': 100} 0.863349684653


In [20]:
# Fit the feature selector on the full training set and reduce the training
# matrix to the selected columns in a single step.
reduce_X_train = transform.fit_transform(X_train, y_train)
# Final classifier: 100 estimators, every other hyperparameter left at its default.
final_model = AdaBoostClassifier(n_estimators=100)
final_model.fit(reduce_X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

### 预测

In [21]:
# Apply the already-fitted feature selector to the test matrix (no refit on test data).
reduce_X_test = transform.transform(X_test)  

In [23]:
# Test features without the ground-truth label column.
data = test.drop('final_response', axis=1) 
# Predicted class label for every test row.
predict_labels = pd.DataFrame(final_model.predict(reduce_X_test), columns=['labels'])  
# Per-class probabilities. NOTE(review): the column names assume the class
# order is [0, 1] (no response, response) - confirm against final_model.classes_.
predict_proba = pd.DataFrame(final_model.predict_proba(reduce_X_test), columns=['noproba', 'yesproba']) 
# Place features, predicted label and probabilities side by side.
predict_pd = pd.concat((data, predict_labels, predict_proba), axis=1) 

In [25]:
# Accuracy of the predicted labels against the test labels (`final_response`).
accuracy_score(y_test, predict_labels)

final accuracy: 0.862150853782653


In [24]:
# Export the test features together with the predicted label and class
# probabilities. The sheet name is passed by keyword because recent pandas
# releases deprecate positional arguments after the file path.
predict_pd.to_excel('order_predict_result.xlsx', sheet_name='Sheet1')

## 总结与思考

### 模型实施

- 制定了营销响应率不低于80%的KPI作为本次营销活动的绩效考核目标。
- 结合历史销售订单数据计算本次活动的预期收益，制定ROI目标。
- 基于预期的订单金额和订单数量，以及关联的用券数量和金额，申请对应的营销资源用于促销用户购买转化。