# 导库与库参数设置

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# 导入数据

## 导入源数据与备份源数据

In [2]:
origin_data = pd.read_csv("./data/client_buy.csv")

In [3]:
df = origin_data

## 查看源数据形态

In [4]:
df.head(5)

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25317 entries, 0 to 25316
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         25317 non-null  int64 
 1   age        25317 non-null  int64 
 2   job        25317 non-null  object
 3   marital    25317 non-null  object
 4   education  25317 non-null  object
 5   default    25317 non-null  object
 6   balance    25317 non-null  int64 
 7   housing    25317 non-null  object
 8   loan       25317 non-null  object
 9   contact    25317 non-null  object
 10  day        25317 non-null  int64 
 11  month      25317 non-null  object
 12  duration   25317 non-null  int64 
 13  campaign   25317 non-null  int64 
 14  pdays      25317 non-null  int64 
 15  previous   25317 non-null  int64 
 16  poutcome   25317 non-null  object
 17  y          25317 non-null  int64 
dtypes: int64(9), object(9)
memory usage: 3.5+ MB


In [6]:
df.isnull().sum()

ID           0
age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [7]:
df.describe()

Unnamed: 0,ID,age,balance,day,duration,campaign,pdays,previous,y
count,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0,25317.0
mean,12659.0,40.935379,1357.555082,15.835289,257.732393,2.77205,40.248766,0.591737,0.116957
std,7308.532719,10.634289,2999.822811,8.31948,256.975151,3.136097,100.213541,2.568313,0.321375
min,1.0,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0,0.0
25%,6330.0,33.0,73.0,8.0,103.0,1.0,-1.0,0.0,0.0
50%,12659.0,39.0,448.0,16.0,181.0,2.0,-1.0,0.0,0.0
75%,18988.0,48.0,1435.0,21.0,317.0,3.0,-1.0,0.0,0.0
max,25317.0,95.0,102127.0,31.0,3881.0,55.0,854.0,275.0,1.0


## 特征命名

In [8]:
rename_list = {
    "age":"客户年龄",
    "job":"职业",
    "marital":"婚姻状况",
    "education":"受教育水平",
    "default":"是否有违约记录",
    "balance":"每年账户的平均余额",
    "housing":"是否有住房贷款",
    "loan":"是否有个人贷款",
    "contact":"与客户联系的沟通方式",
    "day":"最后一次联系的时间(几号)",
    "month":"最后一次联系的时间(月份)",
    "duration":"最后一次联系的交流时长",
    "campaign":"在本次活动中，与该客户交流的次数",
    "pdays":"距离上次活动最后一次联系客户，过去了多久",
    "previous":"在本次活动中，与该客户交流过的次数",
    "poutcome":"上一次活动的结果"
}
df1 = df.rename(columns = rename_list)
df1.head(5)

Unnamed: 0,ID,客户年龄,职业,婚姻状况,受教育水平,是否有违约记录,每年账户的平均余额,是否有住房贷款,是否有个人贷款,与客户联系的沟通方式,最后一次联系的时间(几号),最后一次联系的时间(月份),最后一次联系的交流时长,在本次活动中，与该客户交流的次数,距离上次活动最后一次联系客户，过去了多久,在本次活动中，与该客户交流过的次数,上一次活动的结果,y
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown,0
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other,0
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown,0
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown,0
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown,0


In [9]:
Y = df1['y']
Y

0        0
1        0
2        0
3        0
4        0
        ..
25312    1
25313    1
25314    1
25315    1
25316    1
Name: y, Length: 25317, dtype: int64

In [10]:
df1.drop(['y'],axis = 1, inplace = True)

In [11]:
df1

Unnamed: 0,ID,客户年龄,职业,婚姻状况,受教育水平,是否有违约记录,每年账户的平均余额,是否有住房贷款,是否有个人贷款,与客户联系的沟通方式,最后一次联系的时间(几号),最后一次联系的时间(月份),最后一次联系的交流时长,在本次活动中，与该客户交流的次数,距离上次活动最后一次联系客户，过去了多久,在本次活动中，与该客户交流过的次数,上一次活动的结果
0,1,43,management,married,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown
1,2,42,technician,divorced,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other
2,3,47,admin.,married,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown
3,4,28,management,single,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown
4,5,42,technician,divorced,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,25313,55,blue-collar,divorced,primary,no,8180,no,no,cellular,14,may,854,2,360,1,failure
25313,25314,52,services,married,secondary,no,961,no,yes,cellular,18,feb,222,1,553,4,failure
25314,25315,35,blue-collar,divorced,primary,no,300,yes,no,unknown,13,may,945,2,-1,0,unknown
25315,25316,37,entrepreneur,divorced,tertiary,no,66,no,no,cellular,18,nov,1164,2,-1,0,unknown


# 特征工程

## 对职业进行处理

### 人为经验对类别打分

In [12]:
df1['婚姻状况'].nunique()

3

In [13]:
df1['婚姻状况'].value_counts()

婚姻状况
married     15245
single       7157
divorced     2915
Name: count, dtype: int64

In [14]:
marital_list ={
    "married":1,
    "single":2,
    "divorced":3
}

In [15]:
df1['婚姻状况'] = df1['婚姻状况'].map(marital_list)
df1

Unnamed: 0,ID,客户年龄,职业,婚姻状况,受教育水平,是否有违约记录,每年账户的平均余额,是否有住房贷款,是否有个人贷款,与客户联系的沟通方式,最后一次联系的时间(几号),最后一次联系的时间(月份),最后一次联系的交流时长,在本次活动中，与该客户交流的次数,距离上次活动最后一次联系客户，过去了多久,在本次活动中，与该客户交流过的次数,上一次活动的结果
0,1,43,management,1,tertiary,no,291,yes,no,unknown,9,may,150,2,-1,0,unknown
1,2,42,technician,3,primary,no,5076,yes,no,cellular,7,apr,99,1,251,2,other
2,3,47,admin.,1,secondary,no,104,yes,yes,cellular,14,jul,77,2,-1,0,unknown
3,4,28,management,2,secondary,no,-994,yes,yes,cellular,18,jul,174,2,-1,0,unknown
4,5,42,technician,3,secondary,no,2974,yes,no,unknown,21,may,187,5,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,25313,55,blue-collar,3,primary,no,8180,no,no,cellular,14,may,854,2,360,1,failure
25313,25314,52,services,1,secondary,no,961,no,yes,cellular,18,feb,222,1,553,4,failure
25314,25315,35,blue-collar,3,primary,no,300,yes,no,unknown,13,may,945,2,-1,0,unknown
25315,25316,37,entrepreneur,3,tertiary,no,66,no,no,cellular,18,nov,1164,2,-1,0,unknown


In [16]:


education_lust = {
    "secondary":1,
    "tertiary":2,
    "primary":3,
    "unknown":4
}
default_list = {
    "no":1,
    "yes":2
}
housing_list = {
    "no":1,
    "yes":2
}
loan_list = {
    "no":1,
    "yes":2
}
month_list = {
    "jan":1,
    "feb":2,
    "mar":3,
    "apr":4,
    "may":5,
    "jun":6,
    "jul":7,
    "aug":8,
    "sep":9,
    "oct":10,
    "nov":11,
    "Dec":12,
}
poutcome_list = {
    "unknown":1,
    "failure":2,
    "other":3,
    "success":4
}
contact_list = {
    "cellular":1,
    "unknown":2,
    "telephone":3
}

In [17]:
df1['受教育水平'] = df1['受教育水平'].map(education_lust)
df1['是否有违约记录'] =df1['是否有违约记录'].map(default_list)
df1['是否有住房贷款'] = df1['是否有住房贷款'].map(housing_list)
df1['是否有个人贷款'] = df1['是否有个人贷款'].map(loan_list)
df1['最后一次联系的时间(月份)'] =df1['最后一次联系的时间(月份)'].map(month_list).fillna(0)
df1['上一次活动的结果'] = df1['上一次活动的结果'].map(poutcome_list)
df1['与客户联系的沟通方式'] = df1["与客户联系的沟通方式"].map(contact_list)

In [18]:
df1.head()

Unnamed: 0,ID,客户年龄,职业,婚姻状况,受教育水平,是否有违约记录,每年账户的平均余额,是否有住房贷款,是否有个人贷款,与客户联系的沟通方式,最后一次联系的时间(几号),最后一次联系的时间(月份),最后一次联系的交流时长,在本次活动中，与该客户交流的次数,距离上次活动最后一次联系客户，过去了多久,在本次活动中，与该客户交流过的次数,上一次活动的结果
0,1,43,management,1,2,1,291,2,1,2,9,5.0,150,2,-1,0,1
1,2,42,technician,3,3,1,5076,2,1,1,7,4.0,99,1,251,2,3
2,3,47,admin.,1,1,1,104,2,2,1,14,7.0,77,2,-1,0,1
3,4,28,management,2,1,1,-994,2,2,1,18,7.0,174,2,-1,0,1
4,5,42,technician,3,1,1,2974,2,1,2,21,5.0,187,5,-1,0,1


### 哑变量编码

In [19]:
one_hot_job = pd.get_dummies(df1['职业'])
one_hot_job.head()

Unnamed: 0,admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown
0,False,False,False,False,True,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,True,False,False
2,True,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,True,False,False


### 删除原特征

In [20]:
df1.drop(['职业','婚姻状况'],axis=1, inplace=True)

In [21]:
df1

Unnamed: 0,ID,客户年龄,受教育水平,是否有违约记录,每年账户的平均余额,是否有住房贷款,是否有个人贷款,与客户联系的沟通方式,最后一次联系的时间(几号),最后一次联系的时间(月份),最后一次联系的交流时长,在本次活动中，与该客户交流的次数,距离上次活动最后一次联系客户，过去了多久,在本次活动中，与该客户交流过的次数,上一次活动的结果
0,1,43,2,1,291,2,1,2,9,5.0,150,2,-1,0,1
1,2,42,3,1,5076,2,1,1,7,4.0,99,1,251,2,3
2,3,47,1,1,104,2,2,1,14,7.0,77,2,-1,0,1
3,4,28,1,1,-994,2,2,1,18,7.0,174,2,-1,0,1
4,5,42,1,1,2974,2,1,2,21,5.0,187,5,-1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,25313,55,3,1,8180,1,1,1,14,5.0,854,2,360,1,2
25313,25314,52,1,1,961,1,2,1,18,2.0,222,1,553,4,2
25314,25315,35,3,1,300,2,1,2,13,5.0,945,2,-1,0,1
25315,25316,37,2,1,66,1,1,1,18,11.0,1164,2,-1,0,1


### 加入处理后的特征

In [22]:
X = pd.concat([df1,one_hot_job],axis=1)
X

Unnamed: 0,ID,客户年龄,受教育水平,是否有违约记录,每年账户的平均余额,是否有住房贷款,是否有个人贷款,与客户联系的沟通方式,最后一次联系的时间(几号),最后一次联系的时间(月份),...,entrepreneur,housemaid,management,retired,self-employed,services,student,technician,unemployed,unknown
0,1,43,2,1,291,2,1,2,9,5.0,...,False,False,True,False,False,False,False,False,False,False
1,2,42,3,1,5076,2,1,1,7,4.0,...,False,False,False,False,False,False,False,True,False,False
2,3,47,1,1,104,2,2,1,14,7.0,...,False,False,False,False,False,False,False,False,False,False
3,4,28,1,1,-994,2,2,1,18,7.0,...,False,False,True,False,False,False,False,False,False,False
4,5,42,1,1,2974,2,1,2,21,5.0,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25312,25313,55,3,1,8180,1,1,1,14,5.0,...,False,False,False,False,False,False,False,False,False,False
25313,25314,52,1,1,961,1,2,1,18,2.0,...,False,False,False,False,False,True,False,False,False,False
25314,25315,35,3,1,300,2,1,2,13,5.0,...,False,False,False,False,False,False,False,False,False,False
25315,25316,37,2,1,66,1,1,1,18,11.0,...,True,False,False,False,False,False,False,False,False,False


## 归一化处理

# 建模

## 划分数据集

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [24]:
train_x = np.array(X.iloc[:,1:])
train_x

array([[43, 2, 1, ..., False, False, False],
       [42, 3, 1, ..., True, False, False],
       [47, 1, 1, ..., False, False, False],
       ...,
       [35, 3, 1, ..., False, False, False],
       [37, 2, 1, ..., False, False, False],
       [52, 2, 1, ..., False, False, False]], dtype=object)

In [25]:
train_x.shape

(25317, 26)

In [26]:
train_y = np.array(Y)
train_y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [27]:
train_y.shape

(25317,)

In [28]:
SD = StandardScaler().fit(train_x)
trans_train_x = SD.transform(train_x)
trans_train_x

array([[ 0.19415134,  0.31703208, -0.13421772, ..., -0.44858014,
        -0.16875252, -0.08049896],
       [ 0.10011404,  1.46709673, -0.13421772, ...,  2.22925607,
        -0.16875252, -0.08049896],
       [ 0.57030052, -0.83303257, -0.13421772, ..., -0.44858014,
        -0.16875252, -0.08049896],
       ...,
       [-0.55814702,  1.46709673, -0.13421772, ..., -0.44858014,
        -0.16875252, -0.08049896],
       [-0.37007243,  0.31703208, -0.13421772, ..., -0.44858014,
        -0.16875252, -0.08049896],
       [ 1.04048699,  0.31703208, -0.13421772, ..., -0.44858014,
        -0.16875252, -0.08049896]])

In [29]:
lreg = LogisticRegression()
fit_reg = lreg.fit(trans_train_x,train_y)

In [30]:
fit_reg.intercept_,fit_reg.coef_

(array([-2.70820564]),
 array([[-3.33294244e-02,  1.77068509e-02, -4.05345802e-02,
          7.26781423e-02, -4.78717233e-01, -1.91300964e-01,
         -2.63033419e-01, -2.84267063e-02, -6.66990917e-02,
          1.03711047e+00, -3.32780526e-01, -1.13909477e-01,
          2.27230728e-05,  6.03687340e-01,  5.99335703e-02,
         -1.51350907e-01, -6.85707779e-02, -6.97251277e-02,
          6.98977447e-02,  1.37218850e-01, -1.08566296e-03,
         -4.46790934e-02,  9.93731113e-02,  1.95146253e-02,
          5.65875918e-03, -1.38294299e-02]]))

In [31]:
pre_y = fit_reg.predict(trans_train_x)

In [32]:
accuracy_score(train_y,pre_y)

0.9001461468578426

In [33]:
mean_squared_error(train_y,pre_y)

0.09985385314215744

需要整理测试集和训练集一起，重新整理代码划分