# logistic regression model

####  何数学

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.optimize import minimize

# >>> Part 1：数据预处理

#### 数据读入

In [89]:
data = pd.read_csv(r'.\dataset\train.csv')
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### 提取连续变量与分类变量列表：

In [4]:
con_rv = data.describe().columns # 连续变量
dis_rv = [x for x in data.columns if x not in con_rv] # 分类变量

In [90]:
data['native_country'].loc[data['native_country'] == ' ?'] = ' United-States' # native_country 缺失值填充

#### 将分类变量转为哑变量：

In [91]:
for rv in dis_rv:
    this_dummy = pd.get_dummies(data[rv]).iloc[:,1:] # 产生哑变量列；防止共线性，去掉一列
    data = pd.concat([data,this_dummy], axis=1) # 与原数据合并
    data.drop(rv, axis=1, inplace=True) # 去掉原来的变量列
print(data.shape)
data

(32561, 100)


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,>50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
32557,40,154374,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
32558,58,151910,9,0,0,40,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
32559,22,201490,9,0,0,20,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


# >>> Part 2：逻辑回归

## 2.0 先定义分类器评价指标：

In [7]:
from sklearn.metrics import confusion_matrix,classification_report, roc_auc_score,accuracy_score, precision_score, recall_score, f1_score

def clf_Evaluate(y_, y_pred,method):
    confusionmatrix = confusion_matrix(y_, y_pred)
    print('Confusion matrix:\n\n',confusionmatrix)
    print('\nClassification report:\n\n',classification_report(y_, y_pred))
    
    result = pd.DataFrame(columns = ['auc','accuracy','precision','recall','f1'])
    score_list = {
        'auc': round(roc_auc_score(y_, y_pred),4),
        'accuracy': round(accuracy_score(y_, y_pred),4),
        'precision': round(precision_score(y_, y_pred),4),
        'recall': round(recall_score(y_, y_pred),4),
        'f1': round(f1_score(y_, y_pred),4)
    }
    result = result.append(score_list, ignore_index=True)
    result.index = pd.Series([method])
    return result

## 2.1 未对连续型特征实施标准化时：

In [8]:
from sklearn.model_selection import train_test_split
X = data.drop([' >50K'],axis = 1)
y = data[' >50K']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [9]:
def cal_yhat(theta, X):
    X = np.hstack([np.ones((X.shape[0],1)),X])
    z = X @ theta
    return 1 / (1 + np.exp(-z))

def cross_entropy(theta, X, y):
    yhat = cal_yhat(theta, X)
    return -(y.T @ np.log(yhat) + (1-y).T @ np.log(1-yhat)).sum()/X.shape[0]

#### —— 无正则化（regularization，Regu）
* 缩写：特征标准化 - feature normalization；FN、正则化 - regularization，Regu

In [10]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化

def opt_cross_entropy(theta):
    loss = cross_entropy(theta, X_train, y_train)
#     print(loss)
    return loss

opt1 = minimize(opt_cross_entropy, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt1.x])
print('迭代终止是否成功：', opt1.success)
print('迭代终止原因：', opt1.message)

0.6931471805599464
0.6931471805599464
0.6931471844284165
0.6931473098778413
0.693890398241572
0.6931472140381993
0.6931407617371697
0.6931471369465451
0.6931473192286616
0.6931471806030965
0.6931471807446419
0.6931471805606002
0.6931471834941519
0.6931471805432748
0.6931471808008677
0.6931471807044338
0.6931471805635423
0.6931471807975987
0.6931471806465735
0.6931471805965587
0.6931471806272868
0.6931471806936463
0.693147180666514
0.6931471806750134
0.6931471807201247
0.6931471807629479
0.693147180517777
0.6931471822110908
0.6931471805115661
0.6931471805717146
0.6931471805017592
0.6931471815860685
0.6931471805602734
0.6931471809286834
0.6931471806370936
0.6931471827880596
0.6931471807580445
0.6931471807551025
0.6931471811937949
0.6931471805619078
0.6931471810784012
0.6931471805818483
0.6931471807335274
0.6931471808322496
0.6931471809058009
0.6931471812572125
0.69314718058904
0.6931471806583417
0.6931471806115957
0.6931471809528736
0.6931471806410163
0.6931471807760238
0.693147182068891

In [36]:
pred = cal_yhat(np.asarray(opt1.x), X_train)
pred = pred>0.5
train_nonor_noregu = clf_Evaluate(y_train, pred,
             'training: without FN & Regu')
train_nonor_noregu

Confusion matrix:

 [[16095  1218]
 [ 2649  2830]]

Classification report:

               precision    recall  f1-score   support

           0       0.86      0.93      0.89     17313
           1       0.70      0.52      0.59      5479

   micro avg       0.83      0.83      0.83     22792
   macro avg       0.78      0.72      0.74     22792
weighted avg       0.82      0.83      0.82     22792



Unnamed: 0,auc,accuracy,precision,recall,f1
training: without FN & Regu,0.7231,0.8303,0.6991,0.5165,0.5941


In [37]:
pred = cal_yhat(np.asarray(opt1.x), X_test)
pred = pred>0.5
test_nonor_noregu = clf_Evaluate(y_test, pred, 
                            'testing: without FN & Regu')
test_nonor_noregu

Confusion matrix:

 [[6907  500]
 [1112 1250]]

Classification report:

               precision    recall  f1-score   support

           0       0.86      0.93      0.90      7407
           1       0.71      0.53      0.61      2362

   micro avg       0.83      0.83      0.83      9769
   macro avg       0.79      0.73      0.75      9769
weighted avg       0.83      0.83      0.83      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: without FN & Regu,0.7309,0.835,0.7143,0.5292,0.608


#### 可以看到，accuracy 在0.83~0.84，auc 为0.73，f1 为0.6。

#### 训练集与测试集结果非常相近，即不存在明显过拟合，因此后续只展示测试集上的结果。

#### —— L2 正则

In [13]:
def regu2(theta):
    return lamda * (theta.T @ theta).sum()

def opt_cross_entropy_L2(theta):
    loss = cross_entropy(theta, X_train, y_train) + regu2(theta)
    print(loss)
    return loss

theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.01 # 正则力度

opt2 = minimize(opt_cross_entropy_L2, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt2.x])
print('迭代终止是否成功：', opt2.success)
print('迭代终止原因：', opt2.message)

0.6931471805599464
0.6931471805599464
0.6931471844284165
0.6931473098778413
0.693890398241572
0.6931472140381993
0.6931407617371697
0.6931471369465451
0.6931473192286616
0.6931471806030965
0.6931471807446419
0.6931471805606002
0.6931471834941519
0.6931471805432748
0.6931471808008677
0.6931471807044338
0.6931471805635423
0.6931471807975987
0.6931471806465735
0.6931471805965587
0.6931471806272868
0.6931471806936463
0.693147180666514
0.6931471806750134
0.6931471807201247
0.6931471807629479
0.693147180517777
0.6931471822110908
0.6931471805115661
0.6931471805717146
0.6931471805017592
0.6931471815860685
0.6931471805602734
0.6931471809286834
0.6931471806370936
0.6931471827880596
0.6931471807580445
0.6931471807551025
0.6931471811937949
0.6931471805619078
0.6931471810784012
0.6931471805818483
0.6931471807335274
0.6931471808322496
0.6931471809058009
0.6931471812572125
0.69314718058904
0.6931471806583417
0.6931471806115957
0.6931471809528736
0.6931471806410163
0.6931471807760238
0.693147182068891

In [31]:
pred = cal_yhat(np.asarray(opt2.x), X_test)
pred = pred>0.5
nonor_regu_L2_001 = clf_Evaluate(y_test, pred, 
                            'testing: without FN & Regu L2=0.01')
nonor_regu_L2_001

Confusion matrix:

 [[7083  324]
 [1313 1049]]

Classification report:

               precision    recall  f1-score   support

           0       0.84      0.96      0.90      7407
           1       0.76      0.44      0.56      2362

   micro avg       0.83      0.83      0.83      9769
   macro avg       0.80      0.70      0.73      9769
weighted avg       0.82      0.83      0.82      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: without FN & Regu L2=0.01,0.7002,0.8324,0.764,0.4441,0.5617


In [23]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.1 # 正则力度

opt3 = minimize(opt_cross_entropy_L2, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt3.x])
print('迭代终止是否成功：', opt3.success)
print('迭代终止原因：', opt3.message)

0.6931471805599464
0.6931471805599464
0.6931471844284165
0.6931473098778413
0.693890398241572
0.6931472140381993
0.6931407617371697
0.6931471369465451
0.6931473192286616
0.6931471806030965
0.6931471807446419
0.6931471805606002
0.6931471834941519
0.6931471805432748
0.6931471808008677
0.6931471807044338
0.6931471805635423
0.6931471807975987
0.6931471806465735
0.6931471805965587
0.6931471806272868
0.6931471806936463
0.693147180666514
0.6931471806750134
0.6931471807201247
0.6931471807629479
0.693147180517777
0.6931471822110908
0.6931471805115661
0.6931471805717146
0.6931471805017592
0.6931471815860685
0.6931471805602734
0.6931471809286834
0.6931471806370936
0.6931471827880596
0.6931471807580445
0.6931471807551025
0.6931471811937949
0.6931471805619078
0.6931471810784012
0.6931471805818483
0.6931471807335274
0.6931471808322496
0.6931471809058009
0.6931471812572125
0.69314718058904
0.6931471806583417
0.6931471806115957
0.6931471809528736
0.6931471806410163
0.6931471807760238
0.693147182068891

In [32]:
pred = cal_yhat(np.asarray(opt3.x), X_test)
pred = pred>0.5
nonor_regu_L2_01 = clf_Evaluate(y_test, pred, 
                            'testing: without FN & Regu L2 = 0.1')
nonor_regu_L2_01

Confusion matrix:

 [[7235  172]
 [1688  674]]

Classification report:

               precision    recall  f1-score   support

           0       0.81      0.98      0.89      7407
           1       0.80      0.29      0.42      2362

   micro avg       0.81      0.81      0.81      9769
   macro avg       0.80      0.63      0.65      9769
weighted avg       0.81      0.81      0.77      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: without FN & Regu L2 = 0.1,0.6311,0.8096,0.7967,0.2854,0.4202


#### —— L1正则

In [25]:
def regu1(theta):
    return lamda * (abs(theta)).sum()

def opt_cross_entropy_L1(theta):
    loss = cross_entropy(theta, X_train, y_train) + regu1(theta)
#     print(loss)
    return loss

theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.01

opt4 = minimize(opt_cross_entropy_L1, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt4.x])
print('迭代终止是否成功：', opt4.success)
print('迭代终止原因：', opt4.message)

Optimal weights： [-1.2065, -0.0163, -0.0, 0.0859, 0.0003, 0.0007, -0.0033, 0.0024, -0.0006, 0.0008, -0.1289, 0.0003, 0.0018, 0.0003, -0.0004, -0.0002, -0.0008, -0.001, 0.0011, 0.0008, 0.0001, 0.0009, 0.0007, -0.0001, -0.0001, -0.3837, 0.002, -0.0004, -0.001, -0.0124, 0.0005, 1.1355, 0.001, -1.1852, -0.0007, 0.0006, 0.0009, -0.0007, -0.0005, 0.3611, -0.0007, -0.0017, -0.0004, -0.0498, -0.0004, 0.1108, 0.0004, 0.0014, 0.0003, -0.0022, -0.0018, 0.0009, -0.3531, -0.0013, 0.0011, -0.0013, -0.0004, 0.0016, -0.0013, 0.0004, 0.0012, 0.0004, -0.002, -0.0006, -0.0005, -0.0004, 0.0013, -0.002, -0.0002, -0.0008, 0.0004, -0.0019, -0.0007, 0.0006, -0.0005, -0.0002, 0.0004, 0.0002, 0.0005, -0.0004, 0.0002, 0.0006, -0.0004, 0.001, -0.0002, 0.0005, 0.0017, -0.0009, 0.0003, 0.0019, -0.0, -0.0019, 0.0012, -0.0013, -0.001, 0.0003, 0.0009, -0.0804, -0.0014, -0.0006]
迭代终止是否成功： False
迭代终止原因： Iteration limit exceeded


In [33]:
pred = cal_yhat(np.asarray(opt4.x), X_test)
pred = pred>0.5
nonor_regu_L1_001 = clf_Evaluate(y_test, pred, 
                            'testing: without FN & Regu L1=0.01')
nonor_regu_L1_001

Confusion matrix:

 [[7117  290]
 [1439  923]]

Classification report:

               precision    recall  f1-score   support

           0       0.83      0.96      0.89      7407
           1       0.76      0.39      0.52      2362

   micro avg       0.82      0.82      0.82      9769
   macro avg       0.80      0.68      0.70      9769
weighted avg       0.81      0.82      0.80      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: without FN & Regu L1=0.01,0.6758,0.823,0.7609,0.3908,0.5164


In [27]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.1

opt5 = minimize(opt_cross_entropy_L1, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt5.x])
print('迭代终止是否成功：', opt5.success)
print('迭代终止原因：', opt5.message)

Optimal weights： [-0.0009, -0.0022, -0.0, 0.0004, 0.0002, 0.0007, -0.01, 0.0001, 0.0002, 0.0004, -0.0003, 0.0004, 0.0002, -0.0003, -0.0002, 0.0001, -0.0001, 0.0011, -0.0005, 0.0005, 0.0003, -0.0005, 0.0005, 0.0005, 0.0003, -0.0009, 0.0001, 0.0008, -0.0001, 0.0007, -0.0004, 0.0019, 0.0002, -0.0017, 0.0001, 0.0002, 0.0002, -0.0001, -0.0003, -0.0001, -0.0002, 0.0001, -0.0, 0.0005, 0.0001, -0.0001, -0.0003, 0.0001, 0.0003, -0.0004, 0.0001, -0.0001, -0.0006, 0.0005, 0.0007, -0.0, 0.0003, 0.0002, -0.0008, 0.0003, 0.0002, -0.0005, 0.0006, -0.0001, 0.0001, 0.0004, -0.0, -0.0001, -0.0003, 0.0001, 0.0007, -0.0002, -0.0006, -0.0001, -0.0002, 0.0001, -0.0005, 0.0002, 0.0001, -0.0, -0.0002, -0.0, -0.0001, 0.0, -0.0002, 0.0003, 0.0008, 0.0001, 0.0003, -0.0002, -0.0004, -0.0002, 0.0001, 0.0002, -0.0002, 0.0001, -0.0003, -0.0004, 0.0002, -0.0003]
迭代终止是否成功： False
迭代终止原因： Iteration limit exceeded


In [41]:
pred = cal_yhat(np.asarray(opt5.x), X_test)
pred = pred>0.5
nonor_regu_L1_01 = clf_Evaluate(y_test, pred, 
                            'testing: without FN & Regu L1=0.1')
nonor_regu_L1_01

Confusion matrix:

 [[4541 2866]
 [1728  634]]

Classification report:

               precision    recall  f1-score   support

           0       0.72      0.61      0.66      7407
           1       0.18      0.27      0.22      2362

   micro avg       0.53      0.53      0.53      9769
   macro avg       0.45      0.44      0.44      9769
weighted avg       0.59      0.53      0.56      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: without FN & Regu L1=0.1,0.4407,0.5297,0.1811,0.2684,0.2163


### >>> 未对特征做标准化情况下的汇总：

In [55]:
pd.concat([train_nonor_noregu, test_nonor_noregu,
           nonor_regu_L2_001, nonor_regu_L2_01, 
           nonor_regu_L1_001, nonor_regu_L1_01])

Unnamed: 0,auc,accuracy,precision,recall,f1
training: without FN & Regu,0.7231,0.8303,0.6991,0.5165,0.5941
testing: without FN & Regu,0.7309,0.835,0.7143,0.5292,0.608
testing: without FN & Regu L2=0.01,0.7002,0.8324,0.764,0.4441,0.5617
testing: without FN & Regu L2 = 0.1,0.6311,0.8096,0.7967,0.2854,0.4202
testing: without FN & Regu L1=0.01,0.6758,0.823,0.7609,0.3908,0.5164
testing: without FN & Regu L1=0.1,0.4407,0.5297,0.1811,0.2684,0.2163


#### 训练集与测试集结果相近，即表明没有明显过拟合。
#### 若加入正则力度，可以看到 precision 能有所提升，但是其他指标均下降。

## 2.2 对连续特征做标准化后：

In [92]:
miu_list = []
sigma_list = []
for rv in con_rv:
    miu = data[rv].mean()
    miu_list.append(miu)
    sigma = data[rv].var()**0.5
    sigma_list.append(sigma)
    data[rv] = (data[rv] - miu) / sigma
data.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,>50K
0,0.03067,-1.063594,1.134721,0.148451,-0.216656,-0.035429,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0.837096,-1.008692,1.134721,-0.145918,-0.216656,-2.222119,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,-0.042641,0.245075,-0.420053,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,1.057031,0.425795,-1.19744,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,-0.775756,1.408154,1.134721,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [40]:
X = data.drop([' >50K'],axis = 1)
y = data[' >50K']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#### —— 无正则化（regularization，Regu）

In [43]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化

def opt_cross_entropy(theta):
    loss = cross_entropy(theta, X_train, y_train)
    print(loss)
    return loss

opt6 = minimize(opt_cross_entropy, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt6.x])
print('迭代终止是否成功：', opt6.success)
print('迭代终止原因：', opt6.message)

0.6931471805599464
0.6931471805599464
0.6931471844284165
0.6931471790985476
0.6931471806334962
0.6931471784149184
0.6931471791261109
0.693147179613575
0.6931471791213918
0.6931471806030965
0.6931471807446419
0.6931471805606002
0.6931471834941519
0.6931471805432748
0.6931471808008677
0.6931471807044338
0.6931471805635423
0.6931471807975987
0.6931471806465735
0.6931471805965587
0.6931471806272868
0.6931471806936463
0.693147180666514
0.6931471806750134
0.6931471807201247
0.6931471807629479
0.693147180517777
0.6931471822110908
0.6931471805115661
0.6931471805717146
0.6931471805017592
0.6931471815860685
0.6931471805602734
0.6931471809286834
0.6931471806370936
0.6931471827880596
0.6931471807580445
0.6931471807551025
0.6931471811937949
0.6931471805619078
0.6931471810784012
0.6931471805818483
0.6931471807335274
0.6931471808322496
0.6931471809058009
0.6931471812572125
0.69314718058904
0.6931471806583417
0.6931471806115957
0.6931471809528736
0.6931471806410163
0.6931471807760238
0.693147182068891

In [44]:
pred = cal_yhat(np.asarray(opt6.x), X_test)
pred = pred>0.5
nor_noregu = clf_Evaluate(y_test, pred,
             'testing: FN & without Regu')
nor_noregu

Confusion matrix:

 [[6874  533]
 [ 931 1431]]

Classification report:

               precision    recall  f1-score   support

           0       0.88      0.93      0.90      7407
           1       0.73      0.61      0.66      2362

   micro avg       0.85      0.85      0.85      9769
   macro avg       0.80      0.77      0.78      9769
weighted avg       0.84      0.85      0.85      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: FN & without Regu,0.7669,0.8501,0.7286,0.6058,0.6616


#### —— L2 正则

In [45]:
def opt_cross_entropy_L2(theta):
    loss = cross_entropy(theta, X_train, y_train) + regu2(theta)
#     print(loss)
    return loss

theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.01

opt7 = minimize(opt_cross_entropy_L2, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt7.x])
print('迭代终止是否成功：', opt7.success)
print('迭代终止原因：', opt7.message)

Optimal weights： [-0.6286, 0.2703, 0.0296, 0.5923, 0.7961, 0.201, 0.3258, 0.0727, -0.0709, -0.0005, -0.1574, 0.0489, -0.1929, -0.0975, -0.0034, -0.0784, -0.022, -0.0074, -0.0158, -0.036, -0.0318, -0.073, -0.062, -0.0118, 0.0192, -0.226, 0.0427, -0.0035, 0.0307, -0.1045, 0.0061, 0.6654, -0.0445, -0.6056, -0.0922, -0.1062, -0.139, -0.0016, -0.0329, 0.2826, -0.149, -0.1116, -0.111, -0.2799, -0.0142, 0.092, 0.0312, 0.0233, 0.0609, -0.0516, -0.4629, -0.1126, -0.3767, -0.3755, 0.1864, -0.094, -0.2036, -0.0394, -0.2434, 0.0915, -0.0052, -0.0199, -0.0157, -0.0062, -0.0103, -0.0051, -0.0075, -0.0062, -0.0035, -0.0079, -0.0055, -0.0065, -0.007, -0.0005, -0.0016, -0.0006, -0.0031, -0.014, -0.0042, -0.0017, 0.0013, -0.0128, -0.0042, -0.0012, -0.0694, -0.003, -0.0034, -0.0049, -0.0084, -0.0045, -0.0055, -0.021, -0.0016, -0.016, -0.0018, -0.0033, -0.0049, -0.318, -0.0148, -0.0011]
迭代终止是否成功： True
迭代终止原因： Optimization terminated successfully.


In [46]:
pred = cal_yhat(np.asarray(opt7.x), X_test)
pred = pred>0.5
nor_regu_L2_001 = clf_Evaluate(y_test, pred,
             'testing: FN & Regu L2 = 0.01')
nor_regu_L2_001

Confusion matrix:

 [[6971  436]
 [1153 1209]]

Classification report:

               precision    recall  f1-score   support

           0       0.86      0.94      0.90      7407
           1       0.73      0.51      0.60      2362

   micro avg       0.84      0.84      0.84      9769
   macro avg       0.80      0.73      0.75      9769
weighted avg       0.83      0.84      0.83      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: FN & Regu L2 = 0.01,0.7265,0.8373,0.735,0.5119,0.6034


In [47]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.1

opt8 = minimize(opt_cross_entropy_L2, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt8.x])
print('迭代终止是否成功：', opt8.success)
print('迭代终止原因：', opt8.message)

Optimal weights： [-0.2729, 0.172, 0.0052, 0.2983, 0.2044, 0.1198, 0.1864, 0.0052, -0.0174, -0.0001, -0.1802, 0.0143, -0.0292, -0.0181, -0.0006, -0.0244, -0.0085, -0.0035, -0.0066, -0.0138, -0.011, -0.0131, -0.0147, 0.0149, 0.0093, -0.1405, 0.0237, -0.0011, 0.0135, -0.0793, 0.0005, 0.1648, -0.0115, -0.2492, -0.0276, -0.0306, -0.0664, -0.0003, -0.0301, 0.057, -0.0252, -0.0295, -0.0347, -0.0825, -0.0037, 0.0213, 0.0004, -0.016, 0.001, -0.0174, -0.188, -0.0277, -0.1248, -0.1023, 0.0256, -0.0144, -0.0671, -0.0079, -0.1746, -0.0335, -0.0009, -0.0022, -0.0025, -0.0012, -0.0022, -0.0007, -0.0018, -0.0008, -0.0004, -0.0013, -0.0005, -0.0017, -0.0014, -0.0001, -0.0004, -0.0002, -0.0004, -0.001, -0.0003, -0.0004, 0.0, -0.0025, -0.0003, -0.0002, -0.0154, -0.0007, -0.0006, -0.0009, -0.0015, -0.0008, -0.001, -0.0037, -0.0003, -0.0021, 0.0, -0.0005, -0.0007, -0.2192, -0.0022, -0.0001]
迭代终止是否成功： True
迭代终止原因： Optimization terminated successfully.


In [49]:
pred = cal_yhat(np.asarray(opt8.x), X_test)
pred = pred>0.5
nor_regu_L2_01 = clf_Evaluate(y_test, pred,
             'testing: FN & Regu L2 = 0.1')
nor_regu_L2_01

Confusion matrix:

 [[7216  191]
 [1717  645]]

Classification report:

               precision    recall  f1-score   support

           0       0.81      0.97      0.88      7407
           1       0.77      0.27      0.40      2362

   micro avg       0.80      0.80      0.80      9769
   macro avg       0.79      0.62      0.64      9769
weighted avg       0.80      0.80      0.77      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: FN & Regu L2 = 0.1,0.6236,0.8047,0.7715,0.2731,0.4034


#### —— L1 正则

In [50]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.01

opt9 = minimize(opt_cross_entropy_L1, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt9.x])
print('迭代终止是否成功：', opt9.success)
print('迭代终止原因：', opt9.message)

Optimal weights： [-1.5998, 0.212, 0.0004, 0.7409, 1.4403, 0.1724, 0.2936, -0.0005, 0.0003, -0.0013, -0.0008, -0.0006, 0.0001, -0.0008, -0.0, -0.0001, 0.0004, -0.0003, -0.0006, -0.0, -0.0005, -0.0002, 0.0006, -0.0006, 0.0018, -0.0013, -0.0002, 0.0002, 0.0005, 0.0003, -0.0007, 1.0667, -0.0003, -0.842, 0.0002, -0.0002, 0.0001, 0.0019, -0.0006, 0.0296, 0.0009, -0.001, -0.0001, -0.0004, -0.0009, -0.0004, 0.0016, -0.0, 0.0007, -0.0004, -0.0849, 0.0001, -0.0016, -0.0951, -0.0004, 0.0007, -0.0007, -0.0002, -0.0024, 0.0001, -0.0017, 0.0001, -0.0006, -0.0003, 0.0003, 0.0001, -0.0012, 0.0001, 0.0003, -0.0006, 0.0002, -0.0, 0.0003, -0.0011, -0.0002, -0.0001, 0.0013, 0.0004, -0.0002, 0.0015, -0.0011, 0.001, 0.0003, 0.0003, -0.0008, -0.0004, -0.0004, -0.0003, 0.0, 0.0008, 0.0002, -0.0003, 0.0006, 0.0002, -0.0001, 0.0008, -0.0003, -0.0061, -0.0015, 0.0017]
迭代终止是否成功： False
迭代终止原因： Iteration limit exceeded


In [51]:
pred = cal_yhat(np.asarray(opt9.x), X_test)
pred = pred>0.5
nor_regu_L1_001 = clf_Evaluate(y_test, pred,
             'testing: FN & Regu L1 = 0.01')
nor_regu_L1_001

Confusion matrix:

 [[6981  426]
 [1120 1242]]

Classification report:

               precision    recall  f1-score   support

           0       0.86      0.94      0.90      7407
           1       0.74      0.53      0.62      2362

   micro avg       0.84      0.84      0.84      9769
   macro avg       0.80      0.73      0.76      9769
weighted avg       0.83      0.84      0.83      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: FN & Regu L1 = 0.01,0.7342,0.8417,0.7446,0.5258,0.6164


In [52]:
theta = np.zeros(shape=(X.shape[1]+1,1)) #初始化
lamda = 0.1

opt10 = minimize(opt_cross_entropy_L1, theta, tol=1e-5, method='SLSQP')
print('Optimal weights：',[round(x,4) for x in opt10.x])
print('迭代终止是否成功：', opt10.success)
print('迭代终止原因：', opt10.message)

Optimal weights： [-0.0004, 0.0002, 0.0, 0.0001, 0.0002, 0.0001, 0.0002, -0.0, 0.0, 0.0, -0.0002, 0.0, 0.0001, 0.0, 0.0, 0.0001, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0001, 0.0, -0.0001, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0002, 0.0001, 0.0001, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0001, -0.0001, 0.0001, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, -0.0003, -0.0001, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0003, -0.0, 0.0]
迭代终止是否成功： True
迭代终止原因： Optimization terminated successfully.


In [53]:
pred = cal_yhat(np.asarray(opt10.x), X_test)
pred = pred>0.5
nor_regu_L1_01 = clf_Evaluate(y_test, pred,
             'testing: FN & Regu L1 = 0.1')
nor_regu_L1_01

Confusion matrix:

 [[7384   23]
 [2278   84]]

Classification report:

               precision    recall  f1-score   support

           0       0.76      1.00      0.87      7407
           1       0.79      0.04      0.07      2362

   micro avg       0.76      0.76      0.76      9769
   macro avg       0.77      0.52      0.47      9769
weighted avg       0.77      0.76      0.67      9769



Unnamed: 0,auc,accuracy,precision,recall,f1
testing: FN & Regu L1 = 0.1,0.5162,0.7645,0.785,0.0356,0.068


### >>> 是否对特征做标准化、是否采取正则化的汇总：

In [57]:
pd.concat([train_nonor_noregu, test_nonor_noregu,
           nonor_regu_L2_001, nonor_regu_L2_01, 
           nonor_regu_L1_001, nonor_regu_L1_01,
           nor_noregu,
           nor_regu_L2_001, nor_regu_L2_01, 
           nor_regu_L1_001, nor_regu_L1_01])

Unnamed: 0,auc,accuracy,precision,recall,f1
training: without FN & Regu,0.7231,0.8303,0.6991,0.5165,0.5941
testing: without FN & Regu,0.7309,0.835,0.7143,0.5292,0.608
testing: without FN & Regu L2=0.01,0.7002,0.8324,0.764,0.4441,0.5617
testing: without FN & Regu L2 = 0.1,0.6311,0.8096,0.7967,0.2854,0.4202
testing: without FN & Regu L1=0.01,0.6758,0.823,0.7609,0.3908,0.5164
testing: without FN & Regu L1=0.1,0.4407,0.5297,0.1811,0.2684,0.2163
testing: FN & without Regu,0.7669,0.8501,0.7286,0.6058,0.6616
testing: FN & Regu L2 = 0.01,0.7265,0.8373,0.735,0.5119,0.6034
testing: FN & Regu L2 = 0.1,0.6236,0.8047,0.7715,0.2731,0.4034
testing: FN & Regu L1 = 0.01,0.7342,0.8417,0.7446,0.5258,0.6164


### 总结：
#### 当对特征采取标准化、并不采取正则化时，模型所能达到的效果最佳。
#### 对特征采取标准化带来了1.5%的准确度提升，3.6%的 auc 提升，1.4%的 precision 提升，7.7%的 recall 提升，5.4% 的 f1 提升。

# >>> Part 4：参数分析

In [131]:
data.columns.shape

(100,)

In [134]:
print('最优参数为：')
list(zip(['截距']+list(X_train.columns), [round(x,4) for x in opt6.x]))

最优参数为：


[('截距', -1.4063),
 ('age', 0.3262),
 ('fnlwgt', 0.0611),
 ('education_num', 0.7732),
 ('capital_gain', 2.4954),
 ('capital_loss', 0.2587),
 ('hours_per_week', 0.4126),
 (' Federal-gov', 0.3768),
 (' Local-gov', -0.1417),
 (' Never-worked', -0.0021),
 (' Private', 0.0869),
 (' Self-emp-inc', 0.1117),
 (' Self-emp-not-inc', -0.6348),
 (' State-gov', -0.333),
 (' Without-pay', -0.0141),
 (' 11th', -0.2368),
 (' 12th', -0.054),
 (' 1st-4th', -0.0138),
 (' 5th-6th', -0.0346),
 (' 7th-8th', -0.0702),
 (' 9th', -0.0795),
 (' Assoc-acdm', -0.2801),
 (' Assoc-voc', -0.2268),
 (' Bachelors', -0.1961),
 (' Doctorate', 0.0406),
 (' HS-grad', -0.1382),
 (' Masters', -0.0002),
 (' Preschool', -0.0122),
 (' Prof-school', 0.0568),
 (' Some-college', -0.0397),
 (' Married-AF-spouse', 0.0323),
 (' Married-civ-spouse', 1.1744),
 (' Married-spouse-absent', -0.1094),
 (' Never-married', -1.0201),
 (' Separated', -0.2026),
 (' Widowed', -0.2457),
 (' Adm-clerical', -0.1543),
 (' Armed-Forces', -0.0059),
 ('

In [135]:
param_df = pd.DataFrame(columns=['变量','系数'])
param_df['变量'], param_df['系数'] = ['截距']+list(X_train.columns),[round(x,4) for x in opt6.x]
param_df.sort_values(by = '系数')

Unnamed: 0,变量,系数
0,截距,-1.4063
33,Never-married,-1.0201
52,Own-child,-0.9985
53,Unmarried,-0.9259
43,Other-service,-0.8342
...,...,...
54,Wife,0.6645
3,education_num,0.7732
39,Exec-managerial,0.7930
31,Married-civ-spouse,1.1744


* **对年收入具有最大负面影响的前四个因素，降序排列分别为：从未结婚、拥有一个孩子、（目前）未婚、职业为其他。**
* **对年收入具有最大正面影响的前四个因素，降序排列分别为：资本收入、已婚平民配偶、职业为执行主管、教育年限。**

# >>> Part 5：对 test.csv 预测

In [146]:
test_csv = pd.read_csv(r'.\dataset\test.csv')
# native_country 缺失值填充
test_csv['native_country'].loc[test_csv['native_country'] == ' ?'] = ' United-States' 

In [147]:
for i, rv in enumerate(con_rv):
    test_csv[rv] = (test_csv[rv] - miu_list[i]) / sigma_list[i]
test_csv.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,-0.99569,Private,0.350769,11th,-1.19744,Never-married,Machine-op-inspct,Own-child,Black,Male,-0.145918,-0.216656,-0.035429,United-States
1,-0.042641,Private,-0.947081,HS-grad,-0.420053,Married-civ-spouse,Farming-fishing,Husband,White,Male,-0.145918,-0.216656,0.774456,United-States
2,-0.775756,Local-gov,1.394341,Assoc-acdm,0.746028,Married-civ-spouse,Protective-serv,Husband,White,Male,-0.145918,-0.216656,-0.035429,United-States
3,0.397227,Private,-0.279066,Some-college,-0.03136,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,0.89507,-0.216656,-0.035429,United-States
4,-1.508871,?,-0.817446,Some-college,-0.03136,Never-married,?,Own-child,White,Female,-0.145918,-0.216656,-0.845314,United-States


In [149]:
dis_rv.remove('income')
for rv in dis_rv:
    this_dummy = pd.get_dummies(test_csv[rv]).iloc[:,1:] # 产生哑变量列；防止共线性，去掉一列
    test_csv = pd.concat([test_csv,this_dummy], axis=1) # 与原数据合并
    test_csv.drop(rv, axis=1, inplace=True) # 去掉原来的变量列
print(test_csv.shape)

(16281, 98)


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,Federal-gov,Local-gov,Never-worked,Private,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,-0.995690,0.350769,-1.197440,-0.145918,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,-0.042641,-0.947081,-0.420053,-0.145918,-0.216656,0.774456,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,-0.775756,1.394341,0.746028,-0.145918,-0.216656,-0.035429,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0.397227,-0.279066,-0.031360,0.895070,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
4,-1.508871,-0.817446,-0.031360,-0.145918,-0.216656,-0.845314,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.030670,0.242924,1.134721,-0.145918,-0.216656,-0.359383,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
16277,1.863457,1.247036,-0.420053,-0.145918,-0.216656,-0.035429,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16278,-0.042641,1.754663,1.134721,-0.145918,-0.216656,0.774456,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
16279,0.397227,-1.003196,1.134721,0.592712,-0.216656,-0.035429,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [154]:
param_df = pd.DataFrame(columns=['变量','系数'])
param_df['变量'], param_df['系数'] = ['截距']+list(X_train.columns),[x for x in opt6.x]
param_df

Unnamed: 0,变量,系数
0,截距,-1.406275
1,age,0.326166
2,fnlwgt,0.061086
3,education_num,0.773175
4,capital_gain,2.495442
...,...,...
95,Thailand,-0.013850
96,Trinadad&Tobago,-0.020943
97,United-States,-0.163226
98,Vietnam,-0.060371


In [159]:
test_params = []
for rv in test_csv.columns:
    test_params.append(float(param_df[param_df['变量'] == rv ]['系数']))
test_params = [float(param_df[param_df['变量'] == '截距' ]['系数'])] + test_params

In [166]:
final_pred = cal_yhat(np.asarray(test_params), test_csv)
final_pred = pred>0.5

id_ = range(1,test_csv.shape[0]+1)

submission = pd.DataFrame(columns=['id','label'])
submission.id, submission.label = id_, final_pred
submission.label = submission.label.map({True:' >50K', False:' <=50K'})
submission.to_csv('./submission_LR.csv',index=False)
submission

Unnamed: 0,id,label
0,1,<=50K
1,2,<=50K
2,3,<=50K
3,4,>50K
4,5,<=50K
...,...,...
16276,16277,<=50K
16277,16278,<=50K
16278,16279,>50K
16279,16280,<=50K
