# HR表的特征预处理  
字段说明:  
left：是否离职  
satisfaction_level：满意度  
last_evaluation：绩效评估  
number_project：完成项目数  
average_monthly_hours：平均每月工作时间  
time_spend_company：为公司服务的年限  
work_accident：是否有工作事故  
promotion：过去5 年是否有升职  
salary：薪资水平  

- 数据清洗  
 数据样本抽样  
 异常值处理  
- 特征预处理  
 特征选择  
 特征变换  
 特征降维  
 特征衍生  

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

#sl:satisfaction_level----False:MinMaxScaler;True:StandardScaler
#le:last_evaluation----False:MinMaxScaler;True:StandardScaler
#npr:number_project----False:MinMaxScaler;True:StandardScaler
#amh:average_monthly_hours----False:MinMaxScaler;True:StandardScaler
#tsc:time_spend_company----False:MinMaxScaler;True:StandardScaler
#wa:Work_accident----False:MinMaxScaler;True:StandardScaler
#pl5:promotion_last_5years----False:MinMaxScaler;True:StandardScaler

#dp:department----False:LabelEncoder + MinMaxScaler;True:one-hot encoding
#slr:salary----False:ordinal mapping (low/medium/high) + MinMaxScaler;True:one-hot encoding


def hr_preprocessing(sl=False,le=False,npr=False,amh=False,
                     tsc=False,wa=False,pl5=False,slr=False,dp=False,lower_d=False,ld_n=1):
    """Load ``./data/HR.csv``, clean it and scale/encode every feature column.

    Args:
        sl, le, npr, amh, tsc, wa, pl5: scaling switches for, in order,
            satisfaction_level, last_evaluation, number_project,
            average_monthly_hours, time_spend_company, Work_accident and
            promotion_last_5years. False -> MinMaxScaler,
            True -> StandardScaler.
        slr, dp: encoding switches for salary and department. False -> map
            to integer codes (salary through ``map_salary``, department
            through ``LabelEncoder``) and then MinMax-scale; True -> one-hot
            encode with ``pd.get_dummies``.
        lower_d: when truthy, project the processed features with PCA.
        ld_n: number of PCA components used when ``lower_d`` is truthy.

    Returns:
        ``(df, label)`` where ``df`` is the processed feature DataFrame and
        ``label`` is the ``left`` column. When ``lower_d`` is truthy only the
        PCA-transformed array is returned.
    """
    df = pd.read_csv('./data/HR.csv')

    # 1. Data cleaning.
    # dropna returns a new frame; the result must be assigned or the call is
    # a no-op.
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Combine both conditions into one mask built on the same frame instead
    # of chaining two boolean selections.
    df = df[(df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')]

    # 2. Separate the label from the features.
    label = df['left']
    df = df.drop('left', axis=1)

    # 3. Feature selection: there are few features, so all are kept.

    # 4. Feature transformation of the numeric columns.
    numeric_flags = [sl, le, npr, amh, tsc, wa, pl5]
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project',
                       'average_monthly_hours', 'time_spend_company',
                       'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip(numeric_flags, numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # Scalers expect a 2-D (n_samples, 1) input; flatten the result back
        # to one value per row.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    # Encoding of the categorical columns.
    for one_hot, column in zip([slr, dp], ['salary', 'department']):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # Salary has a natural order, so it gets an explicit mapping.
            df[column] = [map_salary(s) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # NOTE(review): this branch returns the reduced features only, without
        # `label`, so callers that unpack two values cannot use lower_d=True.
        # Kept as-is to preserve the existing return shape.
        return PCA(n_components=ld_n).fit_transform(df.values)

    return df, label
# Ordinal codes for the salary levels.
d = {'low': 0, 'medium': 1, 'high': 2}


def map_salary(s):
    """Return the integer code of salary level *s*; unknown levels map to 0."""
    if s in d:
        return d[s]
    return 0

# Demo run: StandardScaler for satisfaction_level and last_evaluation,
# one-hot encoding for department, no dimensionality reduction.
hr_preprocessing(
    sl=True,
    le=True,
    dp=True,
    lower_d=False,
    ld_n=3,
)



(       satisfaction_level  last_evaluation  number_project  \
 0               -0.936495        -1.087275             0.0   
 1                0.752814         0.840707             0.6   
 2               -2.022479         0.957554             1.0   
 3                0.431041         0.899131             0.6   
 4               -0.976716        -1.145699             0.0   
 5               -0.815830        -1.262546             0.0   
 6               -2.062701         0.314894             0.8   
 7                1.235474         0.782283             0.6   
 8                1.114809         1.658639             0.6   
 9               -0.775608        -1.087275             0.0   
 10              -0.654943        -1.028852             0.0   
 11              -2.022479         0.548588             0.8   
 12               0.913701         1.191249             0.4   
 13              -0.815830        -0.970428             0.0   
 14              -1.016938        -0.912004            

# 机器学习与建模

学习: 通过接收到的数据,归纳提取相同与不同  
机器学习: 计算机以数据为基础,进行归纳总结  
模型: 数据解释现象的系统  (实际上是对于数据的函数)  
机器学习相当于模型的实体  

监督学习 : 分类 ,回归    
非监督学习 : 聚类 , 关联  
半监督学习  

## 切分数据集
训练集 : 用来训练与拟合模型  
验证集 : 使用验证集数据纠偏或比较预测  
测试集 : 模型泛化能力的考量  

泛化:对未知数据的预测能力 (过拟合,欠拟合)  
交叉验证 , 1:4
K-fold 交叉验证: 轮流做一遍测试集,其他做训练集





In [2]:
def hr_modeling(features,label):
    """Fit a pool of classifiers and print ACC / REC / F1 on each data partition.

    The data is split in two steps: 20 % is held out as the validation set,
    then 25 % of the remainder is held out as the test set. Every model is
    fitted on the training part and scored on the train (0), validation (1)
    and test (2) partitions, in that order.

    Args:
        features: DataFrame of processed features.
        label: Series with the target values.
    """
    from sklearn.model_selection import train_test_split  # dataset splitting
    from sklearn.metrics import accuracy_score,recall_score,f1_score  # accuracy, recall, F1 score
    from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier  # KNN
    from sklearn.naive_bayes import GaussianNB,BernoulliNB  # naive Bayes: Bernoulli for binary features, Gaussian for Gaussian-distributed ones
    from sklearn.tree import DecisionTreeClassifier  # decision tree
    from sklearn.svm import SVC  # support vector machine
    from sklearn.ensemble import RandomForestClassifier  # random forest
    from sklearn.ensemble import AdaBoostClassifier  # AdaBoost
    from sklearn.linear_model import LogisticRegression  # logistic regression

    # Only needed for the optional decision-tree export:
    #     from sklearn.tree import export_graphviz
    #     from sklearn.externals.six import StringIO

    feature_matrix = features.values
    f_names = features.columns.values  # feature names, used by the optional tree export
    targets = label.values

    X_rest, X_validation, Y_rest, Y_validation = train_test_split(
        feature_matrix, targets, test_size=0.2)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_rest, Y_rest, test_size=0.25)
    # Sizes are printed in the order: train, validation, test.
    print(len(X_train), len(X_validation), len(X_test))

    # Model pool. Other candidates that were tried and are currently disabled:
    #     ('GaussianNB', GaussianNB())
    #     ('BernoulliNB', BernoulliNB())
    #     ('DecisionTreeGini', DecisionTreeClassifier())
    #     ('DecisionTreeEntropy', DecisionTreeClassifier(criterion='entropy'))  # information gain
    #     ('SVM Classifier', SVC(1000))
    #     ("OriginalRandomForest", RandomForestClassifier())
    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=3)),
        ("RandomForest", RandomForestClassifier(n_estimators=10, max_features=None)),
        ("Adaboost", AdaBoostClassifier(n_estimators=100)),
        ("LogisticRegression",
         LogisticRegression(C=1000, tol=1e-10, solver="sag", max_iter=10000)),
    ]

    partitions = (
        (X_train, Y_train),
        (X_validation, Y_validation),
        (X_test, Y_test),
    )
    metrics = (
        ('-ACC', accuracy_score),
        ('-REC', recall_score),
        ('-F1', f1_score),
    )

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)
        for part_idx, (X_part, Y_part) in enumerate(partitions):
            Y_pred = clf.predict(X_part)
            print(part_idx)
            for tag, metric in metrics:
                print(clf_name, tag, metric(Y_part, Y_pred))
            
            #决策树
            # dot_data=StringIO()
            # export_graphviz(clf,out_file=dot_data,
            #                          feature_names=f_names,
            #                          class_names=["NL","L"],
            #                          filled=True,
            #                          rounded=True,
            #                          special_characters=True)
        #输出,feature_names特征名称,class_names标注名称,可视化(filled填充,rounded,sp)
            # graph=pydotplus.graph_from_dot_data(dot_data.getvalue())
        #
            # graph.write_pdf("dt_tree_2.pdf")
            
    
#     knn_clf.fit(X_train,Y_train)  #拟合
#     Y_pred = knn_clf.predict(X_validation)  #预测

#     print('validation')
#     print("-ACC:",accuracy_score(Y_validation,Y_pred))
#     print("-REC:",recall_score(Y_validation,Y_pred))
#     print("-F1:",f1_score(Y_validation,Y_pred))

# #     knn_clf_n5 = KNeighborsClassifier(n_neighbors=5)
# #     knn_clf_n5.fit(X_train,Y_train)  #拟合
# #     Y_pred = knn_clf_n5.predict(X_validation)  

# #     print("-ACC_5:",accuracy_score(Y_validation,Y_pred))
# #     print("-REC_5:",recall_score(Y_validation,Y_pred))
# #     print("-F1_5:",f1_score(Y_validation,Y_pred))
    
#     Y_pred = knn_clf.predict(X_test)   
#     print('test')
#     print("-ACC:",accuracy_score(Y_test,Y_pred))
#     print("-REC:",recall_score(Y_test,Y_pred))
#     print("-F1:",f1_score(Y_test,Y_pred))
    
#     Y_pred = knn_clf.predict(X_train)  
#     print('train')
#     print("-ACC:",accuracy_score(Y_train,Y_pred))
#     print("-REC:",recall_score(Y_train,Y_pred))
#     print("-F1:",f1_score(Y_train,Y_pred))
    
#     #略微存过拟合,可以忽略
    
#     #保存knn模型
#     from sklearn.externals import joblib
#     joblib.dump(knn_clf,'knn_clf')
#     knn_clf = joblib.load('knn_clf')
#     Y_pred = knn_clf.predict(X_test)   
#     print('test2')
#     print("-ACC:",accuracy_score(Y_test,Y_pred))
#     print("-REC:",recall_score(Y_test,Y_pred))
#     print("-F1:",f1_score(Y_test,Y_pred))


def regr_test(feateres,label):
    """Fit a Lasso regression and print its coefficients and in-sample errors.

    The model is fitted and evaluated on the same data, so the printed
    MSE / MAE / R2 are training-set figures.

    Args:
        feateres: DataFrame of input features (the misspelled name is kept so
            existing keyword callers continue to work).
        label: Series with the regression target.
    """
    from sklearn.linear_model import LinearRegression,Ridge,Lasso
    from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

    # The body used to read `features`, which is not the parameter name and
    # therefore resolved to a global (or raised NameError). Bind it explicitly.
    features = feateres

    print("X",features)
    print("Y",label)

    # Alternatives that were tried: LinearRegression(), Ridge(alpha=1).
    regr = Lasso(alpha=0.01)
    regr.fit(features.values,label.values)
    Y_pred = regr.predict(features.values)

    print("Coef:",regr.coef_)
    print("MSE:",mean_squared_error(label.values,Y_pred))
    print("MAE:",mean_absolute_error(label.values,Y_pred))
    print("R2:",r2_score(label.values,Y_pred))

def main():
    """Preprocess the HR data with default options and evaluate the model pool."""
    hr_features, hr_label = hr_preprocessing()
    # Alternative experiment: regress last_evaluation on two numeric features.
    # regr_test(hr_features[["number_project","average_monthly_hours"]],hr_features["last_evaluation"])
    hr_modeling(hr_features, hr_label)


if __name__ == "__main__":
    main()


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


8999 3000 3000
0
KNN -ACC 0.9762195799533282
KNN -REC 0.9577197149643706
KNN -F1 0.9495996231747528
1
KNN -ACC 0.946
KNN -REC 0.901685393258427
KNN -F1 0.8879668049792532
2
KNN -ACC 0.9523333333333334
KNN -REC 0.916445623342175
KNN -F1 0.9062295081967213
0
RandomForest -ACC 0.9971107900877876
RandomForest -REC 0.9876484560570071
RandomForest -F1 0.9937858508604207
1
RandomForest -ACC 0.982
RandomForest -REC 0.9480337078651685
RandomForest -F1 0.9615384615384617
2
RandomForest -ACC 0.9843333333333333
RandomForest -REC 0.9575596816976127
RandomForest -F1 0.9684775318578136
0
Adaboost -ACC 0.9641071230136682
Adaboost -REC 0.9163895486935867
Adaboost -F1 0.9227457546041615
1
Adaboost -ACC 0.9533333333333334
Adaboost -REC 0.8932584269662921
Adaboost -F1 0.9008498583569405
2
Adaboost -ACC 0.9573333333333334
Adaboost -REC 0.8978779840848806
Adaboost -F1 0.913630229419703
0
LogisticRegression -ACC 0.8005333925991777
LogisticRegression -REC 0.3771971496437055
LogisticRegression -F1 0.4694058527

## 一.分类  

 欧式距离  
 曼哈顿距离
 闵可夫斯基距离

 KD-Tree: 树形结构快速寻找最近点   

KNN   (k个最近的邻居)
## 朴素贝叶斯模型

概率  
条件概率  
联合概率  


生成模型 (联合概率分布)   
贝叶斯  
判别模型    
knn


## 决策树模型

信息增益 - ID3  
熵  
选择信息增益最大的做第一次切分  

信息增益率 - C4.5  

Gini系数-CART  不纯度 

连续值切分,计算每个间隔  
规则用尽  投票
过拟合  剪枝  前剪枝, 后剪枝   


## 支持向量机  

线性核函数  
多项式核函数  
高斯径向基核函数  

问题: 少部分异常  松弛变量  减少过拟合   
样本不平衡  要看场景  进行加权  
多分类问题  建立多个svm模型,或者分类间两两进行svm分类

                       



## 集成学习  
n^p 弱可学习 效果不明显  
m^n 强可学习 效果明显  
- 袋装法  bagging  

随机森林  根据每棵树投票,均值确定
树的个数,太多易过拟合  
树的特征数, 少于50个特征取全部  特征太多时,随机选取特征  
树的训练集,体现树的差异性,  
优点: 每个决策树不用全部特征
    不需要剪枝,有效避免过拟合  

- 提升法  
Adaboost  惩罚系数进行特征权重分配
默认的弱分类器为决策树




## 二.回归

回归分析: 确定多个变量间相互依赖的定量关系的一种统计分析方法  
梯度下降法: 步长选取和梯度大小有关,  
凸函数,极小值就是最小值  
正则化  有系统误差,有多极值点,  倾向于选择权值更小的  
岭回归  L2正则化  
Lasso回归  L1正则化  


逻辑回归,当做分类器使用  
值域[0,1]






## 三.回归分类
### 人工神经网络  

感知器   


   


## 四.聚类






