In [1]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler #欠采样
from imblearn.over_sampling import RandomOverSampler #过采样
from imblearn.over_sampling import SMOTE

#ROC_AUC
from sklearn.metrics import roc_curve,auc

# 数据集的划分
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier#KNN
from sklearn.linear_model import LogisticRegression#逻辑回归
from sklearn.ensemble import RandomForestClassifier##随机森林
from sklearn.ensemble import AdaBoostClassifier#ADboost
from sklearn.ensemble import GradientBoostingClassifier##GBDT

#交叉认证
from sklearn.model_selection import GridSearchCV

#评价指标
from sklearn.metrics import accuracy_score,precision_score, \
recall_score,f1_score,cohen_kappa_score

#警告去除
import warnings
warnings.filterwarnings('ignore')

In [2]:
# over：过采样；smote：SMOTE采样（过采样的一种）；under：欠采样；
def ods(data, method,random):
    data1 = data.drop([ 'FLAG'], axis=1) 
    label = data['FLAG']
    if method == 'over':
        #ros = RandomOverSampler(random_state=2021)
        ros = RandomOverSampler()
        x_train, y_label = ros.fit_resample(data1 , label)
    elif method == 'under':
        rus = RandomUnderSampler(random_state=random)
        x_train, y_label = rus.fit_resample(data1 , label)
    elif method == 'smote':
        smote = SMOTE()
        x_train, y_label = smote.fit_resample(data1 , label)
    else:
        print("输入格式错误！！！")
    
    return x_train, y_label

In [3]:
## 输入：train_X是训练集、trian_y是训练集标签、test_x是验证集、test_y是验证集标签、test是测试集、
## 输出：训练集的n折交叉平均分、预测验证集的类别、预测验证集的分数、预测测试集的概率、

def tree_train(train_x,train_y,test_x,test_y,test,models):
    #这里可以添加参数进行调参
    parameters ={
#         'learning_rate':[0.1,0.01],
#         'n_estimators':[3,6,9],
#         'max_depth':[3,4,5,6,7]
#         max_depth=7
#         min_samples_leaf =60, 
#         min_samples_split =1200, 
#         max_features='sqrt', 
#         'n_estimators': range(1,20,1)
    }
    #模型的实例化
    model = models
    #模型+参数+指标+n交叉次数
    clf_rf = GridSearchCV(model,parameters,scoring='roc_auc',cv=3)
    #模型的训练
    clf_rf.fit(train_x,train_y)
    #n折交叉验证的平均成绩
    grade1 = clf_rf.cv_results_['mean_test_score']
    
    #预测验证集
    predict_type1 = clf_rf.predict(test_x)
    predict_type2 = clf_rf.predict_proba(test_x)
    fpr_X,tpr_Y,threshold = roc_curve(test_y, predict_type1)
    grade2 = auc(fpr_X,tpr_Y)
    
    #预测测试集
    predict_probable = clf_rf.predict_proba(test)
    
    #返回训练集的n折交叉平均分、预测验证集的类别、预测验证集的概率、预测验证集的分数、预测测试集的概率、
    return  grade1, predict_type1, predict_type2, grade2, predict_probable

In [4]:
def train_function(train_x,train_y,test_x,test_y,test,rate,models,random_list):
    ##概率矩阵
    predict_test_x = np.array([[0,0] for i in range(len(test_x))])
    predict_test = np.array([[0,0] for i in range(len(test))])

    data = pd.concat([train_x,train_y],axis=1)

    #训练集的n折交叉平均分
    score1 = []
    #预测验证集的分数
    score2 = []
    
    n = len(random_list)
    for i in range(n):
        #这里是欠采样，X是训练集、Y是训练集标签
        X,Y = ods(data,'under',random_list[i])

        #训练集的n折交叉平均分、预测验证集的类别、预测验证集的概率、预测验证集的分数、预测测试集的概率、
        grade1, predict_type1, predict_type2, grade2, predict_probable = tree_train(X,Y,test_x,test_y,test,models)
#         grade1, predict_type1, predict_type2, grade2, predict_probable = tree_train(train_x,train_y,test_x,test_y,test)

        score1.append(grade1)
        score2.append(grade2)

        #可以不选、选grade1、选grade2(理论上)
        predict_test_x = predict_test_x + np.array(predict_type2)*grade2
        predict_test = predict_test + np.array(predict_probable)*grade2

    #将验证集概率矩阵变为类别
    result1 = []
    predict_test_x = predict_test_x/n
    for i in predict_test_x:
        if i[0] > (i[0] + i[1])*rate:
            result1.append(0)
        else:
            result1.append(1)
    fpr_X,tpr_Y,threshold = roc_curve(test_y, result1)
    print('验证集数据的AUC分数：',auc(fpr_X,tpr_Y))

    #测试集概率变类别
    result2 = []
    predict_test = predict_test/n
    for i in predict_probable:
        if i[0] > (i[0] + i[1])*rate:
            result2.append(0)
        else:
            result2.append(1)
    #返回基模型的交叉认证分数、验证集分数、测试集的测试结果
    return score1, score2, result2

In [5]:
##获取数据
train1 = pd.read_csv('./new_data/随机森林/1_data_制造业_13131_56.csv').drop(['industry','REPORT_TYPE','ACCOUTING_STANDARDS','CURRENCY_CD'],axis=1)
train2 = train1[train1['FLAG']!=-1]
train = train2.drop(['FLAG'],axis=1)
label = pd.DataFrame(train2['FLAG'])

# #数据标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train = pd.DataFrame(scaler.fit_transform(train))

#划分训练集和验证集
train_x,test_x,train_y,test_y = train_test_split(train,label,test_size=0.2,random_state = 2021)

In [6]:
knn = KNeighborsClassifier()
lr = LogisticRegression()#0.60
rfc = RandomForestClassifier()#0.73
adboost = AdaBoostClassifier()
gdbt = GradientBoostingClassifier()#0.73

In [8]:
# #p0.74:
# a = [1857,1793,1631,1508,1046,1242,1271,1338,1342,1443]
#p0.75:
b = [2021,1999,1990,1969,1873,1838,1604,1543,1476]
#0.76
c = [1010]
#0.77
d = [1660,1254]
#0.78
e = [1092,1444]
#0.79
f = [1762]

random_list = b+c+d+e+f
a,b,c = train_function(train_x,train_y,test_x,test_y,train1[train1['FLAG']==-1].drop(['FLAG'],axis=1),0.43,gdbt,random_list)

验证集数据的AUC分数： 0.7665926788443345


In [13]:
pd.DataFrame(c).to_csv('./part2/1_GBDT(制造业).csv')

In [14]:
pd.DataFrame(c)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
2495,0
2496,0
2497,0
2498,0
