In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the training table. Column 0 is skipped (presumably an id column — confirm)
# and the last column is taken as the target.
train_set = pd.read_csv('train.csv')
train_values = train_set.values
train_X = train_values[:, 1:-1]   # feature columns; originally noted as shape (49502, 93)
train_Y = train_values[:, -1:]    # target, kept as a single-column 2-D array

# Load the test table; only column 0 is skipped, every remaining column is a feature.
test_set = pd.read_csv('test.csv')
test_values = test_set.values
test_X = test_values[:, 1:]

(train_X.shape, test_X.shape)

In [None]:
# Cast both feature matrices to float, then stack the training rows on top of
# the test rows so the following steps can work on all samples at once.
train_X, test_X = (features.astype(float) for features in (train_X, test_X))
X = np.concatenate((train_X, test_X), axis=0)
X.shape

In [None]:
# PCA analysis: eigen-decompose the feature covariance matrix and find how many
# leading components are needed to reach 95% of the total variance.
meanValue = np.mean(X, axis=0)        # per-feature mean (computed column-wise)
covMat = np.cov(X, rowvar=False)      # covariance with columns treated as variables
# covMat is symmetric, so use eigh: it returns real eigenvalues/eigenvectors,
# whereas the general-purpose eig may produce complex values from round-off.
eigVals, eigVects = np.linalg.eigh(covMat)
eigVal_index = np.argsort(eigVals)[::-1]    # indices of eigenvalues, largest first

sum_eigVals = eigVals.sum()
threshold = sum_eigVals * 0.95
tempsum = 0
for i in range(len(eigVals)):
    v = eigVals[eigVal_index[i]]
    tempsum += v
    print(f'第{i+1}个特征值：{v}，占比{v/sum_eigVals*100}%')
    if tempsum >= threshold:
        k = i + 1   # number of components kept by the next cell
        print(f'\n前{i+1}个特征值满足占比 95% 以上！共有{len(eigVals)}个特征值')
        break
else:
    # The threshold was never reached (e.g. floating-point round-off): keep every
    # component so that k is always defined for the following cell.
    k = len(eigVals)

In [None]:
# PCA dimensionality reduction: project every sample onto the k leading eigenvectors.
eigVectsMain = eigVects[:, eigVal_index[:k]]   # basis vectors, one per column
# The eigenvectors come from the covariance matrix (which is computed on centered
# data), so center X with the per-feature mean before projecting.
X_ld = np.matmul(X - X.mean(axis=0), eigVectsMain)
# X was built by stacking train_X above test_X, so the first train_X.shape[0] rows
# are the training samples. Using the row count instead of a literal keeps the
# split correct if the input file changes.
n_train = train_X.shape[0]
train_X_ld = X_ld[:n_train]
test_X_ld = X_ld[n_train:]
nd = X_ld.shape[1]   # number of retained dimensions
nd

In [None]:
# Standardize the PCA-reduced training features.
# The fitted scaler `ss` is kept so later cells can reuse it.
ss = StandardScaler()
train_ld_f64 = np.float64(train_X_ld)
train_X = ss.fit(train_ld_f64).transform(train_ld_f64)

In [None]:
# Randomly draw candidate hidden-layer layouts for the hyperparameter search.
# A dedicated, seeded generator makes the candidate list identical on every
# Restart & Run All (the original used the unseeded global generator).
HL_SEED = 0
rng = np.random.RandomState(HL_SEED)

n = nd // 10 - 1   # exclusive upper bound for j; loop-invariant, so computed once
hl_sizes = []
for trial in range(7):
    item = []
    for j in range(1, n):
        # Each position j contributes a layer with probability 0.6.
        if rng.rand() < 0.6:
            a = 9 + 10 * j
            b = a + 10 * j + 1
            if b > nd - 5:
                # Stop once the sampling range would exceed nd - 5.
                break
            item.append(int(rng.randint(a, b)))   # width drawn from [a, b)
    if item:
        # Store widths in descending order; empty draws are skipped, so the list
        # may end up with fewer than 7 random entries.
        hl_sizes.append(sorted(item, reverse=True))
hl_sizes.append([40, 20])   # always include this fixed layout
hl_sizes

In [None]:
# Drop the candidate layouts at the listed positions.
# The candidate list has a variable length (empty random draws are skipped when it
# is built), so positions beyond its end are ignored instead of raising IndexError
# as `del hl_sizes[idx]` would.
# NOTE: hl_sizes is rebound here, so re-running this cell filters the already
# filtered list again; regenerate the candidates first if you need to re-run.
del_index = [1, 2, 3, 4, 6]
positions_to_drop = set(del_index)
hl_sizes = [sizes for pos, sizes in enumerate(hl_sizes) if pos not in positions_to_drop]
hl_sizes

In [None]:
# Build the base model
clf = MLPClassifier(solver='lbfgs', random_state=1)
# Automatic hyperparameter tuning over alpha and the hidden-layer layouts.
# A manual candidate list can be used instead, e.g.
#   hl_sizes = [[50, 40, 30, 20], [40, 20], [45, 30, 15]]
param_grid = {'alpha': [1, 1e-1, 1e-2, 1e-3, 1e-4], 'hidden_layer_sizes': hl_sizes}
grid_search = GridSearchCV(clf, param_grid, n_jobs=1, verbose=10)
# Search on rows 8000..11999 only. The labels are flattened to 1-D because
# train_Y is a single-column 2-D array, which makes scikit-learn emit a
# DataConversionWarning on every fit.
grid_search.fit(train_X[8000:12000, :], np.ravel(train_Y[8000:12000]))
# NOTE: hl_sizes is rebound from the candidate list to the single best layout,
# so the candidates must be regenerated before this cell is re-run.
best_params = grid_search.best_params_
alpha, hl_sizes = best_params['alpha'], best_params['hidden_layer_sizes']
alpha, hl_sizes

In [None]:
# Rebuild the model with the tuned hyperparameters
clf = MLPClassifier(solver='lbfgs', alpha=alpha, hidden_layer_sizes=hl_sizes, random_state=1)
# Train on the standardized features (train_X), the same representation the grid
# search was tuned on and the one the test data is scaled to before prediction.
# The original fitted on the unstandardized train_X_ld, so training and
# prediction inputs were on different scales.
train_labels = np.ravel(train_Y)   # 1-D labels avoid the column-vector warning
clf.fit(train_X, train_labels)
# Score on the training data itself, so this is an optimistic estimate.
r = clf.score(train_X, train_labels)
print("R值(准确率):", r)

In [None]:
# Predict class probabilities for the test set.
# Standardize with the statistics already learned from the training data
# (transform, not fit_transform): re-fitting the scaler on the test set would
# scale it differently from the training features. The result gets its own name
# so test_X_ld is not overwritten and the cell can be re-run safely.
test_X_std = ss.transform(np.float64(test_X_ld))
test_Y_predict = clf.predict_proba(test_X_std)
test_Y_predict.shape

In [None]:
# Write the predictions into the submission table.
submission = pd.read_csv('sampleSubmission.csv')
pred_cols = list(submission.columns[1:])   # every column except the first one
assert test_Y_predict.shape == (len(submission), len(pred_cols)), (
    "prediction matrix does not match the submission template layout"
)
# Replace the columns as a whole instead of assigning through .iloc: .iloc writes
# into the existing columns in place, and storing float probabilities in
# integer-typed columns that way is deprecated in recent pandas.
submission[pred_cols] = test_Y_predict
# NOTE: this overwrites the template file that was read above.
submission.to_csv('sampleSubmission.csv', sep=',', header=True, index=False)
submission.head()