# 总体思路:训练多个模型,然后对它们的预测结果取平均值(*╹▽╹*)
# 查看输入文件

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# 导入包

In [None]:
import numpy as np
import pandas as pd
import torch
import sklearn
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import torch.utils.data as data  # 制作dataset
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression


# 切割数据集
注:对特征做归一化后效果反而会变差,因此这里不做归一化

In [None]:
def Gen_Dataset_K(ToTensor=True, K=10,
                  train_path="/kaggle/input/ml2021spring-hw1/covid.train.csv",
                  test_path="/kaggle/input/ml2021spring-hw1/covid.test.csv"):
    """Load the train/test CSVs and keep the K features scoring highest under f_regression.

    The first column of both CSVs is treated as the row id and the last column
    of the training CSV as the regression target.

    Args:
        ToTensor: If True, return ``torch.FloatTensor`` objects (for PyTorch);
            otherwise return NumPy arrays (for scikit-learn style models).
        K: Number of top-scoring features to keep.
        train_path: Path of the training CSV.
        test_path: Path of the test CSV.

    Returns:
        A tuple ``(train_x, train_y, test_x, output_idx)`` where ``output_idx``
        is the first column of the test CSV (used as the id of the submission).
    """
    training_set = pd.read_csv(train_path)
    test_set = pd.read_csv(test_path)
    output_idx = test_set.iloc[:, 0]      # id column, reused when writing the submission csv

    # Strip the id column (and the target for the training set) BEFORE scoring,
    # so that the positions returned by the scorer refer to these frames.
    x = training_set.iloc[:, 1:-1]
    y = training_set.iloc[:, -1]
    # assumes the test csv is "id + the same feature columns in the same order" — TODO confirm
    test_features = test_set.iloc[:, 1:]

    selector = SelectKBest(f_regression, k=K).fit(x, y)
    feature_scores = pd.DataFrame({'Specs': list(x.columns), 'Score': selector.scores_})
    # Positions (within ``x``) of the K highest-scoring features
    features = list(feature_scores.nlargest(K, 'Score').index)

    # The positions are relative to ``x``; indexing the raw frames (which still
    # contain the id column) would select every feature one column too far left.
    train_x = x.iloc[:, features].to_numpy()
    train_y = y.to_numpy()
    test_x = test_features.iloc[:, features].to_numpy()

    if ToTensor:  # df -> numpy -> tensor, for PyTorch
        return torch.FloatTensor(train_x), torch.FloatTensor(train_y), torch.FloatTensor(test_x), output_idx
    # plain NumPy arrays, for scikit-learn style estimators
    return train_x, train_y, test_x, output_idx

class MyDataset(data.Dataset):
    """Map-style dataset that pairs each sample of ``x`` with the matching entry of ``y``.

    Used to feed a ``DataLoader`` that produces mini-batches.
    """

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The sample count is taken from the feature container
        return len(self.x)

    def __getitem__(self, index):
        # Returns the (sample, label) pair stored at ``index``
        return self.x[index], self.y[index]

# 不同模型训练(sklearn & Pytorch)

In [None]:
# 自定义RMSE
class RMSELoss(nn.Module):
    """Root-mean-square-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super(RMSELoss, self).__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return sqrt(MSE(yhat, y)) as a scalar tensor."""
        mean_squared_error = self.mse(yhat, y)
        return mean_squared_error.sqrt()

def xgb():
    """Fit an XGBoost regressor on the top-20 features and export its test predictions.

    Prints the RMSE on the training set, writes ``sub_XGB.csv`` and returns
    the fitted ``XGBRegressor``.
    """
    features, targets, test_features, output_idx = Gen_Dataset_K(ToTensor=False, K=20)

    model_xgb = XGBRegressor(
        n_estimators=3000,
        learning_rate=0.003,
        max_depth=3,
        random_state=14138,
    )
    model_xgb.fit(features, targets)

    # RMSE measured on the data the model was fitted on
    fitted_values = torch.FloatTensor(model_xgb.predict(features))
    train_rmse = RMSELoss()(fitted_values, torch.FloatTensor(targets))
    print('XGB训练集上的RMSE: ', train_rmse)

    submission = pd.DataFrame({
        'id': output_idx,
        'tested_positive': model_xgb.predict(test_features),
    })
    submission.to_csv('sub_XGB.csv', index=False)
    print('XGB预测结果输出完毕')
    return model_xgb

def gb():
    """Fit a scikit-learn gradient-boosting regressor on the top-20 features.

    Prints the RMSE on the training set, writes ``sub_GB.csv`` and returns
    the fitted ``GradientBoostingRegressor``.
    """
    features, targets, test_features, output_idx = Gen_Dataset_K(ToTensor=False, K=20)

    hyper_params = dict(n_estimators=3000, learning_rate=0.003, max_depth=3, random_state=14138)
    model = GradientBoostingRegressor(**hyper_params)
    model.fit(features, targets)

    # RMSE measured on the data the model was fitted on
    rmse = RMSELoss()
    train_rmse = rmse(torch.FloatTensor(model.predict(features)), torch.FloatTensor(targets))
    print('GB训练集上的RMSE: ', train_rmse)

    submission = pd.DataFrame({'id': output_idx, 'tested_positive': model.predict(test_features)})
    submission.to_csv('sub_GB.csv', index=False)
    print('GB预测结果输出完毕')
    return model

def lasso():
    """Fit a Lasso regression on the top-20 features and export its test predictions.

    Prints the RMSE on the training set, writes ``sub_Lasso.csv`` and returns
    the fitted ``Lasso`` model.
    """
    features, targets, test_features, output_idx = Gen_Dataset_K(ToTensor=False, K=20)

    model = Lasso(alpha=0.0005, random_state=14138).fit(features, targets)

    # RMSE measured on the data the model was fitted on
    fitted_values = model.predict(features)
    train_rmse = RMSELoss()(torch.FloatTensor(fitted_values), torch.FloatTensor(targets))
    print('Lasso训练集上的RMSE: ', train_rmse)

    test_predictions = model.predict(test_features)
    submission = pd.DataFrame({'id': output_idx, 'tested_positive': test_predictions})
    submission.to_csv('sub_Lasso.csv', index=False)
    print('Lasso预测结果输出完毕')
    return model

def E_Net():
    """Fit an ElasticNet regression on the top-20 features and export its test predictions.

    Prints the RMSE on the training set, writes ``sub_E_Net.csv`` and returns
    the fitted ``ElasticNet`` model.
    """
    features, targets, test_features, output_idx = Gen_Dataset_K(ToTensor=False, K=20)

    # l1_ratio=1 is pure L1 regularization, l1_ratio=0 is pure L2
    model = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=14138)
    model.fit(features, targets)

    # RMSE measured on the data the model was fitted on
    rmse = RMSELoss()
    fitted_values = torch.FloatTensor(model.predict(features))
    print('E_Net训练集上的RMSE: ', rmse(fitted_values, torch.FloatTensor(targets)))

    submission = pd.DataFrame({
        'id': output_idx,
        'tested_positive': model.predict(test_features),
    })
    submission.to_csv('sub_E_Net.csv', index=False)
    print('E_Net预测结果输出完毕')
    return model


def KRR():
    """Fit a linear-kernel kernel ridge regression on the top-20 features.

    Prints the RMSE on the training set, writes ``sub_KRR.csv`` and returns
    the fitted ``KernelRidge`` model.
    """
    features, targets, test_features, output_idx = Gen_Dataset_K(ToTensor=False, K=20)

    model = KernelRidge(kernel='linear')
    model.fit(features, targets)

    # RMSE measured on the data the model was fitted on
    train_rmse = RMSELoss()(
        torch.FloatTensor(model.predict(features)),
        torch.FloatTensor(targets),
    )
    print('KRR训练集上的RMSE: ', train_rmse)

    test_predictions = model.predict(test_features)
    submission = pd.DataFrame({'id': output_idx, 'tested_positive': test_predictions})
    submission.to_csv('sub_KRR.csv', index=False)
    print('KRR预测结果输出完毕')
    return model

def svr():
    """Fit a linear-kernel support vector regressor on the top-20 features.

    Prints the RMSE on the training set, writes ``sub_SVR.csv`` and returns
    the fitted ``SVR`` model.
    """
    features, targets, test_features, output_idx = Gen_Dataset_K(ToTensor=False, K=20)

    model = SVR(kernel="linear").fit(features, targets)

    # RMSE measured on the data the model was fitted on
    rmse = RMSELoss()
    fitted_values = model.predict(features)
    print('SVR训练集上的RMSE: ', rmse(torch.FloatTensor(fitted_values), torch.FloatTensor(targets)))

    submission = pd.DataFrame({'id': output_idx, 'tested_positive': model.predict(test_features)})
    submission.to_csv('sub_SVR.csv', index=False)
    print('SVR预测结果输出完毕')
    return model

In [None]:
class NN(nn.Module):
    """Regressor with one hidden layer: Linear -> ReLU -> Linear(1).

    Args:
        n_features: Size of each input sample.
        n_hidden: Width of the hidden layer.
    """

    def __init__(self, n_features, n_hidden):
        super().__init__()
        layers = [
            nn.Linear(n_features, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per row of ``x`` (the trailing size-1 dim is dropped)."""
        out = self.net(x)
        return out.squeeze(dim=1)

def Nn():
    """Train the one-hidden-layer ``NN`` on the top-20 features and export its test predictions.

    8% of the training rows are held out as a dev set. The network is trained
    for 500 epochs with Adam on the RMSE loss, the train/dev loss curves are
    plotted, and the test predictions are written to ``sub_NN.csv``.

    Returns:
        The trained ``NN`` module, left on the device it was trained on
        (CUDA when available, otherwise CPU).
    """
    # Fall back to the CPU so the function does not crash on machines without a GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train, label, test, output_idx = Gen_Dataset_K(ToTensor=True, K=20)
    [train_x, dev_x, train_y, dev_y] = train_test_split(train, label, test_size=0.08, random_state=14138)

    # Move every tensor to the target device once, instead of once per batch/epoch
    train_x, train_y = train_x.to(device), train_y.to(device)
    dev_x, dev_y = dev_x.to(device), dev_y.to(device)
    test = test.to(device)

    dataset = MyDataset(train_x, train_y)
    train_loader = data.DataLoader(dataset=dataset, batch_size=64, shuffle=True)

    n_features = train_x.shape[1]     # follows the number of selected features
    n_hidden = 64

    model = NN(n_features, n_hidden).to(device)
    epoch = 500
    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=5e-3)
    loss_func = RMSELoss()

    loss_train = []
    loss_dev = []

    for e in range(epoch):
        for b_x, b_y in train_loader:
            output = model(b_x)
            loss = loss_func(output, b_y)       # RMSE on the current mini-batch
            optimizer.zero_grad()               # clear gradients for this training step
            loss.backward()                     # backpropagation, compute gradients
            optimizer.step()                    # apply gradients

        # Evaluation only: no autograd graph is needed for the dev set
        with torch.no_grad():
            dev_loss = loss_func(model(dev_x), dev_y)

        # The recorded training loss is the loss of the last mini-batch of the epoch
        loss_train.append(loss.item())
        loss_dev.append(dev_loss.item())
        if e % 20 == 0:
            print('Epoch: ', e, '| train loss: %.4f' % loss.item(), '| dev loss: %.4f' % dev_loss.item())

    with torch.no_grad():
        pred_nn = model(test).cpu().numpy()     # numpy is cpu-only

    sub = pd.DataFrame()
    sub['id'] = output_idx
    sub['tested_positive'] = pred_nn
    sub.to_csv('sub_NN.csv', index=False)
    print('NN预测结果输出完毕')

    plt.plot(loss_train)
    plt.plot(loss_dev)
    # The second curve is the held-out dev split; the test set has no labels
    plt.legend(['training loss', 'dev loss'])
    plt.show()
    return model
# Nn()

In [None]:
def AverageModel():
    """Train every base model and write the plain average of their test predictions.

    The seven base models (XGBoost, gradient boosting, Lasso, ElasticNet,
    kernel ridge, SVR and the neural network) are trained in that order; each
    one also writes its own submission file. Their test predictions are
    averaged with equal weight and written to ``./sub_AverageModel.csv``.

    Returns:
        A ``LinearRegression`` instance that is never fitted. It takes no part
        in the averaging and is returned only to keep the original return value.
    """
    model = LinearRegression()   # NOTE(review): unused placeholder, kept for the return value

    # Only the test features and ids are needed here; every base-model
    # function loads its own training data.
    _, _, test, output_idx = Gen_Dataset_K(ToTensor=False, K=20)
    test_tensor = torch.FloatTensor(test)

    # Train the sklearn-style models one after another and collect their test predictions
    test_predictions = []
    for train_model in (xgb, gb, lasso, E_Net, KRR, svr):
        test_predictions.append(train_model().predict(test))

    nn_model = Nn()
    # Run inference on whatever device the network was left on
    nn_device = next(nn_model.parameters()).device
    with torch.no_grad():
        test_predictions.append(nn_model(test_tensor.to(nn_device)).cpu().numpy())

    # Equal-weight average over all base models
    pred = sum(test_predictions) / len(test_predictions)

    sub = pd.DataFrame()
    sub['id'] = output_idx
    sub['tested_positive'] = pred
    sub.to_csv('./sub_AverageModel.csv', index=False)
    print('AverageModel预测结果输出完毕')
    return model

In [None]:
# Entry point: train all base models and write the per-model and averaged submission CSVs
AverageModel()