In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold

import gc

# Global random seed; it is the default value of cv_model's `seed` argument.
seed=42
# Directory holding the competition CSV files (train.csv, test.csv, sample_submission.csv).
root_path='/kaggle/input/playground-series-s4e4/'

In [2]:
def Pipe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every object-dtype column cast to ``category``.

    The input frame is not modified, so the function is safe to use both
    inside ``DataFrame.pipe`` chains and on frames the caller keeps using.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose object-dtype columns should become categorical.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns and order; object columns are
        ``category`` dtype, all other columns keep their dtype.
    """
    cols = df.select_dtypes('object').columns.tolist()
    # astype with a column -> dtype mapping builds a new frame instead of
    # writing into the caller's one (an empty mapping simply yields a copy).
    return df.astype({col: 'category' for col in cols})

In [3]:
# Read both competition files, cast object columns to category via Pipe,
# and drop the `id` column straight away.
train = pd.read_csv(root_path + 'train.csv').pipe(Pipe).drop(columns=['id'])
test = pd.read_csv(root_path + 'test.csv').pipe(Pipe).drop(columns=['id'])

# Positional rename to underscore-style names; the test frame carries the
# same columns as train minus the trailing target `Rings`.
train.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
    'Rings',
]
test.columns = [
    'Sex',
    'Length',
    'Diameter',
    'Height',
    'Whole_weight',
    'Shucked_weight',
    'Viscera_weight',
    'Shell_weight',
]

In [4]:
# Stack train and test, tagged with an `is_train` flag, so the ratio features
# are computed once for both frames.
df = pd.concat([train.assign(is_train=1), test.assign(is_train=0)], axis=0)

# Weight-ratio features derived from the raw weight measurements.
df = df.assign(
    Meat_Yield=lambda d: d['Shucked_weight'] / (d['Whole_weight'] + d['Shell_weight']),
    Shell_Ratio=lambda d: d['Shell_weight'] / d['Whole_weight'],
    Weight_to_Shucked_Weight=lambda d: d['Whole_weight'] / d['Shucked_weight'],
    Viscera_Ratio=lambda d: d['Viscera_weight'] / d['Whole_weight'],
)

# Split back into the two frames and discard the helper flag.
train = df.loc[df['is_train'].eq(1)].drop(columns='is_train')
test = df.loc[df['is_train'].eq(0)].drop(columns='is_train')

train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Rings,Meat_Yield,Shell_Ratio,Weight_to_Shucked_Weight,Viscera_Ratio
0,F,0.55,0.43,0.15,0.7715,0.3285,0.1465,0.24,11.0,0.324765,0.311082,2.348554,0.18989
1,F,0.63,0.49,0.145,1.13,0.458,0.2765,0.32,11.0,0.315862,0.283186,2.467249,0.24469
2,I,0.16,0.11,0.025,0.021,0.0055,0.003,0.005,6.0,0.211538,0.238095,3.818182,0.142857
3,M,0.595,0.475,0.15,0.9145,0.3755,0.2055,0.25,10.0,0.322456,0.273373,2.435419,0.224713
4,I,0.555,0.425,0.13,0.782,0.3695,0.16,0.1975,9.0,0.377233,0.252558,2.116373,0.204604


In [5]:
def cv_model(train_df, test_df, fea, n_fold=5, seed=seed):
    """Train one LightGBM regressor per CV fold and average the folds' test predictions.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training frame; must contain the columns in ``fea`` and the target
        column ``Rings``.
    test_df : pd.DataFrame
        Frame to predict on; must contain the columns in ``fea``.
    fea : list of str
        Feature column names used for training and prediction.
    n_fold : int, default 5
        Number of stratified folds.
    seed : int
        Random seed applied to both the fold shuffling and LightGBM's
        ``random_state``.

    Returns
    -------
    test_predictions : np.ndarray
        Mean of the per-fold predictions, one value per row of ``test_df``.
    feature_importances : pd.DataFrame
        Columns ``feature`` and ``importances``; the importances are the
        values of ``model.feature_importance()`` averaged over the folds.
    """
    labels = train_df['Rings']

    # Restrict both frames to the selected features
    train = train_df[fea]
    test = test_df[fea]

    # Stratified K-fold: the splits are stratified on the target values
    skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': 1000,
        'learning_rate': 0.03,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        # NOTE(review): per the LightGBM docs, row subsampling is only applied
        # when bagging_freq (alias subsample_freq) is non-zero. It is left
        # unset here so results stay unchanged -- confirm the intent.
        'subsample': 0.8,
        'verbose': -1,
        'random_state': seed,
    }

    # Accumulator for the fold-averaged feature importances
    feature_importance_values = np.zeros(len(fea))
    # Bound up front so the name exists independently of the loop body
    feature_names = list(fea)

    # Accumulator for the fold-averaged test predictions
    test_predictions = np.zeros(test.shape[0])

    for fold, (train_index, val_index) in enumerate(skf.split(train, labels)):
        print(f"{fold + 1} fold", '-' * 30)

        # Training part of this fold
        X_train, y_train = train.iloc[train_index], labels.iloc[train_index]
        # Validation part of this fold
        X_valid, y_valid = train.iloc[val_index], labels.iloc[val_index]

        # Wrap the data in LightGBM datasets
        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_val = lgb.Dataset(X_valid, label=y_valid)

        # Fit the model; stop after 100 rounds without validation improvement
        # and log the metrics every 100 rounds
        model = lgb.train(params=params,
                          train_set=lgb_train,
                          valid_sets=[lgb_train, lgb_val],
                          valid_names=['train', 'val'],
                          callbacks=[early_stopping(100), log_evaluation(100)])

        # Best iteration reported by the trained model
        best_iteration = model.best_iteration

        # Accumulate this fold's share of the feature importances
        feature_names = model.feature_name()
        feature_importance_values += (model.feature_importance() / n_fold)

        # Predict the test set with the best iteration and accumulate
        test_prediction = model.predict(test, num_iteration=best_iteration)
        test_predictions += (test_prediction / n_fold)

        # Release the fold's objects before the next fold starts
        del model, X_train, y_train, X_valid, y_valid, lgb_train, lgb_val
        gc.collect()

    # Collect the averaged feature importances into a dataframe
    feature_importances = pd.DataFrame({'feature': feature_names, 'importances': feature_importance_values})

    return test_predictions, feature_importances

In [6]:
# Every column except the target `Rings` is used as a model input.
fea = list(train.columns)
fea.remove('Rings')

# Run the cross-validated training; returns averaged test predictions and feature importances.
test_predictions, feature_importance = cv_model(train_df=train, test_df=test, fea=fea)

1 fold ------------------------------
Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 1.83871	val's rmse: 1.8529
[200]	train's rmse: 1.78964	val's rmse: 1.83274
[300]	train's rmse: 1.76213	val's rmse: 1.82794
[400]	train's rmse: 1.73891	val's rmse: 1.82587
[500]	train's rmse: 1.71794	val's rmse: 1.82511
[600]	train's rmse: 1.69799	val's rmse: 1.82367
[700]	train's rmse: 1.67843	val's rmse: 1.82281
[800]	train's rmse: 1.66054	val's rmse: 1.8226
[900]	train's rmse: 1.64396	val's rmse: 1.82185
[1000]	train's rmse: 1.62845	val's rmse: 1.82111
Did not meet early stopping. Best iteration is:
[997]	train's rmse: 1.6289	val's rmse: 1.82103
2 fold ------------------------------
Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 1.83271	val's rmse: 1.87654
[200]	train's rmse: 1.78333	val's rmse: 1.85872
[300]	train's rmse: 1.75559	val's rmse: 1.85485
[400]	train's rmse: 1.7314	val's rmse: 1.85306
[500]	train's rmse: 1.7107	val's rm

In [7]:
# Fill the sample submission's `Rings` column with the fold-averaged predictions.
sub = pd.read_csv(root_path + 'sample_submission.csv').assign(Rings=test_predictions)

# Write the submission file without the index column.
sub.to_csv('/kaggle/working/submission.csv', index=False)