<a id='1'></a>
# 1.0- 加载所需资料与配置

In [1]:
import numpy as np
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-dec-2021/train.csv
/kaggle/input/tabular-playground-series-dec-2021/test.csv


In [2]:
import matplotlib.pyplot as plt
import plotly.express as px


from lightgbm import LGBMClassifier


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

import gc

In [3]:
# 隐藏warning
import warnings
warnings.filterwarnings('ignore')

In [4]:
#相关优化配置
SEED   = 42
pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', 15) 
pd.set_option('display.max_rows', 25)

<a id='2'></a>
# 1.2- 加载数据集

In [5]:
# 建立dataframe函数
def import_csv(path):
    """
    Import CSV to a dataframe using the specified path...
    """
    df = pd.read_csv(path)
    return df

In [6]:
# 建立dataframe
TRAIN_PATH = '/kaggle/input/tabular-playground-series-dec-2021/train.csv'
train_df = import_csv(TRAIN_PATH)

TEST_PATH = '/kaggle/input/tabular-playground-series-dec-2021/test.csv'
test_df = import_csv(TEST_PATH)

SUB_PATH = '/kaggle/input/tabular-playground-series-dec-2021/sample_submission.csv'
submission_df = import_csv(SUB_PATH)

In [7]:
# 内存优化函数
def reduce_mem_usage(df, verbose = True):
    """
    Takes an input dataframe and optimize the variable types to reduce memory consumption.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem)) 
        
    return df

In [8]:
# 进行优化
train_df = reduce_mem_usage(train_df)
test_df = reduce_mem_usage(test_df)

Mem. usage decreased to 259.40 Mb (84.8% reduction)
Mem. usage decreased to 63.90 Mb (84.8% reduction)


___

<a id='3'></a>
# 2.0-了解数据集

In [9]:
# 观察train内的前5行
train_df.head()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,...,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,0,3189,40,8,30,13,3270,...,0,0,0,0,0,0,1
1,1,3026,182,5,280,29,3270,...,0,0,0,0,0,0,2
2,2,3106,13,7,351,37,2914,...,0,0,0,0,0,0,1
3,3,3022,276,13,192,16,3034,...,0,0,0,0,0,0,2
4,4,2906,186,13,266,22,2916,...,0,0,0,0,0,0,2


In [10]:
# 观察统计变量
train_df.describe()

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,...,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
count,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,...,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0,4000000.0
mean,1999999.5,2980.19,151.59,15.1,271.32,51.66,1766.64,...,0.02,0.01,0.01,0.04,0.04,0.03,1.77
std,1154700.68,289.05,109.96,8.55,226.55,68.22,1315.61,...,0.13,0.1,0.11,0.2,0.19,0.17,0.89
min,0.0,1773.0,-33.0,-3.0,-92.0,-317.0,-287.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,999999.75,2760.0,60.0,9.0,110.0,4.0,822.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1999999.5,2966.0,123.0,14.0,213.0,31.0,1436.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,2999999.25,3217.0,247.0,20.0,361.0,78.0,2365.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0
max,3999999.0,4383.0,407.0,64.0,1602.0,647.0,7666.0,...,1.0,1.0,1.0,1.0,1.0,1.0,7.0


In [11]:
# 通过Hill Shade features进一步理解
train_df[['Hillshade_9am','Hillshade_Noon','Hillshade_3pm']].describe()

Unnamed: 0,Hillshade_9am,Hillshade_Noon,Hillshade_3pm
count,4000000.0,4000000.0,4000000.0
mean,211.84,221.06,140.81
std,30.76,22.23,43.7
min,-4.0,49.0,-53.0
25%,198.0,210.0,115.0
50%,218.0,224.0,142.0
75%,233.0,237.0,169.0
max,301.0,279.0,272.0


In [12]:
# 观察Soil_Type15 和 Soil_Type7
train_df[['Soil_Type15','Soil_Type7']].describe()

Unnamed: 0,Soil_Type15,Soil_Type7
count,4000000.0,4000000.0
mean,0.0,0.0
std,0.0,0.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,0.0


In [13]:
# 观察一些指标
train_df.info(max_cols = 15)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000000 entries, 0 to 3999999
Columns: 56 entries, Id to Cover_Type
dtypes: int16(9), int32(1), int8(46)
memory usage: 259.4 MB


In [14]:
# 观察有没有丢失数据
train_df.isnull().sum()

Id                                  0
Elevation                           0
Aspect                              0
Slope                               0
Horizontal_Distance_To_Hydrology    0
                                   ..
Soil_Type37                         0
Soil_Type38                         0
Soil_Type39                         0
Soil_Type40                         0
Cover_Type                          0
Length: 56, dtype: int64

**Notes:** There is no **Null** or **NaN** values in the dataset, this make it easier to start training a model without gettin into filling missing values...

In [15]:
# 可视化目标分布
target_df = pd.DataFrame(train_df['Cover_Type'].value_counts()).reset_index()
target_df.columns = ['Cover_Type', 'count']
fig = px.bar(data_frame =target_df, x = 'Cover_Type', y = 'count', color = "count", 
             color_continuous_scale = "Emrld", width=700, height=400)
fig.show()

In [16]:
# 多少数据是可用的
target_df.groupby('Cover_Type').sum().reset_index()

Unnamed: 0,Cover_Type,count
0,1,1468136
1,2,2262087
2,3,195712
3,4,377
4,5,1
5,6,11426
6,7,62261



**Notes:** The target distribution in the dataset is unbalanced, we will train a baseline model and then build small improvements over time, in the following sections...

<a id='4'></a>
# 3.0- 特征工程
更简单的模型更容易理解，可以作为好的基线模型

In [17]:
# Define fuctions to construct new features for the model.

def euclidean_dist(df, horz_dist, vert_dist, feature_name):
    """
    Calculates the Euclidean distance based on horz. and vert. distance...
    Args:
        df (DataFrame)    : Input dataframe to add the new feature.
        horz_dist (str)   : Name of the horizontal distance field in the dataframe.
        vert_dist (str)   : Name of the vertical distance field in the dataframe.
        feature_name (str): Name of the new created field in the dataframe.
    Returns:
        df (DataFrame)    : Dataframe populated with the new feature.
    """
    
    df[feature_name] = np.sqrt(df[horz_dist] ** 2 + df[vert_dist] ** 2)
    return df
        
def correct_azimut(df):
    """
    Corrects the azimut to be withing 0 - 360 degrees
    Args:
        df (DataFrame): Input dataframe that we need to correct.
    
    Returns:
        df (DataFrame): Dataframe with the feature ranges corrected.
    """
    
    df["Aspect"][df["Aspect"] < 0] += 360
    df["Aspect"][df["Aspect"] > 359] -= 360
    return df


def correct_hillshade(df):
    """
    Corrects the Hillshade values from 0 - 255
    Args:
        df (DataFrame): Input dataframe that we need to correct.
    
    Returns:
        df (DataFrame): Dataframe with the feature ranges corrected.
    """
    
    for feat in df.columns:
        if 'Hillshade' in feat:
            train_df.loc[train_df[feat] < 0, feat] = 0
            train_df.loc[train_df[feat] > 255, feat] = 255
    return df


def count_diversity(df, feature_group_name = 'soil_type'):
    """
    Counts the diversity for example different types of soils in the training row
    Args:
        df (Dataframe): Input dataframe for the feature creation.
        feature_group_name (str): Name of the group in the dataframe that we will be counting.
        
    Returns:
        df (Dataframe): Dataframe populated with the new feature.
    """
    
    features_group = [x for x in df.columns if x.startswith(feature_group_name)]
    df[feature_group_name + "_Count"] = df[features_group].sum(axis=1)
    return df


def remove_cover_type(df, cover_value = 5):
    """
    Remove the selected cover type, Because min. amount of train information.
    
    Args:
        df (Dataframe): Input dataframe to be processed.
        cover_value (int): Value of the cover that needs to be removed from the DataFrame.
    
    Returns:
        df (Dataframe): Dataframe without the selected cover_value.
    """
    
    df = df[df['Cover_Type'] != cover_value]
    return df

In [18]:
# 建立新的特征值
train_df = euclidean_dist(train_df, 'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology', 'Euclidean_Distance_To_Hydrology')
# 加权
train_df = correct_azimut(train_df)
train_df = correct_hillshade(train_df)
# 建立其他特征值
train_df = count_diversity(train_df, feature_group_name = 'Soil_Type')
train_df = count_diversity(train_df, feature_group_name = 'Wilderness_Area')
# 因为只有一个例子，把conver_type = 5 移开
train_df = remove_cover_type(train_df)

In [19]:
# 建立新的特征值
test_df = euclidean_dist(test_df, 'Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology', 'Euclidean_Distance_To_Hydrology')
# 加权
test_df = correct_azimut(test_df)
test_df = correct_hillshade(test_df)
# 建立其他特征值
test_df = count_diversity(test_df, feature_group_name = 'Soil_Type')
test_df = count_diversity(test_df, feature_group_name = 'Wilderness_Area')

In [20]:
# 查看
train_df.sample(10)

Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,...,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type,Euclidean_Distance_To_Hydrology,Soil_Type_Count,Wilderness_Area_Count
382060,382060,3411,111,15,345,29,1871,...,0,0,0,1,,1,1
3153903,3153903,3028,209,9,106,41,2059,...,0,0,1,1,113.65,2,0
2797641,2797641,3211,26,36,49,383,2047,...,0,0,0,2,134.23,0,1
655981,655981,2703,153,25,150,97,1075,...,0,0,0,2,178.63,0,1
3828362,3828362,2960,36,7,55,47,1944,...,0,0,0,2,72.35,0,0
2494543,2494543,2973,303,10,300,66,3375,...,0,0,0,2,169.76,0,1
2146108,2146108,3079,295,28,85,-60,1435,...,0,0,0,1,104.04,0,1
3417162,3417162,3525,321,4,388,6,1026,...,0,0,0,1,139.67,0,1
1621982,1621982,2791,181,15,1097,34,399,...,0,0,0,2,157.85,2,1
1050172,1050172,3190,187,13,55,1,702,...,0,0,0,1,55.01,2,1


___

<a id='5'></a>
# 4.0- 为训练步骤准备数据集
本节将组织训练功能的数据集，还将重点介绍识别分类特征与数值特征，以提高模型性能，同时对数值变量应用一些归一化。

In [21]:
# 准备用于训练的数据集，删除一些没有价值的变量。
remove = ['Id', 'Cover_Type', 'Soil_Type15', 'Soil_Type7'] # 通过以上分析，决定删除这些数据
features = [values for values in train_df.columns if values not in remove]
X = train_df[features]
y = train_df['Cover_Type']

X_test = test_df[features]

In [22]:
# 识别分类和数值特征
NUMBER_OF_UNIQUE = 25
categorical_features = [col for col in features if train_df[col].nunique() < NUMBER_OF_UNIQUE]
numerical_features = [col for col in features if train_df[col].nunique() >= NUMBER_OF_UNIQUE]

In [23]:
# 输出所有已识别分类要素
print(categorical_features)

['Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40', 'Soil_Type_Count', 'Wilderness_Area_Count']


In [24]:
# 删除原始数据帧以节省内存（优化）
del train_df # Deletes the train_df dataframe
del test_df  # Deletes the test_df dataframe
gc.collect()

57

<a id='6'></a>
# 5.0- 交叉验证策略

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = SEED, test_size = 0.2)

---

<a id='7'></a>
# 6.0- LGBM训练机器学习模型

In [26]:
# 定义一些模型参数
ESTIMATORS = 10 # Default value for the training 2048
LR = 0.1
VERBOSE = 16
EARLY_STOPPING = 250

lgb_params = {'n_estimators'     : ESTIMATORS,      # Number of boosting iterations.
              'random_state'     : SEED,            # Random seed initilizer for the model, helps to replicate the experiments.
              'learning_rate'    : LR,              # The model learning rate.
              'subsample'        : 0.95,            # Row subsample from the dataset, like feature_fraction, but this will randomly select part of data without resampling
              'subsample_freq'   : 1,               # Use or not subsample frequency.
              'colsample_bytree' : 0.75,            # LightGBM will randomly select a subset of features on each iteration (tree).
              'reg_alpha'        : 0.5,             # L1 regularization.
              'reg_lambda'       : 0.5,             # L2 regularization.
              'min_child_weight' : 1e-3,            # Minimal sum hessian in one leaf, it can be used to deal with over-fitting.
              'min_child_samples': 32,              # Minimal number of data in one leaf. Can be used to deal with over-fitting.
              'objective'        : 'multiclass',    # Softmax objective function.
              'metric'           : 'multi_logloss', # Log loss for multi-class classification.
              'device_type'      : 'gpu',
             }   

<a id='8'></a>
# 6.1- 使用LightGBM训练梯度提升决策树


In [27]:
# 使用lgb_params训练 LGBM 模型
lgb_classifier = LGBMClassifier(**lgb_params)

lgb_classifier.fit(X_train, 
                   y_train, 
                   eval_set = [(X_val, y_val)],
                   categorical_feature = categorical_features,
                   # 如果一个验证数据中的一个指标在最后early_stopping_round轮中没有改善，将停止训练
                   early_stopping_rounds = EARLY_STOPPING,
                   verbose = VERBOSE, # 控制 LightGBM 的详细程度
                  )

LGBMClassifier(colsample_bytree=0.75, device_type='gpu', metric='multi_logloss',
               min_child_samples=32, n_estimators=10, objective='multiclass',
               random_state=42, reg_alpha=0.5, reg_lambda=0.5, subsample=0.95,
               subsample_freq=1)

<a id='9'></a>
# 6.2- 验证模型结果

In [28]:
# 查看模型的前10个特征重要性。
feature_imp = pd.DataFrame(sorted(zip(lgb_classifier.feature_importances_,X.columns)),columns=['Value','Feature'])

In [29]:
# 回顾模型认为重要的前五5个特征
print(feature_imp.head(10), '\n')
print(feature_imp.tail(10))

   Value      Feature
0      0  Soil_Type18
1      0  Soil_Type19
2      0  Soil_Type21
3      0  Soil_Type25
4      0  Soil_Type26
5      0   Soil_Type8
6      0   Soil_Type9
7      1  Soil_Type12
8      1  Soil_Type14
9      1  Soil_Type16 

    Value                             Feature
45     43                         Soil_Type10
46     48     Euclidean_Distance_To_Hydrology
47     56                    Wilderness_Area1
48     95                    Wilderness_Area3
49    116                     Soil_Type_Count
50    144      Vertical_Distance_To_Hydrology
51    147    Horizontal_Distance_To_Hydrology
52    185     Horizontal_Distance_To_Roadways
53    206  Horizontal_Distance_To_Fire_Points
54    313                           Elevation


In [30]:
# 模型预测的结果和针对真实目标的评估
preds_valid = lgb_classifier.predict(X_val)
accuracy = accuracy_score(y_val, preds_valid)
print("Mean Accuracy :", accuracy)

Mean Accuracy : 0.9336425


* **Mean Accuracy : 0.96136375** ... Simple baseline model, No Features outside the default variables...
* **Mean Accuracy : 0.96140750** ... Added Euclidean distance as a Feature to the model...
* **Mean Accuracy : 0.96140750** ... Added Euclidean distance as a Feature + Corrections to the ranges of the Azimut and the Hillshade...
* **Mean Accuracy : 0.96057125** ... Added Euclidean distance as a Feature + Corrections to the ranges of the Azimut and the Hillshade, New Train Parameters 128 Estimators...
* **Mean Accuracy : 0.96060625** ... Added Euclidean distance as a Feature + Corrections to the ranges of the Azimut and the Hillshade, New Train Parameters 128 Estimators, Removed Soil 17 & 7...
* **Mean Accuracy : 0.96217000** ... Added Euclidean distance as a Feature + Corrections to the ranges of the Azimut and the Hillshade, New Train Parameters 2048 Estimators, Removed Soil 17 & 7...

In [31]:
# 删除为简单训练创建的训练和验证数据帧以节省内存（优化）
del X_train, 
del X_val, 
del y_train, 
del y_val
gc.collect()

21

---

<a id='10'></a>
# 7.0- 训练机器学习模型LGBM

<a id='11'></a>
# 7.1- 使用LightGBM在交叉验证循环、梯度提升决策树中训练简单模型

In [32]:
# 定义一些交叉验证模型参数
FOLDS = 20
folds = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = SEED)

oof_preds = np.zeros(X.shape[0])
sub_preds = np.zeros(X_test.shape[0])

for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
    trn_x, trn_y = X[features].iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X[features].iloc[val_idx], y.iloc[val_idx]
    
    clf = LGBMClassifier(**lgb_params)
    clf.fit(trn_x, 
            trn_y, 
            eval_set = [(val_x, val_y)],
            categorical_feature = categorical_features,
            verbose=VERBOSE, 
            early_stopping_rounds=EARLY_STOPPING)
    
    oof_preds[val_idx] = clf.predict(val_x, num_iteration=clf.best_iteration_)
    sub_preds += clf.predict(X_test[features], num_iteration=clf.best_iteration_) / folds.n_splits
    
    print('Fold %2d Accuracy Score: %.6f' % (n_fold + 1, accuracy_score(val_y, oof_preds[val_idx])))
    del trn_x, trn_y, val_x, val_y
    gc.collect()

print('\n')    
print('Avg. Accuracy Score %.6f' % accuracy_score(y, oof_preds))

Fold  1 Accuracy Score: 0.933040
Fold  2 Accuracy Score: 0.934170
Fold  3 Accuracy Score: 0.933530
Fold  4 Accuracy Score: 0.933450
Fold  5 Accuracy Score: 0.933270
Fold  6 Accuracy Score: 0.933310
Fold  7 Accuracy Score: 0.933545
Fold  8 Accuracy Score: 0.933800
Fold  9 Accuracy Score: 0.933550
Fold 10 Accuracy Score: 0.932425
Fold 11 Accuracy Score: 0.932870
Fold 12 Accuracy Score: 0.933085
Fold 13 Accuracy Score: 0.933745
Fold 14 Accuracy Score: 0.934710
Fold 15 Accuracy Score: 0.933125
Fold 16 Accuracy Score: 0.934175
Fold 17 Accuracy Score: 0.933525
Fold 18 Accuracy Score: 0.933135
Fold 19 Accuracy Score: 0.932800
Fold 20 Accuracy Score: 0.932910


Avg. Accuracy Score 0.933408


<a id='12'></a>
# 7.2- 验证模型结果

In [33]:
# 查看模型的前 10 个特征重要性
feature_imp_cv = pd.DataFrame(sorted(zip(clf.feature_importances_,X.columns)), columns=['Value','Feature'])

In [34]:
# 回顾模型认为重要的前五个特征
print(feature_imp_cv.head(10), '\n')
print(feature_imp_cv.tail(10))

   Value      Feature
0      0  Soil_Type18
1      0  Soil_Type19
2      0  Soil_Type21
3      0  Soil_Type25
4      0  Soil_Type28
5      0   Soil_Type8
6      0   Soil_Type9
7      1  Soil_Type12
8      1  Soil_Type16
9      1  Soil_Type20 

    Value                             Feature
45     43                         Soil_Type10
46     44     Euclidean_Distance_To_Hydrology
47     60                    Wilderness_Area1
48     94                    Wilderness_Area3
49    118                     Soil_Type_Count
50    154      Vertical_Distance_To_Hydrology
51    159    Horizontal_Distance_To_Hydrology
52    168     Horizontal_Distance_To_Roadways
53    218  Horizontal_Distance_To_Fire_Points
54    308                           Elevation


___

<a id='13'></a>
# 8.0 - 提交

In [35]:
# 使用 80/20 train/val 模型计算预测
preds_test = lgb_classifier.predict(X_test[features])

# 提交来自 CV 训练模型的预设
sub_preds = sub_preds.astype(np.int)
submission_df['Cover_Type'] = sub_preds
submission_df.to_csv('cv_sub_v12112021.csv', index = None)