In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

DIR = "/content/drive/MyDrive/Competitions/Signate/OCEAN180"
INPUT_DIR = os.path.join(DIR,"input")
OUTPUT_DIR = os.path.join(DIR,"output")

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
train = pd.read_csv(os.path.join(INPUT_DIR,"train_data.csv"))
test = pd.read_csv(os.path.join(INPUT_DIR,"test_data.csv"))
sample_sub = pd.read_csv(os.path.join(INPUT_DIR,"submit_example.csv"), header=None)

print(train.shape)
display(train.head(3))

print(test.shape)
display(test.head(3))

print(sample_sub.shape)
display(sample_sub.head(3))

(14140, 3465)


Unnamed: 0.1,Unnamed: 0,lat,lon,area,YMD,year,month,cover,depth_original,cliff_length,...,MIN_NormG_2020,MIN_NormR_2020,MIN_PPR_2020,MIN_PSNDc2_2020,MIN_RDVI_2020,MIN_IF_2020,MIN_SLAVI_2020,MIN_SIPI2_2020,MIN_VARIgreen_2020,mesh20
0,0,24.989139,125.243167,20.0,20110926,2011.0,9.0,0.05,,0.0,...,0.507961,0.287288,-0.278071,-0.808075,-2.341663,2.353039,95.142525,3.728732,-2.405135,3725_10
1,1,26.996172,127.912024,40.0,2009.7.6-2009.7.28,2009.0,7.0,0.725,,0.0,...,0.353645,0.245048,-0.100556,-0.771423,-3.282602,4.672225,174.767501,-0.571513,0.448861,4027_13
2,2,26.363556,127.735139,20.0,20091014,2009.0,10.0,0.025,,0.0,...,0.579204,0.28126,-0.191884,-0.85373,-3.160207,3.324236,62.711236,3.284478,-57.136191,3927_22


(4039, 3464)


Unnamed: 0.1,Unnamed: 0,lat,lon,area,YMD,year,month,depth_original,cliff_length,aicial_length,...,MIN_NormG_2020,MIN_NormR_2020,MIN_PPR_2020,MIN_PSNDc2_2020,MIN_RDVI_2020,MIN_IF_2020,MIN_SLAVI_2020,MIN_SIPI2_2020,MIN_VARIgreen_2020,mesh20
0,0,24.352222,124.202674,20.0,2002.12,2002.0,12.0,0.5,0.0,725.621704,...,0.409346,0.323023,-0.066356,-0.56081,-3.203797,4.297234,926.905329,1.669992,0.194766,3624_20
1,1,24.420028,124.078583,20.0,20100723,2010.0,7.0,,0.0,0.0,...,0.540733,0.275585,-0.29124,-0.824888,-2.222276,2.263106,57.661693,4.274851,-79.273275,3624_20
2,2,26.494197,127.836013,100.0,"2004.2.20-2004.2.22,2004.2.27-2004.2.29,2004.3...",2004.0,2.0,13.8,73.183418,80.595177,...,0.601792,0.252251,-0.318454,-0.861477,-2.060505,2.09925,42.672829,5.655713,-2.061417,3927_23


(4039, 2)


Unnamed: 0,0,1
0,0,0.47
1,1,0.68
2,2,0.25


In [None]:
train.rename(columns={"Unnamed: 0":"id"}, inplace=True)
test.rename(columns={"Unnamed: 0":"id"}, inplace=True)

In [None]:
def missing_func(df):
  total = df.isnull().sum().sort_values(ascending=False)
  percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
  df_missing = pd.concat([total,percent],axis=1,keys=['total','percent'])
  return df_missing 

In [None]:
train_missing = missing_func(train)
test_missing = missing_func(test)

In [None]:
train_cols = train.columns.tolist()
test_cols = test.columns.tolist()

target = "cover"
print(target)

display(train[target].head(3))

cover


0    0.050
1    0.725
2    0.025
Name: cover, dtype: float64

In [None]:
train_missing_40 = train_missing[train_missing.percent<0.4].index
test_missing_40 =  test_missing[test_missing.percent<0.4].index
train_missing_15 = train_missing[train_missing.percent<0.15].index
test_missing_15 =  test_missing[test_missing.percent<0.15].index

categorical_features = train.dtypes[train.dtypes=="object"].index

features_40 = set(train_missing_40) & set(test_missing_40)
features_15 = set(train_missing_15) & set(test_missing_15)
features = set(features_40) - set(categorical_features)
knn_features = set(features) - set(features_15)  

In [None]:
len(features), len(knn_features)

(2259, 512)

In [None]:
land_sat_feats = []
for years in list(range(2000,2021)):
  n_f = [col for col in features if str(years) in col]
  land_sat_feats += n_f
features = [col for col in features if col not in land_sat_feats]
knn_features = [col for col in knn_features if col not in land_sat_feats]

In [None]:
len(features), len(knn_features)

(309, 62)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from tqdm import tqdm

missing_train = train.fillna(9999)
missing_test = test.fillna(9999)

for mcol in tqdm(knn_features):
    knn_mtrain = missing_train.loc[missing_train[mcol]==9999]
    knn_train = missing_train.loc[missing_train[mcol]!=9999]
    knn_mtest = missing_test.loc[missing_test[mcol]==9999]

    knn = KNeighborsRegressor(n_neighbors=3)
    knn.fit(knn_train[features],knn_train[mcol])
    knn_mtrain[f"knn_{mcol}"] = knn.predict(knn_mtrain[features])
    knn_mtest[f"knn_{mcol}"] = knn.predict(knn_mtest[features])

    train = train.merge(knn_mtrain[["id",f"knn_{mcol}"]],how="left",on="id")
    train[mcol] = train[mcol].fillna(0)
    train[f"knn_{mcol}"] = train[f"knn_{mcol}"].fillna(0)
    train[mcol] = train[mcol] + train[f"knn_{mcol}"]

    test = test.merge(knn_mtest[["id",f"knn_{mcol}"]],how="left",on="id")
    test[mcol] = test[mcol].fillna(0)
    test[f"knn_{mcol}"] = test[f"knn_{mcol}"].fillna(0)
    test[mcol] = test[mcol] + test[f"knn_{mcol}"]

display(train[knn_features].head(10))
display(test[knn_features].head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_mtrain[f"knn_{mcol}"] = knn.predict(knn_mtrain[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_mtest[f"knn_{mcol}"] = knn.predict(knn_mtest[features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn_mtrain[f"knn_{mcol}"] = knn.predict(knn_mtrain[features])
A value is trying to 

Unnamed: 0,DSWI,CTVI,H,mCRIRE,TSAVI,PSNDc2,CVI,TIRS1,BWDRVI,Blue,...,NBR,Gossan,NIR,Ferric_Oxides,Fe3,GLI,Red,MAX_CI,IF,MCARI1
0,2.304861,0.410823,1.536542,-0.264809,-0.716042,-0.816512,0.092327,8.243456,-0.98,44.513775,...,0.945309,0.022344,4.4964,0.061078,0.502376,-0.074453,12.291277,0.004414,2.214614,7.768361
1,2.496108,0.568762,1.410671,-0.463735,0.072192,-0.043946,0.675865,8.649507,-0.832205,76.809082,...,0.97147,0.125043,70.342323,0.090641,0.699943,0.065441,50.98962,-0.048293,12.037422,61.967369
2,2.222647,0.441918,1.532755,-0.270155,-0.638658,-0.734491,0.120194,8.27995,-0.969846,51.095936,...,0.971226,0.034677,7.821566,0.081935,0.532909,-0.001577,18.48061,4.169011,2.973305,9.920009
3,1.984526,0.580183,1.495565,-0.30063,-0.346838,-0.413721,0.388353,8.857862,-0.918048,57.790521,...,0.972719,0.067277,25.018325,0.088156,0.720089,0.011774,33.735753,1.480026,3.429616,7.866391
4,3.442151,-0.08945,1.554811,-0.374138,-0.601136,-0.857133,0.048014,8.385872,-0.984732,58.162979,...,0.984011,0.021489,4.474403,0.04764,0.326259,-0.056439,9.9196,-0.002413,2.475886,24.114712
5,1.984526,0.580183,1.495565,-0.30063,-0.346838,-0.413721,0.388353,8.857862,-0.918048,57.790521,...,0.972719,0.067277,25.018325,0.088156,0.720089,0.011774,33.735753,-0.065459,3.429616,7.866391
6,2.359482,0.404647,1.540411,-0.299195,-0.633086,-0.773227,0.105371,8.251698,-0.974746,50.166786,...,0.97309,0.034259,6.41568,0.080216,0.496713,-0.037421,15.022129,-0.049355,2.527918,11.351343
7,3.033152,0.130981,1.554245,-0.20086,-0.867709,-0.870618,0.032844,8.843853,-0.986262,75.348015,...,0.944751,0.034347,5.211494,0.127199,0.348759,0.078071,19.299984,-0.300877,4.602264,35.933363
8,2.513713,0.358983,1.551772,-0.326286,-0.599333,-0.773672,0.09548,9.141504,-0.974801,62.784416,...,0.951746,0.047933,8.011537,0.105604,0.45865,-0.022024,17.650833,0.143764,2.714684,18.619661
9,2.57504,0.337943,1.54039,-0.315544,-0.640413,-0.809502,0.082666,8.520046,-0.979164,46.514515,...,0.965199,0.039619,4.896877,0.094146,0.443215,-0.050981,11.636496,-0.014208,2.443077,13.099381


Unnamed: 0,DSWI,CTVI,H,mCRIRE,TSAVI,PSNDc2,CVI,TIRS1,BWDRVI,Blue,...,NBR,Gossan,NIR,Ferric_Oxides,Fe3,GLI,Red,MAX_CI,IF,MCARI1
0,1.984526,0.580183,1.495565,-0.30063,-0.346838,-0.413721,0.388353,8.857862,-0.918048,57.790521,...,0.972719,0.067277,25.018325,0.088156,0.720089,0.011774,33.735753,-0.010624,3.429616,7.866391
1,2.717712,0.324673,1.544322,-0.41828,-0.463083,-0.730532,0.113065,8.792016,-0.969327,55.66952,...,0.954783,0.057763,8.606872,0.100038,0.438826,-0.011086,15.177749,-0.031198,3.004079,22.162411
2,2.010201,0.581877,1.497484,-0.316523,-0.303412,-0.389217,0.406549,8.868892,-0.914342,57.926258,...,0.971844,0.068976,26.14219,0.0867,0.722763,0.009415,33.772681,0.036575,3.320265,9.178399
3,2.837816,0.254681,1.552029,-0.293223,-0.711587,-0.859705,0.057112,8.604989,-0.985025,55.203629,...,0.983847,0.025207,4.164535,0.068374,0.393594,-0.073439,11.296329,4.349364,2.313358,16.880606
4,1.942106,0.478249,1.529995,-0.135897,-0.928132,-0.817739,0.077599,8.590875,-0.980145,56.845776,...,0.966725,0.019871,5.69982,0.084139,0.57322,0.019538,24.134905,0.069147,3.43788,1.485439
5,2.024242,0.570722,1.503037,-0.313834,-0.3373,-0.421694,0.383605,8.869017,-0.919191,57.250135,...,0.97241,0.066737,24.570061,0.085772,0.706031,0.00818,32.193857,-0.062754,3.231371,9.650818
6,2.726644,0.313967,1.547764,-0.344197,-0.601392,-0.806173,0.082369,8.520046,-0.978765,52.546371,...,0.97022,0.024255,5.638958,0.05372,0.427119,-0.053068,12.489099,0.030087,2.437501,16.267716
7,2.767933,0.253065,1.550916,-0.257846,-0.752735,-0.845949,0.052659,9.294236,-0.983447,59.875225,...,0.941589,0.043065,4.99679,0.126179,0.392799,0.000194,14.640511,0.165662,3.002557,21.418659
8,2.612103,0.33897,1.538688,-0.339532,-0.588053,-0.781347,0.091107,8.452959,-0.975749,47.055965,...,0.982552,0.039329,5.77592,0.085077,0.44394,-0.028155,12.49447,0.211715,2.655085,14.739343
9,1.984526,0.580183,1.495565,-0.30063,-0.346838,-0.413721,0.388353,8.857862,-0.918048,57.790521,...,0.972719,0.067277,25.018325,0.088156,0.720089,0.011774,33.735753,-0.316521,3.429616,7.866391


## Model

In [None]:
#!pip install iterative-stratification

In [None]:
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, GroupKFold, StratifiedGroupKFold, KFold, TimeSeriesSplit
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
class CFG:
  ver=73
  model="xgboost"
  n_folds=15
  trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
  seed=42
  boosting_type="gbdt"

In [None]:
# =========================================================================================
# CV split
# =========================================================================================
def get_fold(df):
  Fold = TimeSeriesSplit(n_splits=CFG.n_folds)
  for n, (train_index, val_index) in enumerate(Fold.split(df)):
    df.loc[train_index, f'tr_fold{n}'] = 1
    df.loc[val_index, 'va_fold'] = int(n)
    df[f"tr_fold{n}"] = df[f"tr_fold{n}"].fillna(0)
    df[f'tr_fold{n}'] = df[f'tr_fold{n}'].astype(int)
  df["va_fold"] = df["va_fold"].fillna(999)
  df['va_fold'] = df['va_fold'].astype(int)
  return df

In [None]:
def get_score(labels, preds):
  rmse = np.sqrt(mean_squared_error(labels, preds))
  print(f"RMSE {rmse}")

In [None]:
#features = list(features)
features.remove('id')
#features.remove('area')
features.remove("year")
#features.remove("month")
#features.append("mesh20")

In [None]:
params = {
        'objective': 'reg:squarederror',
        'metric': 'rmse',
        'eta': 0.05,
        'booster': 'dart',
        'random_state': CFG.seed,
        "alpha":0.1,
        #'reg_lambda': 0.5,
        "n_estimators":5000,
        'min_child_weight': 1,
        'colsample_bytree': 0.6
    }

In [None]:
def get_pred(df, test, params):
  oof_df = pd.DataFrame()
  df_importance = pd.DataFrame({'feature': features})
  df_importance['importance'] = 0
  test_preds = np.zeros((len(test), CFG.n_folds))
  params["n_estimators"] = 5000

  ctg_list = ["year","month","mesh20"]

  for fold in range(CFG.n_folds):
    if fold in CFG.trn_fold:
      print("="*15,f" FOLD : {fold} ","="*15)
      train_folds = train[train[f'tr_fold{fold}'] == 1]
      valid_folds = train[train['va_fold'] == fold]

      print(f"train period : {str(train_folds.iloc[0].year)}年{str(train_folds.iloc[0].month)}月 ～ {str(train_folds.iloc[-1].year)}年{str(train_folds.iloc[-1].month)}月")
      print(f"valid period : {str(valid_folds.iloc[0].year)}年{str(valid_folds.iloc[0].month)}月 ～ {str(valid_folds.iloc[-1].year)}年{str(valid_folds.iloc[-1].month)}月")


      train_X = train_folds[features].values
      valid_X = valid_folds[features].values
      test_X = test[features].values
      train_labels = train_folds[target].values
      valid_labels = valid_folds[target].values

      #clf = model
      params["n_estimators"] += 250
      clf = xgb.XGBRegressor(**params)
      clf.fit(train_X, train_labels,
              eval_set=[(valid_X, valid_labels)],
              #eval_metric="rmse",
              #categorical_feature = ctg_list,
              verbose = 1000,
              early_stopping_rounds=50)
      
      valid_folds[f"pred"] = clf.predict(valid_X)
      test_preds[:, fold] = clf.predict(test_X)
      oof_df = pd.concat([oof_df, valid_folds])

      #f_importance = np.array(clf.feature_importances_)
      #temp_importance = pd.DataFrame({'feature': features, 'importance': f_importance})
      #df_importance['importance'] += temp_importance['importance']

      get_score(valid_labels, valid_folds["pred"])

  #df_importance['importance'] = df_importance['importance'] / np.sum(df_importance['importance'])
  #df_importance = df_importance.sort_values('importance', ascending=False)
  #display(df_importance.head(35))
  #fig = plt.figure(figsize=(16, 32), tight_layout=True)
  #sns.barplot(x="importance", y="feature", data=df_importance.head(35))
  #plt.title("LightGBM feature importances")
  get_score(oof_df[target], oof_df[f"pred"])
  return oof_df,  test_preds

In [22]:
train['year-month'] = train.year.astype(int).astype(str) + '-' + train.month.astype(int).astype(str)
train['year-month'] = pd.to_datetime(train['year-month'], format='%Y-%m')
train = train.sort_values("year-month").reset_index(drop=True)
train = get_fold(train)
oof_preds = []
preds = np.zeros((len(test), 4))
seeds = [42, 18, 0, 2023]
for i in range(4):
  print("#"*15,f" SEED : {CFG.seed} ","#"*15)
  params["seed"] = seeds[i]
  oof_df, test_pred = get_pred(train, test, params)
  pred = np.mean(test_pred, axis=1)
  preds[:, i] = pred
  oof_preds.append(oof_df["pred"].values)
  print()

###############  SEED : 42  ###############
train period : 1999.0年11.0月 ～ 2009.0年4.0月
valid period : 2009.0年4.0月 ～ 2009.0年10.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.31759




[474]	validation_0-rmse:0.16943


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1694269559990698
train period : 1999.0年11.0月 ～ 2009.0年10.0月
valid period : 2009.0年10.0月 ～ 2009.0年10.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.39784
[202]	validation_0-rmse:0.08960


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.08929166646511166
train period : 1999.0年11.0月 ～ 2009.0年10.0月
valid period : 2009.0年10.0月 ～ 2009.0年12.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41539
[241]	validation_0-rmse:0.17460


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.174467560910829
train period : 1999.0年11.0月 ～ 2009.0年12.0月
valid period : 2009.0年12.0月 ～ 2010.0年8.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.43558
[128]	validation_0-rmse:0.14779


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.14633902675290983
train period : 1999.0年11.0月 ～ 2010.0年8.0月
valid period : 2010.0年8.0月 ～ 2010.0年9.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.41412
[225]	validation_0-rmse:0.11412


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.11367755044905073
train period : 1999.0年11.0月 ～ 2010.0年9.0月
valid period : 2010.0年9.0月 ～ 2010.0年12.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.41878
[107]	validation_0-rmse:0.11316


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1111313776006507
train period : 1999.0年11.0月 ～ 2010.0年12.0月
valid period : 2010.0年12.0月 ～ 2011.0年4.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.38938
[125]	validation_0-rmse:0.17764


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.17698170869945037
train period : 1999.0年11.0月 ～ 2011.0年4.0月
valid period : 2011.0年4.0月 ～ 2011.0年7.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41413
[132]	validation_0-rmse:0.05958


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.059370868284005444
train period : 1999.0年11.0月 ～ 2011.0年7.0月
valid period : 2011.0年7.0月 ～ 2011.0年9.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.42290
[106]	validation_0-rmse:0.15149


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1492454889667899
train period : 1999.0年11.0月 ～ 2011.0年9.0月
valid period : 2011.0年9.0月 ～ 2011.0年10.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41649
[1000]	validation_0-rmse:0.05904
[1208]	validation_0-rmse:0.05897


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.05896123118452762
train period : 1999.0年11.0月 ～ 2011.0年10.0月
valid period : 2011.0年10.0月 ～ 2019.0年8.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.37167
[106]	validation_0-rmse:0.25270


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.2486447546767491
train period : 1999.0年11.0月 ～ 2019.0年8.0月
valid period : 2019.0年8.0月 ～ 2019.0年9.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.39341
[102]	validation_0-rmse:0.24420


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.2387142063377302
train period : 1999.0年11.0月 ～ 2019.0年9.0月
valid period : 2019.0年9.0月 ～ 2019.0年10.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.40858
[411]	validation_0-rmse:0.26818


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.26808551961653754
train period : 1999.0年11.0月 ～ 2019.0年10.0月
valid period : 2019.0年10.0月 ～ 2019.0年10.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.40154
[669]	validation_0-rmse:0.19530


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.19508476428336002
train period : 1999.0年11.0月 ～ 2019.0年10.0月
valid period : 2019.0年10.0月 ～ 2020.0年2.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.38034
[460]	validation_0-rmse:0.18511


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1846105631576404
RMSE 0.17072805464973972

###############  SEED : 42  ###############
train period : 1999.0年11.0月 ～ 2009.0年4.0月
valid period : 2009.0年4.0月 ～ 2009.0年10.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.31218




[433]	validation_0-rmse:0.17283


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1728039493417014
train period : 1999.0年11.0月 ～ 2009.0年10.0月
valid period : 2009.0年10.0月 ～ 2009.0年10.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.39651




[257]	validation_0-rmse:0.09048


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.09038965427374089
train period : 1999.0年11.0月 ～ 2009.0年10.0月
valid period : 2009.0年10.0月 ～ 2009.0年12.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41648




[310]	validation_0-rmse:0.15247


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.15218665096463577
train period : 1999.0年11.0月 ～ 2009.0年12.0月
valid period : 2009.0年12.0月 ～ 2010.0年8.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.43200
[184]	validation_0-rmse:0.13428


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1338792227524296
train period : 1999.0年11.0月 ～ 2010.0年8.0月
valid period : 2010.0年8.0月 ～ 2010.0年9.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.40943
[249]	validation_0-rmse:0.11617


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.11564140297218228
train period : 1999.0年11.0月 ～ 2010.0年9.0月
valid period : 2010.0年9.0月 ～ 2010.0年12.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.42281
[140]	validation_0-rmse:0.10969


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.10694103447638914
train period : 1999.0年11.0月 ～ 2010.0年12.0月
valid period : 2010.0年12.0月 ～ 2011.0年4.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.38890
[221]	validation_0-rmse:0.17544


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.17506661070996504
train period : 1999.0年11.0月 ～ 2011.0年4.0月
valid period : 2011.0年4.0月 ～ 2011.0年7.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41365
[133]	validation_0-rmse:0.05978


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.05940965376038468
train period : 1999.0年11.0月 ～ 2011.0年7.0月
valid period : 2011.0年7.0月 ～ 2011.0年9.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.42136
[137]	validation_0-rmse:0.16236


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1614671492785044
train period : 1999.0年11.0月 ～ 2011.0年9.0月
valid period : 2011.0年9.0月 ～ 2011.0年10.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41792
[682]	validation_0-rmse:0.05894


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.058940866510295696
train period : 1999.0年11.0月 ～ 2011.0年10.0月
valid period : 2011.0年10.0月 ～ 2019.0年8.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.37241
[123]	validation_0-rmse:0.24720


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.24678910832132295
train period : 1999.0年11.0月 ～ 2019.0年8.0月
valid period : 2019.0年8.0月 ～ 2019.0年9.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.39263
[113]	validation_0-rmse:0.22992


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.22527020307242954
train period : 1999.0年11.0月 ～ 2019.0年9.0月
valid period : 2019.0年9.0月 ～ 2019.0年10.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.40858
[147]	validation_0-rmse:0.27940


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.27785122845829013
train period : 1999.0年11.0月 ～ 2019.0年10.0月
valid period : 2019.0年10.0月 ～ 2019.0年10.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.40282
[963]	validation_0-rmse:0.19419


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.19413855942600433
train period : 1999.0年11.0月 ～ 2019.0年10.0月
valid period : 2019.0年10.0月 ～ 2020.0年2.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.37972
[346]	validation_0-rmse:0.18774


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.18742326677587698
RMSE 0.16917817478746397

###############  SEED : 42  ###############
train period : 1999.0年11.0月 ～ 2009.0年4.0月
valid period : 2009.0年4.0月 ～ 2009.0年10.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.31075




[437]	validation_0-rmse:0.16961


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1695953691727734
train period : 1999.0年11.0月 ～ 2009.0年10.0月
valid period : 2009.0年10.0月 ～ 2009.0年10.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.39639




[729]	validation_0-rmse:0.08853


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.08851287451899184
train period : 1999.0年11.0月 ～ 2009.0年10.0月
valid period : 2009.0年10.0月 ～ 2009.0年12.0月
Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41785




[290]	validation_0-rmse:0.18575


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.18545182232808474
train period : 1999.0年11.0月 ～ 2009.0年12.0月
valid period : 2009.0年12.0月 ～ 2010.0年8.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.43071
[138]	validation_0-rmse:0.13116


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.13066413338178234
train period : 1999.0年11.0月 ～ 2010.0年8.0月
valid period : 2010.0年8.0月 ～ 2010.0年9.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.40926
[196]	validation_0-rmse:0.11582


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.11568538847466375
train period : 1999.0年11.0月 ～ 2010.0年9.0月
valid period : 2010.0年9.0月 ～ 2010.0年12.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.42283
[143]	validation_0-rmse:0.11643


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.11639866598007483
train period : 1999.0年11.0月 ～ 2010.0年12.0月
valid period : 2010.0年12.0月 ～ 2011.0年4.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.38911
[134]	validation_0-rmse:0.17552


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.17410278848495747
train period : 1999.0年11.0月 ～ 2011.0年4.0月
valid period : 2011.0年4.0月 ～ 2011.0年7.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.41433
[151]	validation_0-rmse:0.05975


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.05956585076258828
train period : 1999.0年11.0月 ～ 2011.0年7.0月
valid period : 2011.0年7.0月 ～ 2011.0年9.0月




Parameters: { "metric" } are not used.

[0]	validation_0-rmse:0.42140
[238]	validation_0-rmse:0.17203


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.1717045695943638
train period : 1999.0年11.0月 ～ 2011.0年9.0月
valid period : 2011.0年9.0月 ～ 2011.0年10.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.41655
[833]	validation_0-rmse:0.05904


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.059011874106492765
train period : 1999.0年11.0月 ～ 2011.0年10.0月
valid period : 2011.0年10.0月 ～ 2019.0年8.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.37174
[110]	validation_0-rmse:0.25720


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.2521986291564385
train period : 1999.0年11.0月 ～ 2019.0年8.0月
valid period : 2019.0年8.0月 ～ 2019.0年9.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.39244
[102]	validation_0-rmse:0.23702


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.23328301246686703
train period : 1999.0年11.0月 ～ 2019.0年9.0月
valid period : 2019.0年9.0月 ～ 2019.0年10.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.40926
[197]	validation_0-rmse:0.27482


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_folds[f"pred"] = clf.predict(valid_X)


RMSE 0.27443298970078744
train period : 1999.0年11.0月 ～ 2019.0年10.0月
valid period : 2019.0年10.0月 ～ 2019.0年10.0月
Parameters: { "metric" } are not used.





[0]	validation_0-rmse:0.40108


KeyboardInterrupt: ignored

In [None]:
cover_max = train.cover.max()
_oof = oof_df.copy()
_oof["pred"] = np.mean(oof_preds, axis=0)
_oof["pred"] = _oof["pred"].apply(lambda x: x if x>0 else 0)
_oof["pred"] = _oof["pred"].apply(lambda x: x if x<1 else cover_max)
get_score(_oof["cover"], _oof["pred"])

In [None]:
plt.hist(_oof.pred, label='pred', bins=25, edgecolor='white', alpha=.6)
plt.hist(_oof.cover, label="label", bins=25, edgecolor='white', alpha=.6)
plt.xlabel("Pred")
plt.legend()
plt.show()

In [None]:
pred_df = test.copy()
pred_df["pred"] = np.mean(preds, axis=1)

pred_df = pred_df.sort_index()
pred_df

In [None]:
pred_df["pred"] = pred_df["pred"].apply(lambda x: x if x>0 else 0)
pred_df["pred"] = pred_df["pred"].apply(lambda x: x if x<1 else cover_max)
pred_df[["pred"]].to_csv(os.path.join(OUTPUT_DIR, f'submit_{CFG.model}_SEEDAVERAGE_ver{CFG.ver}.csv'), header=False)

plt.hist(pred_df.pred,bins=25,edgecolor='white')
plt.xlabel("Pred")
plt.show()

display(pred_df)