In [9]:
# SIGNATE Student Cup 2021春【予測部門】における解放として有力なkNNと、特長量エンジニアリングについて、以下の記事を参考に学んでいく。
# https://signate.jp/competitions/449/discussions/pseudo-labeling-lb06630

In [10]:
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

# ワーニングの出力をなくして表示をシンプルにするもの？
# warnings.simplefilter('ignore', pd.core.common.SettingWithCopyWarning)
warnings.simplefilter('ignore', UserWarning)

# 今回の分類数。ジャンルの数。
N_CLASSES = 11

# ファイルのインポート
INPUT = Path("input")
df_train = pd.read_csv(INPUT / "train.csv")
df_test = pd.read_csv(INPUT / "test.csv")
df_sample_sub = pd.read_csv(INPUT / "sample_submit.csv", header=None)
df_sample_sub.columns = ["index", "genre"]
df_genre_labels = pd.read_csv(INPUT / "genre_labels.csv")


# trainデータとtestデータをミックスしている。
# 特長量エンジニアリングを別々にやるのは面倒なので、ここで合体させて、学習時に分けるようにしていると思われる。賢い。
def merge_train_test(df_train, df_test):
    if "genre" not in df_test.columns.tolist():
        df_test["genre"] = -100
    res = pd.concat([df_train, df_test])
    res.reset_index(inplace=True, drop=True)
    return res

# 合体させたときにgenre=-100にしているので、それを起点にtrainデータとtestデータを分ける。
def split_train_test(df):
    df_train = df[df["genre"] != -100]
    df_test = df[df["genre"] == -100]
    df_train.reset_index(inplace=True, drop=True)
    df_test.reset_index(inplace=True, drop=True)
    return df_train, df_test


# parameters

# def lgb_metric(preds, data):  
#     pred_labels = preds.reshape(N_CLASSES, -1).argmax(axis=0)
#     score = f1_score(data.get_label(), pred_labels, average="macro")
#     return "macro_f1", score, True

# 以下、lightGBMの学習時のパラメータ定義
learning_rate = 0.01

lgb_params = {
    "objective": "multiclass",
    "num_class": N_CLASSES,
    #"metric": "None",
    "learning_rate": learning_rate,
    "num_leaves": 3,
    "min_data_in_leaf": 40,
    #"colsample_bytree": 1.0,
    #"feature_fraction": 1.0,
    #"bagging_freq": 0,
    #"bagging_fraction": 1.0,
    "verbosity": 0,
    "seed": 42,
}

#k近傍法のグループ数?
knn_n_neighbors = 6


# parameters - knn feature weights
# 以下、kNNの学習時のパラメータ定義。regionはラベルエンコーディング、その他の量的変数は標準化、それ以外にnanの数も特徴量として使っている。
knn_features = [
   'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
   'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
   'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
   'region_S', 'region_T', 'region_unknown',
   'standardscaled_popularity', 'standardscaled_duration_ms',
   'standardscaled_acousticness', 'standardscaled_positiveness',
   'standardscaled_danceability', 'standardscaled_loudness',
   'standardscaled_energy', 'standardscaled_liveness',
   'standardscaled_speechiness', 'standardscaled_instrumentalness',
   'standardscaled_log_tempo', 'standardscaled_num_nans'
]

dict_feature_weights = {}

# regionの特長量重みを設定。
# 以下のように重みを決めていくことができるらしい。
# 重みですが、最初に大まかに当たりをつけ(regionは違うものが近傍データとして選ばれないくらい大きめ、popularityも他の特徴より大きめなど)、
# 交差検証での精度を見ながら細かい調整をしています。
for col in [
    'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
    'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
    'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
    'region_S', 'region_T', 'region_unknown'
]:
    dict_feature_weights[col] = 100.0

for col in [
    'standardscaled_duration_ms',
    'standardscaled_acousticness', 'standardscaled_positiveness',
    'standardscaled_danceability', 'standardscaled_loudness',
    'standardscaled_energy', 'standardscaled_liveness',
    'standardscaled_speechiness', 'standardscaled_instrumentalness'
]:
    dict_feature_weights[col] = 1.0

# popularityは他の特徴よりも大きめに設定。
dict_feature_weights["standardscaled_popularity"] = 8.0
dict_feature_weights["standardscaled_log_tempo"] = 0.001
dict_feature_weights["standardscaled_num_nans"] = 100.0

# k近傍法の特徴量重みをnumpy配列に変換。
knn_feature_weights = np.array([dict_feature_weights[col] for col in knn_features])
print(knn_feature_weights)

# train,testデータに対して特長量エンジニアリングを行うため、合体させる。
df_main = merge_train_test(df_train, df_test)

[1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02
 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02 1.e+02
 1.e+02 8.e+00 1.e+00 1.e+00 1.e+00 1.e+00 1.e+00 1.e+00 1.e+00 1.e+00
 1.e+00 1.e-03 1.e+02]


In [11]:
# 動作検証用のおためし。
df = df_main.copy()
df["genre_name"] = df["genre"].map(dict(df_genre_labels[["labels", "genre"]].values))
df["tempo"] = df["tempo"].map(lambda x: sum(map(int, x.split("-"))) / 2)
df = pd.concat([df, pd.get_dummies(df["region"]).rename(columns={"unknown": "region_unknown"})], axis=1)
df.head()

Unnamed: 0,index,genre,popularity,duration_ms,acousticness,positiveness,danceability,loudness,energy,liveness,...,region_L,region_M,region_N,region_O,region_P,region_Q,region_R,region_S,region_T,region_unknown
0,0,10,11,201094,0.112811,0.157247,0.187841,-1.884852,0.893918,0.363568,...,False,False,False,False,False,False,False,False,False,False
1,1,8,69,308493,0.101333,0.346563,0.554444,-5.546495,0.874409,0.193892,...,False,False,False,False,False,False,False,False,False,False
2,2,3,43,197225,0.49642,0.265391,0.457642,-9.25567,0.439933,0.217146,...,False,False,False,False,False,False,False,False,False,False
3,3,10,45,301092,0.165667,0.245533,0.356578,-5.088788,0.868704,0.377025,...,False,False,False,False,False,False,False,False,False,False
4,4,3,57,277348,0.19072,0.777578,0.830479,-3.933896,0.650149,0.169323,...,False,False,False,False,False,False,False,False,False,True


In [12]:
for pseudo_labeling_threshold in [0.95, 0.925, 0.9, 0.875, 0.85, -np.inf]:
    #疑似ラベルを使って学習する際の閾値を設定。confidenceがこの閾値を超える場合に、そのラベルを使って学習する。
    df = df_main.copy()
    
    
    # feature engineering
    # genreのラベル番号からジャンル名に変換している。
    df["genre_name"] = df["genre"].map(dict(df_genre_labels[["labels", "genre"]].values))
    # tempoの値を平均値に変換している。
    df["tempo"] = df["tempo"].map(lambda x: sum(map(int, x.split("-"))) / 2)
    # regionのone-hotエンコーディング、ついでにunknownの列名を変更している。
    df = pd.concat([df, pd.get_dummies(df["region"]).rename(columns={"unknown": "region_unknown"})], axis=1)


    #===================ここまで確認完了。
    df["num_nans"] = 0
    for col in [
        "acousticness",
        "positiveness",
        "danceability",
        "energy",
        "liveness",
        "speechiness",
        "instrumentalness",
    ]:
        df["num_nans"] += df[col].isna()

    class CountEncoder:
        def fit(self, series):
            # "series"ごとの出現回数をカウント
            self.counts = series.groupby(series).count()
            return self

        def transform(self, series):
            # "series"の各要素を、fit時に集計した出現回数に置換
            return series.map(self.counts).fillna(0)

        def fit_transform(self, series):
            # fitとtransformをまとめて実行
            return self.fit(series).transform(series)
    columns_count_enc = ["region"]
    for col in columns_count_enc:
        # regionの出現回数をカウントして、新しい列を作成している。
        df["countenc_" + col] = CountEncoder().fit_transform(df[col])
        # 欠損値をnanに変換している。locはnanの行を取得している。
        df.loc[df[col].isna().values, "countenc_" + col] = np.nan


    columns_label_enc = ["region"]
    for col in columns_count_enc:
        # regionのラベルエンコーディングを行っている。
        # ラベルエンコーディングとは、カテゴリ変数を数値に変換すること。
        df["labelenc_" + col] = LabelEncoder().fit_transform(df[col])
        df.loc[df[col].isna().values, "labelenc_" + col] = np.nan

    class GroupFeatureExtractor:  # 参考: https://signate.jp/competitions/449/discussions/lgbm-baseline-lb06240
        EX_TRANS_METHODS = ["deviation", "zscore"]

        def __init__(self, group_key, group_values, agg_methods):
            self.group_key = group_key
            self.group_values = group_values

            self.ex_trans_methods = [m for m in agg_methods if m in self.EX_TRANS_METHODS]
            self.agg_methods = [m for m in agg_methods if m not in self.ex_trans_methods]
            self.df_agg = None

        def fit(self, df_train, y=None):
            """

            """
            if not self.agg_methods:
                return
            dfs = []
            for agg_method in self.agg_methods:
                if callable(agg_method):
                    agg_method_name = agg_method.__name__
                else:
                    agg_method_name = agg_method
                # group by key
                df_agg = (df_train[[self.group_key] + self.group_values].groupby(self.group_key).agg(agg_method))
                df_agg.columns = self._get_column_names(agg_method_name)
                dfs.append(df_agg)
            self.df_agg = pd.concat(dfs, axis=1).reset_index()

        def transform(self, df_eval):
            """
            Transform evaluation data.
            """
            key = self.group_key
            if self.agg_methods:
                df_features = pd.merge(df_eval[[self.group_key]], self.df_agg, on=self.group_key, how="left")
            else:
                df_features = df_eval[[self.group_key]].copy()
            if self.ex_trans_methods:
                if "deviation" in self.ex_trans_methods:
                    # deviationとは、平均からの差分。X - X.mean()
                    df_features[self._get_agg_column_names("deviation")] = df_eval[self.group_values] - df_eval[[key]+self.group_values].groupby(key).transform("mean")
                if "zscore" in self.ex_trans_methods:
                    # z-scoreとは、平均を引いて標準偏差で割ること。つまりは標準化。
                    df_features[self._get_column_names("zscore")] = (df_eval[self.group_values] - df_eval[[key]+self.group_values].groupby(key).transform("mean")) \
                                                                    / (df_eval[[key]+self.group_values].groupby(key).transform("std") + 1e-8)
            df_features.drop(self.group_key, axis=1, inplace=True)
            return df_features

        def _get_column_names(self, method):
            return [f"agg_{method}_{col}_grpby_{self.group_key}" for col in self.group_values]

        def fit_transform(self, df_train, y=None):
            self.fit(df_train, y=y)
            return self.transform(df_train)   

    # tempoを対数変換し特徴量に追加している。
    df["log_tempo"] = np.log(df["tempo"])
    # カテゴリ変数の特徴量を抽出している。group_keyはregion、group_valuesは特徴量、agg_methodsは標準化。
    gfe = GroupFeatureExtractor(
        "region", 
        ['popularity', 'duration_ms', 'acousticness', 'positiveness', 'danceability', 'loudness', 'energy', 'liveness', 'speechiness', 'instrumentalness', 'log_tempo'],
        ["zscore"]
    )
    # fit_transformで特徴量を抽出している。
    df = pd.concat([df, gfe.fit_transform(df)], axis=1)
    # 一旦今のdfの結果を可視化する。
    print("add zscore features")
    print(df.head())
    print(df.info())
    print(df.describe())

    # k近傍法の特徴量を抽出するクラスを作成。
    class KNNFeatureExtractor:
        def __init__(self, n_neighbors=5):
            self.knn = KNeighborsClassifier(n_neighbors + 1)

        def fit(self, X, y):
            self.knn.fit(X, y)
            self.y = y if isinstance(y, np.ndarray) else np.array(y)
            return self

        def transform(self, X, is_train_data):
            # kneighborsを使用して距離とindexを取得する。
            distances, indexes = self.knn.kneighbors(X)
            distances = distances[:, 1:] if is_train_data else distances[:, :-1]
            indexes = indexes[:, 1:] if is_train_data else indexes[:, :-1]
            labels = self.y[indexes]
            score_columns = [f"knn_score_class{c:02d}" for c in range(N_CLASSES)]
            df_knn = pd.DataFrame(
                [np.bincount(labels_, distances_, N_CLASSES) for labels_, distances_ in zip(labels, 1.0 / distances)],
                columns=score_columns
            )
            df_knn["max_knn_scores"] = df_knn.max(1)
            for col in score_columns:
                df_knn[f"sub_max_knn_scores_{col}"] = df_knn["max_knn_scores"] - df_knn[col]
            for i, col1 in enumerate(score_columns):
                for j, col2 in enumerate(score_columns[i+1:], i+1):
                    if {i, j} & {8, 10}:
                        df_knn[f"sub_{col1}_{col2}"] = df_knn[col1] - df_knn[col2]
            df_knn["sum_knn_scores"] = df_knn.sum(1)

            return df_knn


    # feature scaling

    df["log_tempo"] = np.log(df["tempo"])
    for col in [
        'popularity', 'duration_ms', 'acousticness',
        'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
        'speechiness', 'instrumentalness', 'log_tempo', 'num_nans',
    ]:
        df["standardscaled_" + col] = StandardScaler().fit_transform(df[[col]])[:, 0]


    #　特徴量作成用にマージしたデータをtrainデータとtestデータに分ける。
    df_train, df_test = split_train_test(df)
    target = df_train["genre"]
    
    
    # train
    # N_SPLITSは交差検証の分割数、SEED_SKFは乱数のシード値。
    N_SPLITS = 15
    SEED_SKF = 42
    np.random.seed(42)

    # StratifiedKFoldでデータを分割する。
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED_SKF)
    # oofはout of foldの略。交差検証の結果を格納するための変数。
    oof = np.zeros((len(df_train), N_CLASSES))
    # 予測結果を格納するための変数。
    predictions = np.zeros((len(df_test), N_CLASSES))
    # 特徴量の重要度を格納するための変数。
    df_feature_importance = pd.DataFrame()

    # 量的変数の特徴量定義
    features_numerical = [
        'popularity', 'duration_ms', 'acousticness',
        'positiveness', 'danceability', 'loudness', 'energy', 'liveness',
        'speechiness', 'instrumentalness', 'tempo',
        'region_A', 'region_B', 'region_C', 'region_D', 'region_E', 'region_F',
        'region_G', 'region_H', 'region_I', 'region_J', 'region_K', 'region_L',
        'region_M', 'region_N', 'region_O', 'region_P', 'region_Q', 'region_R',
        'region_S', 'region_T', 'region_unknown', 'countenc_region',
        'num_nans',
        'agg_zscore_popularity_grpby_region',
        'agg_zscore_duration_ms_grpby_region',
        'agg_zscore_acousticness_grpby_region',
        'agg_zscore_positiveness_grpby_region',
        'agg_zscore_danceability_grpby_region',
        'agg_zscore_loudness_grpby_region', 'agg_zscore_energy_grpby_region',
        'agg_zscore_liveness_grpby_region',
        'agg_zscore_speechiness_grpby_region',
        'agg_zscore_instrumentalness_grpby_region',
        'agg_zscore_log_tempo_grpby_region',
        'knn_score_class00', 'knn_score_class01',
        'knn_score_class02', 'knn_score_class03', 'knn_score_class04',
        'knn_score_class05', 'knn_score_class06', 'knn_score_class07',
        'knn_score_class08', 'knn_score_class09', 'knn_score_class10',
        'max_knn_scores',
        'sub_max_knn_scores_knn_score_class00',
        'sub_max_knn_scores_knn_score_class01',
        'sub_max_knn_scores_knn_score_class02',
        'sub_max_knn_scores_knn_score_class03',
        'sub_max_knn_scores_knn_score_class04',
        'sub_max_knn_scores_knn_score_class05',
        'sub_max_knn_scores_knn_score_class06',
        'sub_max_knn_scores_knn_score_class07',
        'sub_max_knn_scores_knn_score_class08',
        'sub_max_knn_scores_knn_score_class09',
        'sub_max_knn_scores_knn_score_class10',
        'sub_knn_score_class00_knn_score_class08',
        'sub_knn_score_class00_knn_score_class10',
        'sub_knn_score_class01_knn_score_class08',
        'sub_knn_score_class01_knn_score_class10',
        'sub_knn_score_class02_knn_score_class08',
        'sub_knn_score_class02_knn_score_class10',
        'sub_knn_score_class03_knn_score_class08',
        'sub_knn_score_class03_knn_score_class10',
        'sub_knn_score_class04_knn_score_class08',
        'sub_knn_score_class04_knn_score_class10',
        'sub_knn_score_class05_knn_score_class08',
        'sub_knn_score_class05_knn_score_class10',
        'sub_knn_score_class06_knn_score_class08',
        'sub_knn_score_class06_knn_score_class10',
        'sub_knn_score_class07_knn_score_class08',
        'sub_knn_score_class07_knn_score_class10',
        'sub_knn_score_class08_knn_score_class09',
        'sub_knn_score_class08_knn_score_class10',
        'sub_knn_score_class09_knn_score_class10',
        'sum_knn_scores'
    ]
    # カテゴリカル変数の特徴量定義
    features_categorical = ["labelenc_region"]
    features = features_numerical + features_categorical

    # 交差検証の実行
    for fold_, (indexes_trn, indexes_val) in enumerate(skf.split(df_train.values, target.values)):
        print(f"------------------------------ fold {fold_} ------------------------------")

        # train dataのindexとvalidation dataのindexを取得して、reset_indexでindexを振り直している。drop=trueは元のindexを削除するため。
        df_trn = df_train.loc[indexes_trn].reset_index(drop=True)
        df_val = df_train.loc[indexes_val].reset_index(drop=True)
        target_trn = target.loc[indexes_trn].reset_index(drop=True)
        target_val = target.loc[indexes_val].reset_index(drop=True)

        # make knn features
        X = df_trn[knn_features].fillna(0.0).values * knn_feature_weights
        knn_feature_extractor = KNNFeatureExtractor(knn_n_neighbors).fit(X, target_trn)
        df_trn = pd.concat([df_trn, knn_feature_extractor.transform(X, is_train_data=True)], axis=1)
        X = df_val[knn_features].fillna(0.0).values * knn_feature_weights
        df_val = pd.concat([df_val, knn_feature_extractor.transform(X, is_train_data=False)], axis=1)
        X = df_test[knn_features].fillna(0.0).values * knn_feature_weights
        df_test_knn_features = knn_feature_extractor.transform(X, is_train_data=False)
        for col in df_test_knn_features.columns:
            df_test[col] = df_test_knn_features[col]

        lgb_train = lgb.Dataset(
            df_trn.loc[:, features],
            label=target_trn,
            feature_name=features,
            categorical_feature=features_categorical
        )
        lgb_valid = lgb.Dataset(
            df_val.loc[:, features],
            label=target_val,
            feature_name=features,
            categorical_feature=features_categorical
        )

        lgb_params["learning_rate"] = learning_rate + np.random.random() * 0.001  # おまじない
        num_round = 999999999
        verbose_eval = 0  # この数字を1にすると学習時のスコア推移がコマンドライン表示される
        evaluation_results = {} # 学習時のスコアを保存する辞書
        model = lgb.train(
            lgb_params,
            lgb_train, 
            num_round, 
            valid_sets=[lgb_train, lgb_valid], 
            callbacks=[lgb.early_stopping(stopping_rounds=50, 
                                verbose=True), # early_stopping用コールバック関数            fobj=None,            
                        lgb.log_evaluation(verbose_eval),
                        lgb.record_evaluation(evaluation_results)] # コマンドライン出力用コールバック関数
#feval=lgb_metric,
        )

        # cv
        prediction_round = model.best_iteration+150 if num_round >= 1e8 else num_round  # おまじない
        oof[indexes_val] = model.predict(df_val[features], num_iteration=prediction_round)

        # feature importance
        df_fold_importance = pd.DataFrame()
        df_fold_importance["feature"] = features
        df_fold_importance["importance"] = model.feature_importance()
        df_fold_importance["fold"] = fold_
        df_feature_importance = pd.concat([df_feature_importance, df_fold_importance], axis=0)

        # prediction for test data
        predictions += model.predict(df_test[features], num_iteration=prediction_round) / N_SPLITS
        print()

    
    score = f1_score(target, oof.argmax(1), average="macro")
    print("CV score (not reliable!)")
    print(f"  f1: {score:8.5f}")
    print()
    print(classification_report(target, oof.argmax(1)))
    
    
    df_test["prediction"] = predictions.argmax(1)
    df_test["confidence"] = predictions.max(1)
    df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)
    df = merge_train_test(df_train, df_test)
    df_main["genre"] = df_main["index"].map(dict(df[["index", "genre"]].values))
    print((df_test["confidence"] > pseudo_labeling_threshold).sum(), f"rows were filled. (confidence>{pseudo_labeling_threshold})")
    print("filled test labels:", np.bincount(df_test[df_test["genre"]!=-100]["genre"]))
    print("\n")

add zscore features
   index  genre  popularity  duration_ms  acousticness  positiveness  \
0      0     10          11       201094      0.112811      0.157247   
1      1      8          69       308493      0.101333      0.346563   
2      2      3          43       197225      0.496420      0.265391   
3      3     10          45       301092      0.165667      0.245533   
4      4      3          57       277348      0.190720      0.777578   

   danceability  loudness    energy  liveness  ...  \
0      0.187841 -1.884852  0.893918  0.363568  ...   
1      0.554444 -5.546495  0.874409  0.193892  ...   
2      0.457642 -9.255670  0.439933  0.217146  ...   
3      0.356578 -5.088788  0.868704  0.377025  ...   
4      0.830479 -3.933896  0.650149  0.169323  ...   

   agg_zscore_duration_ms_grpby_region  agg_zscore_acousticness_grpby_region  \
0                            -0.535165                             -1.014876   
1                             0.915294                        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[722]	training's multi_logloss: 0.63719	valid_1's multi_logloss: 0.866821

------------------------------ fold 1 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[957]	training's multi_logloss: 0.592877	valid_1's multi_logloss: 0.779181

------------------------------ fold 2 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[696]	training's multi_logloss: 0.639719	valid_1's multi_logloss: 0.805379

------------------------------ fold 3 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[697]	training's multi_logloss: 0.652066	valid_1's multi_logloss: 0.745833

------------------------------ fold 4 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1140]	training's multi_logloss: 0.583479	valid_1's multi_logloss: 0.763067

------------------------------ fold 5 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[548]	training's multi_logloss: 0.676812	valid_1's multi_logloss: 0.821049

------------------------------ fold 6 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[746]	training's multi_logloss: 0.648701	valid_1's multi_logloss: 0.694938

------------------------------ fold 7 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[440]	training's multi_logloss: 0.690969	valid_1's multi_logloss: 0.859752

------------------------------ fold 8 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[984]	training's multi_logloss: 0.592679	valid_1's multi_logloss: 0.882381

------------------------------ fold 9 ------------------------------
Training until validation scores don't improve for 50 rounds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Early stopping, best iteration is:
[1230]	training's multi_logloss: 0.556078	valid_1's multi_logloss: 0.859852

------------------------------ fold 10 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[483]	training's multi_logloss: 0.697557	valid_1's multi_logloss: 0.800032

------------------------------ fold 11 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[767]	training's multi_logloss: 0.624151	valid_1's multi_logloss: 0.785461

------------------------------ fold 12 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[606]	training's multi_logloss: 0.651355	valid_1's multi_logloss: 0.871401

------------------------------ fold 13 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[588]	training's multi_logloss: 0.662695	valid_1's multi_logloss: 0.844214

------------------------------ fold 14 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[541]	training's multi_logloss: 0.6804	valid_1's multi_logloss: 0.805726

CV score (not reliable!)
  f1:  0.66191

              precision    recall  f1-score   support

           0       0.83      0.62      0.71        32
           1       0.58      0.41      0.48       205
           2       0.71      0.58      0.63       191
           3       0.81      0.76      0.78       362
           4       0.71      0.60      0.65        45
           5       0.62      0.51      0.56       126
           6       0.56      0.36      0.44        50
           7       0.65      0.62      0.64       334
           8       0.72      0.80      0.75      1305
           9       0.86      0.81      0.83        59
          10       0.78      0.81      0.80      1337

    accuracy                           0.74      4046
   macro avg       0.71      0.63      0.66      4046
weighted avg       0.73      0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.argmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["confidence"] = predictions.max(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)


------------------------------ fold 0 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1006]	training's multi_logloss: 0.534335	valid_1's multi_logloss: 0.710349

------------------------------ fold 1 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[968]	training's multi_logloss: 0.539319	valid_1's multi_logloss: 0.623192

------------------------------ fold 2 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[636]	training's multi_logloss: 0.587653	valid_1's multi_logloss: 0.748541

------------------------------ fold 3 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[906]	training's multi_logloss: 0.549992	valid_1's multi_logloss: 0.762505

------------------------------ fold 4 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[587]	training's multi_logloss: 0.609805	valid_1's multi_logloss: 0.77723

------------------------------ fold 5 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[576]	training's multi_logloss: 0.609124	valid_1's multi_logloss: 0.804657

------------------------------ fold 6 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[859]	training's multi_logloss: 0.56427	valid_1's multi_logloss: 0.666346

------------------------------ fold 7 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[639]	training's multi_logloss: 0.589059	valid_1's multi_logloss: 0.772694

------------------------------ fold 8 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[724]	training's multi_logloss: 0.585806	valid_1's multi_logloss: 0.714313

------------------------------ fold 9 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[659]	training's multi_logloss: 0.585758	valid_1's multi_logloss: 0.805647

------------------------------ fold 10 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[390]	training's multi_logloss: 0.648848	valid_1's multi_logloss: 0.888005

------------------------------ fold 11 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1155]	training's multi_logloss: 0.516819	valid_1's multi_logloss: 0.749975

------------------------------ fold 12 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[690]	training's multi_logloss: 0.579084	valid_1's multi_logloss: 0.772187

------------------------------ fold 13 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1482]	training's multi_logloss: 0.487441	valid_1's multi_logloss: 0.714249

------------------------------ fold 14 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[512]	training's multi_logloss: 0.622461	valid_1's multi_logloss: 0.717506

CV score (not reliable!)
  f1:  0.67708

              precision    recall  f1-score   support

           0       0.89      0.69      0.77        35
           1       0.59      0.40      0.48       205
           2       0.72      0.55      0.63       191
           3       0.85      0.81      0.83       457
           4       0.65      0.62      0.64        45
           5       0.62      0.52      0.57       126
           6       0.60      0.36      0.45        50
           7       0.64      0.60      0.62       334
           8       0.71      0.79      0.75      1311
           9       0.90      0.90      0.90        77
          10       0.81      0.84      0.82      1622

    accuracy                           0.75      4453
   macro avg       0.73      0.64      0.68      4453
weighted avg       0.75      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.argmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["confidence"] = predictions.max(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)


471 rows were filled. (confidence>0.925)
filled test labels: [  9   4   7  69   1   5   0   0 115  15 246]


add zscore features
   index  genre  popularity  duration_ms  acousticness  positiveness  \
0      0     10          11       201094      0.112811      0.157247   
1      1      8          69       308493      0.101333      0.346563   
2      2      3          43       197225      0.496420      0.265391   
3      3     10          45       301092      0.165667      0.245533   
4      4      3          57       277348      0.190720      0.777578   

   danceability  loudness    energy  liveness  ...  \
0      0.187841 -1.884852  0.893918  0.363568  ...   
1      0.554444 -5.546495  0.874409  0.193892  ...   
2      0.457642 -9.255670  0.439933  0.217146  ...   
3      0.356578 -5.088788  0.868704  0.377025  ...   
4      0.830479 -3.933896  0.650149  0.169323  ...   

   agg_zscore_duration_ms_grpby_region  agg_zscore_acousticness_grpby_region  \
0                            -0.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1119]	training's multi_logloss: 0.476436	valid_1's multi_logloss: 0.571583

------------------------------ fold 1 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[686]	training's multi_logloss: 0.523386	valid_1's multi_logloss: 0.742099

------------------------------ fold 2 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[607]	training's multi_logloss: 0.541106	valid_1's multi_logloss: 0.68314

------------------------------ fold 3 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[893]	training's multi_logloss: 0.50238	valid_1's multi_logloss: 0.604102

------------------------------ fold 4 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[715]	training's multi_logloss: 0.528466	valid_1's multi_logloss: 0.638988

------------------------------ fold 5 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[790]	training's multi_logloss: 0.515595	valid_1's multi_logloss: 0.720571

------------------------------ fold 6 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[865]	training's multi_logloss: 0.509096	valid_1's multi_logloss: 0.650767

------------------------------ fold 7 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1010]	training's multi_logloss: 0.472292	valid_1's multi_logloss: 0.781212

------------------------------ fold 8 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[690]	training's multi_logloss: 0.534013	valid_1's multi_logloss: 0.636481

------------------------------ fold 9 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[730]	training's multi_logloss: 0.526628	valid_1's multi_logloss: 0.611721

------------------------------ fold 10 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[579]	training's multi_logloss: 0.551279	valid_1's multi_logloss: 0.683546

------------------------------ fold 11 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[614]	training's multi_logloss: 0.537412	valid_1's multi_logloss: 0.680359

------------------------------ fold 12 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[902]	training's multi_logloss: 0.496256	valid_1's multi_logloss: 0.622482

------------------------------ fold 13 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[770]	training's multi_logloss: 0.520722	valid_1's multi_logloss: 0.659927

------------------------------ fold 14 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1095]	training's multi_logloss: 0.47256	valid_1's multi_logloss: 0.712441

CV score (not reliable!)
  f1:  0.69006

              precision    recall  f1-score   support

           0       0.89      0.75      0.81        44
           1       0.55      0.41      0.47       209
           2       0.73      0.58      0.65       198
           3       0.88      0.84      0.86       526
           4       0.74      0.63      0.68        46
           5       0.63      0.54      0.58       131
           6       0.50      0.30      0.37        50
           7       0.64      0.62      0.63       334
           8       0.74      0.81      0.78      1426
           9       0.88      0.91      0.90        92
          10       0.84      0.86      0.85      1868

    accuracy                           0.78      4924
   macro avg       0.73      0.66      0.69      4924
weighted avg       0.78      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.argmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["confidence"] = predictions.max(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)


------------------------------ fold 0 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[742]	training's multi_logloss: 0.48132	valid_1's multi_logloss: 0.586785

------------------------------ fold 1 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[475]	training's multi_logloss: 0.522402	valid_1's multi_logloss: 0.576532

------------------------------ fold 2 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1474]	training's multi_logloss: 0.385917	valid_1's multi_logloss: 0.684863

------------------------------ fold 3 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1718]	training's multi_logloss: 0.376051	valid_1's multi_logloss: 0.567068

------------------------------ fold 4 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[642]	training's multi_logloss: 0.497856	valid_1's multi_logloss: 0.597804

------------------------------ fold 5 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[772]	training's multi_logloss: 0.470557	valid_1's multi_logloss: 0.694926

------------------------------ fold 6 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[718]	training's multi_logloss: 0.485146	valid_1's multi_logloss: 0.649003

------------------------------ fold 7 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[787]	training's multi_logloss: 0.465501	valid_1's multi_logloss: 0.633173

------------------------------ fold 8 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1000]	training's multi_logloss: 0.441974	valid_1's multi_logloss: 0.636662

------------------------------ fold 9 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1107]	training's multi_logloss: 0.436684	valid_1's multi_logloss: 0.55515

------------------------------ fold 10 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[503]	training's multi_logloss: 0.5268	valid_1's multi_logloss: 0.623114

------------------------------ fold 11 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[808]	training's multi_logloss: 0.467639	valid_1's multi_logloss: 0.58153

------------------------------ fold 12 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[552]	training's multi_logloss: 0.505005	valid_1's multi_logloss: 0.600629

------------------------------ fold 13 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[513]	training's multi_logloss: 0.512372	valid_1's multi_logloss: 0.713265

------------------------------ fold 14 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[859]	training's multi_logloss: 0.471883	valid_1's multi_logloss: 0.513397

CV score (not reliable!)
  f1:  0.71085

              precision    recall  f1-score   support

           0       0.86      0.81      0.84        47
           1       0.61      0.46      0.53       212
           2       0.76      0.59      0.67       217
           3       0.89      0.86      0.87       577
           4       0.71      0.56      0.63        48
           5       0.66      0.58      0.62       146
           6       0.59      0.39      0.47        51
           7       0.64      0.60      0.62       338
           8       0.77      0.83      0.80      1652
           9       0.93      0.92      0.92        97
          10       0.85      0.87      0.86      2002

    accuracy                           0.80      5387
   macro avg       0.75      0.68      0.71      5387
weighted avg       0.80      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.argmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["confidence"] = predictions.max(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)


             index        genre   popularity   duration_ms  acousticness  \
count  8092.000000  8092.000000  8092.000000  8.092000e+03   8091.000000   
mean   4045.500000   -24.098616    41.107143  2.419948e+05      0.340836   
std    2336.103522    48.997455    16.135588  8.181782e+04      0.238708   
min       0.000000  -100.000000     0.000000  5.826000e+03      0.000000   
25%    2022.750000  -100.000000    31.000000  2.031058e+05      0.147908   
50%    4045.500000     7.000000    42.000000  2.357385e+05      0.249039   
75%    6068.250000    10.000000    52.000000  2.724855e+05      0.508565   
max    8091.000000    10.000000    82.000000  2.135773e+06      1.000000   

       positiveness  danceability     loudness       energy     liveness  ...  \
count   8068.000000   8073.000000  8092.000000  8091.000000  8083.000000  ...   
mean       0.465976      0.501746    -7.676095     0.606209     0.267455  ...   
std        0.222835      0.159932     4.049943     0.200847     0.156399

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1344]	training's multi_logloss: 0.393248	valid_1's multi_logloss: 0.508318

------------------------------ fold 1 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[741]	training's multi_logloss: 0.450624	valid_1's multi_logloss: 0.594272

------------------------------ fold 2 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[814]	training's multi_logloss: 0.439455	valid_1's multi_logloss: 0.602755

------------------------------ fold 3 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[700]	training's multi_logloss: 0.459669	valid_1's multi_logloss: 0.616608

------------------------------ fold 4 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[771]	training's multi_logloss: 0.449509	valid_1's multi_logloss: 0.638252

------------------------------ fold 5 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[865]	training's multi_logloss: 0.447835	valid_1's multi_logloss: 0.462088

------------------------------ fold 6 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[511]	training's multi_logloss: 0.494532	valid_1's multi_logloss: 0.615055

------------------------------ fold 7 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[571]	training's multi_logloss: 0.472113	valid_1's multi_logloss: 0.66078

------------------------------ fold 8 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[683]	training's multi_logloss: 0.467842	valid_1's multi_logloss: 0.551284

------------------------------ fold 9 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[643]	training's multi_logloss: 0.465512	valid_1's multi_logloss: 0.647191

------------------------------ fold 10 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[692]	training's multi_logloss: 0.467814	valid_1's multi_logloss: 0.533358

------------------------------ fold 11 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[945]	training's multi_logloss: 0.42304	valid_1's multi_logloss: 0.526832

------------------------------ fold 12 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[463]	training's multi_logloss: 0.491389	valid_1's multi_logloss: 0.647131

------------------------------ fold 13 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[746]	training's multi_logloss: 0.456355	valid_1's multi_logloss: 0.56839

------------------------------ fold 14 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[758]	training's multi_logloss: 0.458632	valid_1's multi_logloss: 0.545455

CV score (not reliable!)
  f1:  0.71015

              precision    recall  f1-score   support

           0       0.83      0.78      0.80        50
           1       0.59      0.44      0.50       217
           2       0.75      0.62      0.68       222
           3       0.88      0.87      0.87       607
           4       0.70      0.58      0.63        52
           5       0.65      0.60      0.62       149
           6       0.54      0.37      0.44        51
           7       0.68      0.64      0.66       357
           8       0.80      0.85      0.83      1829
           9       0.90      0.89      0.89        98
          10       0.86      0.88      0.87      2084

    accuracy                           0.81      5716
   macro avg       0.74      0.68      0.71      5716
weighted avg       0.81      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.argmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["confidence"] = predictions.max(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)


             index        genre   popularity   duration_ms  acousticness  \
count  8092.000000  8092.000000  8092.000000  8.092000e+03   8091.000000   
mean   4045.500000   -21.024469    41.107143  2.419948e+05      0.340836   
std    2336.103522    47.495166    16.135588  8.181782e+04      0.238708   
min       0.000000  -100.000000     0.000000  5.826000e+03      0.000000   
25%    2022.750000  -100.000000    31.000000  2.031058e+05      0.147908   
50%    4045.500000     8.000000    42.000000  2.357385e+05      0.249039   
75%    6068.250000    10.000000    52.000000  2.724855e+05      0.508565   
max    8091.000000    10.000000    82.000000  2.135773e+06      1.000000   

       positiveness  danceability     loudness       energy     liveness  ...  \
count   8068.000000   8073.000000  8092.000000  8091.000000  8083.000000  ...   
mean       0.465976      0.501746    -7.676095     0.606209     0.267455  ...   
std        0.222835      0.159932     4.049943     0.200847     0.156399

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[416]	training's multi_logloss: 0.484907	valid_1's multi_logloss: 0.627067

------------------------------ fold 1 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1139]	training's multi_logloss: 0.397175	valid_1's multi_logloss: 0.494886

------------------------------ fold 2 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[488]	training's multi_logloss: 0.471083	valid_1's multi_logloss: 0.686954

------------------------------ fold 3 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[689]	training's multi_logloss: 0.440964	valid_1's multi_logloss: 0.628434

------------------------------ fold 4 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[662]	training's multi_logloss: 0.454858	valid_1's multi_logloss: 0.524864

------------------------------ fold 5 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[808]	training's multi_logloss: 0.4382	valid_1's multi_logloss: 0.576338

------------------------------ fold 6 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[757]	training's multi_logloss: 0.445885	valid_1's multi_logloss: 0.546292

------------------------------ fold 7 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[885]	training's multi_logloss: 0.419789	valid_1's multi_logloss: 0.591152

------------------------------ fold 8 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1086]	training's multi_logloss: 0.405665	valid_1's multi_logloss: 0.503334

------------------------------ fold 9 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[641]	training's multi_logloss: 0.459066	valid_1's multi_logloss: 0.49755

------------------------------ fold 10 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[849]	training's multi_logloss: 0.430552	valid_1's multi_logloss: 0.566663

------------------------------ fold 11 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[583]	training's multi_logloss: 0.46106	valid_1's multi_logloss: 0.527472

------------------------------ fold 12 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[589]	training's multi_logloss: 0.456648	valid_1's multi_logloss: 0.605377

------------------------------ fold 13 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[692]	training's multi_logloss: 0.452158	valid_1's multi_logloss: 0.489589

------------------------------ fold 14 ------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test[col] = df_test_knn_features[col]


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[684]	training's multi_logloss: 0.452547	valid_1's multi_logloss: 0.593819

CV score (not reliable!)
  f1:  0.72624

              precision    recall  f1-score   support

           0       0.87      0.78      0.82        51
           1       0.60      0.46      0.52       218
           2       0.76      0.60      0.67       226
           3       0.88      0.87      0.87       623
           4       0.75      0.64      0.69        56
           5       0.67      0.61      0.64       155
           6       0.65      0.42      0.51        53
           7       0.66      0.62      0.64       378
           8       0.81      0.87      0.84      1952
           9       0.91      0.91      0.91        98
          10       0.87      0.89      0.88      2137

    accuracy                           0.82      5947
   macro avg       0.77      0.70      0.73      5947
weighted avg       0.82      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["prediction"] = predictions.argmax(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["confidence"] = predictions.max(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["genre"] = np.where(predictions.max(1) > pseudo_labeling_threshold, predictions.argmax(1), -100)
