## Data Preparation and Wrangling

In [None]:
import numpy as np
import sympy as sy
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.datasets
import seaborn as sns
import plotly
import plotly.graph_objects as go
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.svm import SVC

In [None]:
# Read the stores, genres and goods lookup tables, in that order.
stores, genres, goods = (
    pd.read_csv(csv_path)
    for csv_path in (
        "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/stores.csv",
        "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/genres.csv",
        "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/goods.csv",
    )
)

In [None]:
# No key is given, so pandas joins on the column names the two tables share.
good_genre = goods.merge(genres)

In [None]:
# Read the training and test tables.
train_df, test_df = map(
    pd.read_csv,
    (
        "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/train.csv",
        "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/test.csv",
    ),
)

In [None]:
# Attach goods/genre attributes, then store attributes, to both tables.
# Each merge uses pandas' defaults (inner join on the shared column names).
train_df = train_df.merge(good_genre).merge(stores)
test_df = test_df.merge(good_genre).merge(stores)

In [None]:
# Parse the date strings using the two-digit-year pattern '%y-%m-%d'.
train_df = train_df.assign(
    yy_mm_dd=pd.to_datetime(train_df["yy_mm_dd"], format="%y-%m-%d")
)

In [None]:
# Index the rows by date while keeping yy_mm_dd available as a regular column.
train_df = train_df.set_index("yy_mm_dd", drop=False)

In [None]:
# Calendar month taken from the datetime index.
train_df = train_df.assign(month=train_df.index.month)

In [None]:
# Calendar year taken from the datetime index.
train_df = train_df.assign(year=train_df.index.year)

In [None]:
# Sum every numeric column per (month, year) of the datetime index.
units_sold_month = (
    train_df
    .groupby(by=[train_df.index.month, train_df.index.year])
    # numeric_only=True keeps the datetime and text columns out of the reduction;
    # recent pandas no longer drops such columns silently.
    .sum(numeric_only=True)
    # Summed "month"/"year" helper columns carry no meaning and would collide with
    # the index levels that are later named "month"/"year" and used as merge keys.
    .drop(columns=["month", "year"], errors="ignore")
)

In [None]:
units_sold_month

In [None]:
# Label the two group-by levels so they can be used as merge keys.
units_sold_month = units_sold_month.rename_axis(index=["month", "year"])

In [None]:
# The summed daily units become the monthly target column.
units_sold_month = units_sold_month.rename(
    {"units_sold_day": "units_sold_month"}, axis="columns"
)

In [None]:
units_sold_month

In [None]:
# Discard the price/id columns that were summed along with the sales figures.
units_sold_month = units_sold_month.drop(
    columns=["price", "store_id", "goods_id", "goods_genre_id", "num_month"]
)

In [None]:
units_sold_month

In [None]:
# Attach the monthly totals to every training row of the same month and year.
train_df = train_df.merge(units_sold_month, on=["month", "year"])

In [None]:
train_df

In [None]:
test_df

In [None]:
test_df.goods_genre_name.value_counts()

In [None]:
# Remove the date, price, daily-sales and identifier columns from the training frame.
train_df = train_df.drop(
    columns=[
        "yy_mm_dd", "price", "store_id", "units_sold_day", "goods_id",
        "goods_genre_id", "num_month", "year", "month",
    ]
)
# Remove the row index and identifier columns from the test frame.
test_df = test_df.drop(columns=["index", "store_id", "goods_id", "goods_genre_id"])

In [None]:
sns.set_style("darkgrid")
# Plot only the numeric target: train_df also holds text columns at this point,
# which a histogram of the whole frame cannot bin meaningfully.
fig, ax = plt.subplots()
ax.hist(train_df["units_sold_month"])
ax.set(
    title="Distribution of units_sold_month",
    xlabel="units_sold_month",
    ylabel="number of rows",
)
plt.show()

In [None]:
# Report (rows, columns) for both frames, one per line.
print(
    f"Original size of train_df: {train_df.shape}",
    f"Original size of test_df: {test_df.shape}",
    sep="\n",
)

In [None]:
# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()

# train_df.goods_name = encoder.fit_transform(train_df.goods_name)
# test_df.goods_name = encoder.transform(test_df.goods_name)

# train_df.goods_genre_name = encoder.fit_transform(train_df.goods_genre_name)
# test_df.goods_genre_name = encoder.transform(test_df.goods_genre_name)

# train_df.store_name = encoder.fit_transform(train_df.store_name)
# test_df.store_name = encoder.transform(test_df.store_name)

In [None]:
# NOTE(review): everything below is a bare string literal, not executed code, so
# goods_diff / genre_diff / store_diff are never assigned by this cell.
# If it is re-enabled: genre_diff and store_diff are masked with ~filter1;
# presumably ~filter2 and ~filter3 were intended — confirm.
"""# Get values which are not common in both train and test dataframe
filter1=test_df["goods_name"].isin(train_df["goods_name"])
filter2=test_df["goods_genre_name"].isin(train_df["goods_genre_name"])
filter3=test_df["store_name"].isin(train_df["store_name"])

# Filter for which are False
goods_diff = test_df.goods_name[~filter1]
genre_diff = test_df.goods_genre_name[~filter1]
store_diff = test_df.store_name[~filter1]

# Get distinct values
goods_diff = np.unique(goods_diff)
genre_diff = np.unique(genre_diff)
store_diff = np.unique(store_diff)"""

In [None]:
# NOTE(review): genre_diff is only "assigned" inside string literals in this notebook,
# so this cell raises NameError on a fresh kernel.
genre_diff

In [None]:
"""goods_diff = np.unique(goods_diff)
genre_diff = np.unique(genre_diff)
store_diff = np.unique(store_diff)"""

In [None]:
"""# Replace those not in common by the name, "others"
for good in test_df.goods_name:
    if good in goods_diff_list:
        test_df["goods_name"].replace(good, "others", inplace=True)

for genre in test_df.goods_genre_name:
    if genre in genre_diff_list:
        test_df["goods_genre_name"].replace(genre, "others", inplace=True)"""

In [None]:
"""test_df = test_df[test_df.goods_name!="others"]
test_df = test_df[test_df.goods_genre_name!="others"]
test_df = test_df[test_df.store_name!="others"]"""

In [None]:
# NOTE(review): y is first assigned further down (in the one-hot encoding section),
# so this cell raises NameError when the notebook is run top to bottom.
y

## Hyperparameter Search

In [None]:
import optuna.integration.lightgbm as lgb_o
from sklearn.model_selection import train_test_split
import sklearn.datasets
from sklearn.metrics import r2_score

# Hold out part of the data for validation (default split ratio, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Wrap both partitions as datasets for the Optuna LightGBM integration.
train, test = (
    lgb_o.Dataset(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

# Fixed settings handed to the tuner as its starting parameters.
params = dict(objective='regression', metric='rmse', random_seed=0)

# Train through the Optuna integration, validating on the held-out set.
gbm_o = lgb_o.train(
    params,
    train,
    valid_sets=test,
    early_stopping_rounds=100,
    verbose_eval=200,
)

# Predictions at the best iteration for both partitions.
y_train_pred, y_test_pred = (
    gbm_o.predict(part, num_iteration=gbm_o.best_iteration)
    for part in (X_train, X_test)
)

best_params = gbm_o.params
print("  Params: ")
for key, value in best_params.items():
    print(f"    {key}: {value}")
    

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# Imported as lgb_o (the alias the step-wise tuning cell uses) so the notebook-wide
# `lgb` alias for plain lightgbm is not silently rebound.
import optuna.integration.lightgbm as lgb_o

# The second unpacked name must be X_test; the original line assigned the
# validation features to y_test and then immediately overwrote them.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Set data for lightgbm
train = lgb_o.Dataset(X_train, y_train)

# `params` is the dictionary defined in the preceding tuning cell.
tuner = lgb_o.LightGBMTunerCV(
    params,
    train,
    verbose_eval=100,
    early_stopping_rounds=100,
    folds=KFold(n_splits=3),
)

# Search for the hyperparameters
tuner.run()

# Show the best parameters
best_params = tuner.best_params
print(" Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

## One-Hot Encoding for XGBoost

In [None]:
train_df

In [None]:
# Get the column values which exist both in train and test dataset


# Keep training rows whose goods, genre and store names all occur in the test set.
filter1, filter2, filter3 = (
    train_df[name].isin(test_df[name])
    for name in ("goods_name", "goods_genre_name", "store_name")
)
train_df = train_df.loc[filter1 & filter2 & filter3]

# Then keep test rows whose names all occur in the already-filtered training set.
filter1, filter2, filter3 = (
    test_df[name].isin(train_df[name])
    for name in ("goods_name", "goods_genre_name", "store_name")
)
test_df = test_df.loc[filter1 & filter2 & filter3]
#genre_diff = train_df.goods_genre_name[filter2]
#store_diff = train_df.store_name[filter3]

# goods_diff = np.unique(goods_diff)
# genre_diff = np.unique(genre_diff)
# store_diff = np.unique(store_diff)

# Replace those not in common by the name, "others"

# for good in train_df.goods_name:
#     if good in goods_diff:
#         train_df["goods_name"].replace(good, "others", inplace=True)

# for genre in test_df.goods_genre_name:
#     if genre in genre_diff:
#         train_df["goods_genre_name"].replace(genre, "others", inplace=True)
        
# for store in train_df.store_name:
#     if store in store_diff:
#         train_df["store"].replace(store, "others", inplace=True)
        
# train_df = train_df[train_df.goods_genre_name!="others"]
# train_df = train_df[train_df.store_name!="others"]

In [None]:
train_df

In [None]:
test_df

In [None]:
train_df = pd.get_dummies(train_df, sparse=True)

In [None]:
test_df = pd.get_dummies(test_df, sparse=True)

In [None]:
# yy_mm_dd is already removed by the earlier column clean-up, so tolerate its absence
# instead of raising KeyError when the notebook is run top to bottom.
train_df = train_df.drop(columns="yy_mm_dd", errors="ignore")

In [None]:
output=train_df.pop("units_sold_month")

In [None]:
train_df = pd.concat([train_df, output], axis=1)

In [None]:
# NOTE(review): train_df still contains the units_sold_month column at this point
# (it is re-attached by the concat above), so X includes the target that y holds.
# Confirm whether any model fitted on X should see that column.
X, y = train_df, output

In [None]:
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
y_train

In [None]:
#!pip install xgboost

## Hyperparameter for XGBoost

In [None]:
import xgboost  as xgb

import optuna

def objective(trial):
    """Optuna objective: fit an XGBoost regressor and return its validation RMSE.

    Reads the notebook-level X_train / X_test / y_train / y_test split.

    Args:
        trial: optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the fitted model on (X_test, y_test); lower is better.
    """
    params = {
        # XGBoost's squared-error regression objective ('regression' is not one
        # of XGBoost's objective names).
        'objective': 'reg:squarederror',
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 0.9, 0.1),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
        'random_state': 0,
        'tree_method': 'gpu_hist'
    }

    # X is built from train_df, which still carries the target column; keep it out
    # of the features so the reported RMSE is not the result of target leakage.
    features_train = X_train.drop(columns="units_sold_month", errors="ignore")
    features_valid = X_test.drop(columns="units_sold_month", errors="ignore")

    model = xgb.XGBRegressor(**params, n_estimators=10000)
    model.fit(
        features_train,
        y_train,
        early_stopping_rounds=100,
        eval_metric='rmse',
        eval_set=[(features_valid, y_test)],
    )
    y_pred = model.predict(features_valid)
    # mean_squared_error comes from the import cell at the top of the notebook.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse}")

    return rmse

study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective, n_trials=100)

## Hyperparameter for CatBoost

In [None]:
!pip install catb

In [None]:
import catboost as cb
import numpy as np

import optuna

def objective(trial):
    """Optuna objective: fit a CatBoost regressor and return its validation RMSE.

    Reads the notebook-level X_train / X_test / y_train / y_test split.
    NOTE: this definition replaces the XGBoost `objective` defined earlier.

    Args:
        trial: optuna trial used to sample the hyper-parameters.

    Returns:
        RMSE of the fitted model on (X_test, y_test); lower is better.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 50, 300),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.001, 0.5),
        "random_strength": trial.suggest_int("random_strength", 0, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.01, 100.00
        ),
        "od_type": trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        "od_wait": trial.suggest_int("od_wait", 10, 50),
        "verbose": True,
        "task_type": "GPU"
    }

    # X is built from train_df, which still carries the target column; keep it out
    # of the features so the reported RMSE is not the result of target leakage.
    features_train = X_train.drop(columns="units_sold_month", errors="ignore")
    features_valid = X_test.drop(columns="units_sold_month", errors="ignore")

    # Positional indices of every non-float feature column. The builtin `float` is
    # what the removed `np.float` alias pointed to.
    categorical_cat = np.where(features_train.dtypes != float)[0]

    gbm = cb.CatBoostRegressor(**params)
    gbm = gbm.fit(
        features_train,
        y_train,
        cat_features=categorical_cat,
        eval_set=[(features_valid, y_test)],
        verbose=0,
        early_stopping_rounds=100,
        plot=True,
    )

    y_pred = gbm.predict(features_valid)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse}")
    return rmse

In [None]:
# Minimise the objective's RMSE: at most 100 trials, with a 300 s timeout.
study_cat = optuna.create_study(direction="minimize")
study_cat.optimize(objective, n_trials=100, timeout=300)

print(f"Number of finished trials: {len(study_cat.trials)}")
print("Best trial:")
trial = study_cat.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
categorical_cat = train_df.columns.values
categorical_cat

In [None]:
# `objective` returns an RMSE, so the study must minimise it; maximising would
# select the worst-scoring hyper-parameters.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

In [None]:
# Best parameters, best objective value and the full best-trial record, one per line.
print(study.best_params, study.best_value, study.best_trial, sep="\n")

In [None]:
# NOTE(review): objective_with_prune is not defined anywhere in this notebook, so this
# cell raises NameError on a fresh kernel. It also rebinds `study`, which the
# plotting cells below read.
study = optuna.create_study(pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective_with_prune, n_trials=50)

In [None]:
study_cat.best_trial.params

In [None]:
trials_df = study_cat.trials_dataframe()
trials_df

In [None]:
# Each blue dot is the score of one trial; the orange line shows the best score so far.
# Note that not every trial has a blue dot: pruning is turned on, so many trials are stopped before they produce a final objective value.
#optuna.visualization.plot_optimization_history(study_cat)

In [None]:
#optuna.visualization.plot_intermediate_values(study_cat)

In [None]:
#optuna.visualization.plot_slice(study_cat)

In [None]:
optuna.visualization.plot_contour(study)

In [None]:
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
#
best_params

## Train the model(LightGBM)

In [None]:
# Convert every column of the training frame to the pandas "category" dtype.
train_df = train_df.astype("category")

In [None]:
# Caution: the DataFrame does not need to be converted to a NumPy array.
# NOTE(review): train_df still contains the units_sold_month column here, so X
# includes the target that y holds. Confirm before trusting scores from models fitted on X.
X, y = train_df, output

In [None]:
X_train,  X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
X_test.shape[0]/X_train.shape[0]

In [None]:
# from sklearn.ensemble import BaggingRegressor
    
# """ Bagging meta-estimator is an ensembling algorithm that can be used for
#     both classification (BaggingClassifier) and regression (BaggingRegressor) problems. 
#     It follows the typical bagging technique to make predictions. 
#     Following are the steps for the bagging meta-estimator algorithm:

#     1. Random subsets are created from the original dataset (Bootstrapping).
#     2. The subset of the dataset includes all features.
#     3. A user-specified base estimator is fitted on each of these smaller sets.
#     4. Predictions from each model are combined to get the final result.
#     """

# bar = BaggingRegressor(tree.DecisionTreeRegressor(random_state=1))


# from sklearn.ensemble import AdaBoostingRegressor
# """ Adaptive boosting or AdaBoost is one of the simplest boosting algorithms. 
#     Usually, decision trees are used for modelling. 
#     Multiple sequential models are created, each correcting the errors from the last model. 
#     AdaBoost assigns weights to the observations which are incorrectly predicted and 
#     the subsequent model works to predict these values correctly.
#     """

# abr = AdaBoostRegressor()


# from sklearn.ensemble import GradientBoostingRegressor
# """ Gradient Boosting or GBM is another ensemble machine learning algorithm that works for
#     both regression and classification problems. GBM uses the boosting technique, 
#     combining a number of weak learners to form a strong learner. Regression trees used as a base learner, 
#     each subsequent tree in series is built on the errors calculated by the previous tree."""

# gbr = GradientBoostingRegressor()


# from xgboost import XGBRegressor
# """ XGBoost (extreme Gradient Boosting) is an advanced implementation of the gradient boosting algorithm.
#     XGBoost has proved to be a highly effective ML algorithm, extensively used in machine learning competitions and 
#     hackathons. XGBoost has high predictive power and is almost 10 times faster than the other gradient boosting techniques. It also includes a variety of regularization which reduces overfitting and improves overall performance. 
#     Hence it is also known as ‘regularized boosting‘ technique.
#     """

# xgb = XGBRegressor()


from catboost import CatBoostRegressor

# CatBoost regressor with fixed hyper-parameters
# (presumably copied from a tuning run — confirm).
cat = CatBoostRegressor(
    iterations=179,
    depth=10,
    learning_rate=0.3291221497178803,
    random_strength=47,
    bagging_temperature=60.918745770482076,
    od_type='IncToDec',
    od_wait=20,
)

# Positional indices of every non-float column of train_df, later passed to CatBoost
# as categorical features. The builtin `float` is what the removed `np.float`
# alias pointed to.
categorical_cat = np.where(train_df.dtypes != float)[0]


import lightgbm as lgb
from lightgbm import LGBMRegressor
# Light GBM beats all the other algorithms when the dataset is extremely large.
# Compared to the other algorithms, Light GBM takes lesser time to run on a huge dataset.
# LightGBM is a gradient boosting framework that uses tree-based algorithms and follows
# a leaf-wise approach, while other algorithms work in a level-wise approach pattern.

categorical_list = ['goods_name', 'goods_genre_name', 'store_name']
# NOTE(review): from here on `lgb` is the regressor instance, not the lightgbm module
# imported just above; module-level calls such as lgb.Dataset stop working until
# lightgbm is imported again.
lgb = LGBMRegressor(objective='regression',
                    metric='rmse',
                    random_seed=0,
                    feature_pre_filter=False,
                    lambda_l1=2.377982329588689e-07,
                    lambda_l2=1.2825820088020978e-08,
                    num_leaves=256,
                    feature_fraction=0.6,
                    bagging_fraction=0.7471801931739468,
                    bagging_freq=7,
                    min_child_samples=5,
                    categorical_features=categorical_list)
#y_pred = lgb.predict(X_test)


In [None]:
# bar.fit(X, y) # Bagging
# abr.fit(X, y) # AdaBoost
# gbr.fit(X, y) # GradientBoost
# xgb.fit(X, y) # XGBoost
cat.fit(X, y, cat_features=categorical_cat) # CatBoost
lgb.fit(X, y, categorical_feature=categorical_list) # LightGBM

In [None]:
# One row per train_df column holding CatBoost's importance score for it.
importance = pd.DataFrame(
    {"importance": cat.feature_importances_},
    index=train_df.columns,
)
display(importance)

In [None]:
importance.plot.barh()

In [None]:
# One row per train_df column holding LightGBM's importance score for it.
importance = pd.DataFrame(
    {"importance": lgb.feature_importances_},
    index=train_df.columns,
)
display(importance)

In [None]:
importance.plot.barh()

In [None]:
# pred_dict ={}
# for i, booster in enumerate(boosters):
#     y_pred = booster.predict(X_test, num_iteration=best_iteration)
#     pred_dict.setdefault(i, y_pred)
#     rmse = np.sqrt(MSE(y_test, y_pred))
#     print(f"RMSE: {rmse}")

In [None]:
# NOTE(review): in cell order `lgb` is the LGBMRegressor instance at this point (no
# Dataset attribute) and lgb_train is only created in the cross-validation cell at
# the end of the notebook, so this cell fails on a top-to-bottom run.
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [None]:
# NOTE(review): `regressor` is not defined anywhere in this notebook; presumably
# one of the fitted models (cat / lgb) was meant — confirm.
y_pred = regressor.predict(X_test)

In [None]:
y_pred

In [None]:
from sklearn.metrics import mean_squared_error as MSE

In [None]:
y_pred = y_pred

In [None]:
y_test

In [None]:
rmse = np.sqrt(MSE(y_test, y_pred))
print(f"RMSE: {rmse}")

In [None]:
# NOTE(review): `regressor` is not defined anywhere in this notebook. The index also
# uses test_df.columns, whereas the other importance tables use train_df.columns —
# confirm which model and which column set are intended.
importance = pd.DataFrame(regressor.feature_importances_, index=test_df.columns, columns=['importance'])
display(importance)

In [None]:
sns.set_style("darkgrid")
importance.plot.barh()

## Auto-Gluon

In [None]:
import autogluon as ag
from autogluon import TabularPrediction as task

In [None]:
X_train

In [None]:
train_data = task.Dataset(X_train)

In [None]:
train_data

In [None]:
# Directory in which AutoGluon stores the fitted models. Named model_dir rather
# than `dir` so the builtin dir() is not shadowed for the rest of the session.
model_dir = "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/model"
predictor = task.fit(
    train_data=train_data,
    label="units_sold_month",
    output_directory=model_dir,
    problem_type="regression",
    eval_metric="root_mean_squared_error",
    AG_args_fit={"use_gpu": True},
)

In [None]:
predictor.fit_summary()

In [None]:
# Predict the hold-out rows and score them against the true targets.
y_pred = predictor.predict(X_test)
print("Predictions: {}".format(y_pred))
perf = predictor.evaluate_predictions(
    y_true=y_test,
    y_pred=y_pred,
    auxiliary_metrics=True,
)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim.
y_pred = predictor.predict(X_test)
print(f"Predictions: {y_pred}")
perf = predictor.evaluate_predictions(y_true = y_test, y_pred = y_pred, auxiliary_metrics = True)

In [None]:
predictor.feature_importance(X)

In [None]:
predictor.get_model_best()

In [None]:
result = predictor.predict(test_df)

## Predict test set

In [None]:
# Convert every column of the test frame to the pandas "category" dtype.
test_df = test_df.astype("category")

In [None]:
test_df

In [None]:
# Predict with CatBoost and LightGBM, then take the equal-weight mean of the two.
cat_pred, lgb_pred = (model.predict(test_df) for model in (cat, lgb))
cat_lgb = 0.5 * (cat_pred + lgb_pred)

In [None]:
pd.DataFrame({"cat":cat_pred, "lgb":lgb_pred, "cat_lgb": cat_lgb})

In [None]:
# Collect one prediction array per cross-validation booster, keyed by fold position.
pred_dict = {}
for i, booster in enumerate(boosters):
    pred_dict[i] = result = booster.predict(test_df, num_iteration=best_iteration)

In [None]:
pred_dict

In [None]:
# Average the per-booster predictions. The divisor must be the number of boosters;
# dividing by 1 returned the sum of all fold predictions instead of their mean.
if not pred_dict:
    raise ValueError("pred_dict is empty; run the per-booster prediction cell first")
result = sum(pred_dict.values()) / len(pred_dict)

In [None]:
result

In [None]:
# Load the sample submission (first column as index) and fill in the predictions.
submission = pd.read_csv(
    "C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/sample_submission.csv",
    index_col=0,
).assign(units_sold_month=result)

In [None]:
submission

In [None]:
submission.to_csv("C:/Users/daisu/OneDrive/Desktop/GCI/drive-download-20201224T003704Z-001/submission_Auto_Gluon2.csv")

In [None]:
class ModelExtractionCallback(object):
    """Callback that remembers the model handed to it through the callback environment.

    Pass an instance in a `callbacks` list; once it has been invoked, the stored
    model and its `boosters` / `best_iteration` attributes are exposed as properties.
    """

    def __init__(self):
        # Stays None until the callback has been invoked at least once.
        self._model = None

    def __call__(self, env):
        # Keep a reference to the model carried by the callback environment
        # (the original note names it as the _CVBooster).
        self._model = env.model

    def _assert_called_cb(self):
        """Raise RuntimeError when the callback has not been invoked yet."""
        if self._model is not None:
            return
        raise RuntimeError('callback has not called yet')

    @property
    def boosters_proxy(self):
        """The stored model object itself, acting as a proxy to its boosters."""
        self._assert_called_cb()
        return self._model

    @property
    def raw_boosters(self):
        """The `boosters` attribute of the stored model (the list of boosters)."""
        self._assert_called_cb()
        return self._model.boosters

    @property
    def best_iteration(self):
        """The `best_iteration` of the stored model (boosting round at early stop)."""
        self._assert_called_cb()
        return self._model.best_iteration




# Imported locally under its full name: earlier cells rebind `lgb` to an
# LGBMRegressor instance, which has neither Dataset nor cv.
import lightgbm

# Load the data set. The target column is removed from the features so the boosters
# are not trained on the value they are asked to predict.
X = train_df.drop(columns="units_sold_month", errors="ignore")
y = output

# Split the data set for demonstration purposes.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=42)

# Convert to LightGBM's data set representation.
lgb_train = lightgbm.Dataset(X_train, y_train)

# Callback used to get hold of the trained models afterwards.
extraction_cb = ModelExtractionCallback()
callbacks = [
    extraction_cb,
]

# Parameters for the 10-fold cross-validation run below.
lgbm_params = {
    # dart (drop out trees) often performs better
    'objective': 'regression',
    "metric": "rmse",
    "random_seed": 0,
    "feature_pre_filter": False,
    "lambda_l1": 2.377982329588689e-07,
    "lambda_l2": 1.2825820088020978e-08,
    "num_leaves": 256,
    "feature_fraction": 0.6,
    "bagging_fraction": 0.7471801931739468,
    "bagging_freq": 7,
    "min_child_samples": 5,
}
# NOTE: the returned cross-validation history should normally be inspected,
# so it is kept in cv_results.
cv_results = lightgbm.cv(lgbm_params,
                         lgb_train,
                         num_boost_round=10,
                         early_stopping_rounds=10,
                         nfold=10,
                         shuffle=True,
                         # Stratified folds are a classification device; this is a
                         # regression target, so plain shuffled folds are used.
                         stratified=False,
                         #seed=42,
                         callbacks=callbacks,
                         verbose_eval=1
                         )

# Retrieve the trained models from the callback object.
proxy = extraction_cb.boosters_proxy
boosters = extraction_cb.raw_boosters
best_iteration = extraction_cb.best_iteration


# # 各モデルで個別に推論する場合
# pred_dict={}
# for i, booster in enumerate(boosters):
#     y_pred_proba = booster.predict(X_test,
#                                     num_iteration=best_iteration)
#     y_pred = np.argmax(y_pred_proba, axis=1)
#     pred_dict.setdefault(i, y_pred)
#     accuracy = accuracy_score(y_test, y_pred) #正解率　全体に対して予測が当たった割合
#     precision = precision_score(y_test, y_pred) #適合率 1と予測した中で実際にどれだけ1であったかの割合 ex)異常検知システムがアラートを出した回数のうち、実際に異常であった割合
#     recall = recall_score(y_test, y_pred) #再現率 実際は1のデータのうち正しく1と予測できた割合 ex)病気の診断システムで再現率100%といった場合
#     f1 =f1_score(y_test, y_pred)              #F1スコア 適合率と再現率の調和平均
#     booster.feature_importances
#     print(f'Model {i}\n accuracy: {accuracy},\n precision: {precision},\n recall: {recall},\n f1: {f1}')





# model = lgb.LGBMClassifier(objective='binary',
#                         num_leaves = 23,
#                         learning_rate=0.1,
#                         n_estimators=100,
#                         boosting= "dart")

# # 学習する
# result = model.fit(X_train, y_train,
#                    eval_set=[(X_test, y_test)],
#                    eval_metric='multi_logloss'
#                   )

# # テストデータで予測する
# y_pred = model.predict(X_test, num_iteration=result.best_iteration_)

# # Accuracy を計算する
# accuracy = sum(y_test == y_pred) / len(y_test)
# print()
# print(f"accuracy: {accuracy}")
# print(f"Precision: {precision_score(y_test, y_pred)}") #適合率 1と予測した中で実際にどれだけ1であったかの割合 ex)異常検知システムがアラートを出した回数のうち、実際に異常であった割合
# print(f"Recall: {recall_score(y_test, y_pred)}") #再現率 実際は1のデータのうち正しく1と予測できた割合 ex)病気の診断システムで再現率100%といった場合
# print(f"F1: {f1_score(y_test, y_pred)}")             #F1スコア 適合率と再現率の調和平均

# # importanceを表示する
# importance = pd.DataFrame(model.feature_importances_, index=df.columns, columns=['importance'])
# display(importance)
# importance.plot.barh()



# ## Optuna and Auto Hyperparameter tuning
# import optuna.integration.lightgbm as lgb
# from sklearn.model_selection import train_test_split

# # Set data as LGB
# train = lgb.Dataset(X_train, y_train)
# test  = lgb.Dataset(X_test, y_test)

# # Hyper-parameter search
# params = {"objective": "regression",
#           "metric": "rmse"}


# lgb_trained = lgb.train(params,
#                         train, valid_sets=test,
#                         early_stopping_rounds=100)

# best_params = lgb_trained.params
# print("Params:     ")
# for key, value in best_params.items():
#     print(f"{key}: {value}")


# lgb.plot_importance(gbm)
# lgb.create_tree_diagraph(gbm)