In [None]:
import os
import gc
import pickle
from typing import List, Callable, Tuple

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from geopy.distance import geodesic

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA, TruncatedSVD, NMF
from sklearn.cluster import KMeans

from MeCab import Tagger
import unidic_lite

import torch
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer

import lightgbm as lgbm
import xgboost as xgb
import catboost as cbt

from racoon.dataset import TableDataset
from racoon.encoder import LabelEncoder, TargetEncoder, CountEncoder
from racoon.runner import ModelSet, BaseRunner, StackedRunner

In [None]:
class ClassicalTokenizer:
    """Word-level tokenizer backed by MeCab using the unidic-lite dictionary.

    Instances are callable, so one can be passed wherever a tokenizer
    callable is expected (for example ``TfidfVectorizer(tokenizer=...)``).
    """

    def __init__(self):
        # -Owakati selects MeCab's space-separated output format;
        # the dictionary directory is the one shipped with unidic_lite.
        self.tagger = Tagger(f'-Owakati -r /dev/null -d {unidic_lite.DICDIR}')

    def __call__(self, text:str) -> List[str]:
        """Split ``text`` into a list of tokens.

        The tagger returns a single string with tokens separated by
        whitespace (plus a trailing newline); it is split here so the
        return value really is a list of tokens, as annotated. Returning
        the raw string would make consumers iterate it character by character.
        """
        return self.tagger.parse(text).split()
    
# Module-level tokenizer instance; constructing it builds the MeCab tagger once.
classical_tokenizer = ClassicalTokenizer()

In [None]:
# https://jablogs.com/detail/43132
from math import cos, asin, sqrt
def closest(station_name, data, v, n=3):
    """Find the stations nearest to the point ``v``.

    Parameters
    ----------
    station_name : sequence of str
        Station names, positionally aligned with ``data``.
    data : sequence of mapping
        One mapping per station with ``"latitude"`` / ``"longitude"`` keys (degrees).
    v : mapping
        Query point with ``"latitude"`` / ``"longitude"`` keys (degrees).
    n : int, default 3
        Number of distinct station names to collect.

    Returns
    -------
    tuple
        ``(names, metres)`` where ``names`` holds up to ``n`` distinct station
        names ordered nearest first, and ``metres`` is the geodesic distance
        in metres between ``v`` and the single nearest station.
    """
    deg_to_rad = 0.017453292519943295  # pi / 180

    def distance(lat1, lon1, lat2, lon2):
        # Haversine great-circle distance in km (12742 km = Earth's diameter).
        hav = 0.5 - cos((lat2-lat1)*deg_to_rad)/2 + cos(lat1*deg_to_rad)*cos(lat2*deg_to_rad) * (1-cos((lon2-lon1)*deg_to_rad)) / 2
        return 12742 * asin(sqrt(hav))

    # The cheap haversine distance is only used for ranking the stations.
    dist = [
        distance(v['latitude'], v['longitude'], point['latitude'], point['longitude'])
        for point in data
    ]
    order = np.argsort(dist)

    # Station names can be duplicated (e.g. Shinjuku), so keep distinct names only.
    names = []
    for idx in order:
        candidate = station_name[idx]
        if candidate in names:
            continue
        names.append(candidate)
        if len(names) >= n:
            break

    # The reported distance to the nearest station uses the more precise geodesic.
    nearest = data[order[0]]
    metres = geodesic((nearest["latitude"], nearest["longitude"]), (v["latitude"], v["longitude"])).m
    return names, metres

In [None]:
def rmsle(y, pred):
    """Root mean squared logarithmic error between ``y`` and ``pred``."""
    msle = mean_squared_log_error(y, pred)
    return np.sqrt(msle)

In [None]:
# Competition tables.
train_df = pd.read_csv("../input/train_data.csv")
test_df = pd.read_csv("../input/test_data.csv")

# Station master data: names plus a positionally aligned list of coordinate records.
station_df = pd.read_csv("../input/station_list.csv")
station_name = station_df["station_name"].tolist()
station_latlon = station_df.loc[:, ["longitude", "latitude"]].to_dict(orient="records")

# Train rows first, then test rows, on a fresh 0..n-1 index.
# Later cells slice this frame by len(train_df), so the order matters.
whole_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
def target_encoding(df:pd.DataFrame, cols:List[str], target) -> pd.DataFrame:
    """Target-encode ``cols`` with racoon's ``TargetEncoder``.

    The encoder is fitted on the first ``len(target)`` rows of ``df`` only
    (the rows that have a target) and then applied to every row.
    Output columns carry a ``_te`` suffix.
    """
    n_train = len(target)
    train_part = df.iloc[:n_train][cols]
    fitted = TargetEncoder(cols=cols).fit(X=train_part, y=target)

    encoded = fitted.transform(df[cols])
    encoded.columns = [name + "_te" for name in encoded.columns]
    return encoded

In [None]:
def datetime_encodeing(df:pd.DataFrame, col:str) -> pd.DataFrame:
    """Split a date-like column into integer year / month / day features.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``col``.
    col : str
        Name of a column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pd.DataFrame
        Columns ``<col>_year``, ``<col>_month`` and ``<col>_day`` as integers;
        rows whose date is missing get ``-1`` in all three.
    """
    series = pd.to_datetime(df[col])
    parts = [
        # Missing dates surface as NaN in the .dt accessors; -1 marks them
        # so the column can be cast to int.
        getattr(series.dt, part).fillna(-1).astype(int).rename(f"{col}_{part}")
        for part in ("year", "month", "day")
    ]
    return pd.concat(parts, axis=1)

In [None]:
def transformer_encoding(df:pd.DataFrame, decompose_func:Callable, col="name", model_name:str="paraphrase-multilingual-MiniLM-L12-v2", lower:bool=True) -> pd.DataFrame:
    """Embed a text column with a pretrained transformer and reduce its dimensionality.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the text column ``col``.
    decompose_func
        Object exposing ``fit_transform`` (e.g. ``PCA``); applied to the raw embeddings.
    col : str, default "name"
        Text column to embed. Missing values are embedded as the empty string.
    model_name : str
        One of the checkpoints listed in ``model_list`` below.
    lower : bool, default True
        Lower-case the texts before encoding.

    Returns
    -------
    pd.DataFrame
        One row per input row; columns are named
        ``<model tag>_<decomposer class>_<component index>``.

    Raises
    ------
    ValueError
        If ``model_name`` is not a supported checkpoint.
    """
    # Supported checkpoints mapped to the short tag used in output column names.
    model_list = {
        "paraphrase-multilingual-MiniLM-L12-v2": "phm-mini-lm-l12-v2",
        "bert-base-multilingual-uncased": "bert-base-multi-uncased",
        "xlm-roberta-large": "xlm-roberta-large",
    }
    # Fail fast, before any text preprocessing or model download.
    if model_name not in model_list:
        raise ValueError(f"Unsupported model_name {model_name!r}; expected one of {sorted(model_list)}")

    batch_size = 16
    cache_dir = "../input/model_cache/"

    # str.lower (and the tokenizers) need real strings, so map missing values to "".
    texts = ["" if pd.isna(text) else str(text) for text in df[col].tolist()]
    if lower:
        texts = [text.lower() for text in texts]

    if model_name == "paraphrase-multilingual-MiniLM-L12-v2":
        model = SentenceTransformer(model_name, cache_folder=cache_dir)
        encoded = model.encode(texts, batch_size=batch_size, show_progress_bar=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)

        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(target_device)
        model.eval()

        encoded = np.zeros((len(texts), model.config.hidden_size), dtype=np.float32)
        with torch.no_grad():
            for batch_idx in tqdm(range(0, len(texts), batch_size), desc="Batching"):
                batch_texts = texts[batch_idx:batch_idx+batch_size]
                # truncation=True is required together with padding="max_length":
                # otherwise a text longer than max_length keeps its full length
                # and the batch cannot be stacked into one tensor.
                inputs = tokenizer(batch_texts, max_length=98, padding="max_length", truncation=True, return_tensors="pt")
                inputs = {k: v.to(target_device) for k, v in inputs.items()}
                outputs = model(**inputs)
                # Sentence representation = hidden state of the first ([CLS]) token.
                cls_embedding = outputs["last_hidden_state"][:, 0, :].cpu()
                encoded[batch_idx:batch_idx+batch_size, :] = cls_embedding.numpy()

    # Drop the model reference first so the CUDA cache can actually be released.
    del model
    gc.collect()
    torch.cuda.empty_cache()

    output_df = pd.DataFrame(decompose_func.fit_transform(encoded))
    output_df.columns = [f"{model_list[model_name]}_{decompose_func.__class__.__name__}_{i}" for i in range(output_df.shape[1])]
    return output_df

In [None]:
def tfidf_encoding(df:pd.DataFrame, decompose_func:Callable, tokenizer=None, col="name"):
    """TF-IDF vectorise a lower-cased text column, then reduce it with ``decompose_func``.

    Terms appearing in fewer than two documents are dropped (``min_df=2``).
    Output columns are named ``tfidf_<decomposer class>_<component index>``.
    """
    lowered = df[col].str.lower()

    vectorizer = TfidfVectorizer(tokenizer=tokenizer, min_df=2)
    pipeline = make_pipeline(vectorizer, decompose_func)

    components = pd.DataFrame(pipeline.fit_transform(lowered))
    prefix = f"tfidf_{decompose_func.__class__.__name__}"
    components.columns = [f"{prefix}_{i}" for i in range(components.shape[1])]
    return components

In [None]:
def count_encoding(df:pd.DataFrame, cols:List[str]) -> pd.DataFrame:
    """Count-encode ``cols`` with racoon's ``CountEncoder``; output columns get a ``_ce`` suffix."""
    encoder = CountEncoder(cols=cols)
    counts = encoder.fit_transform(df[cols])
    counts.columns = [name + "_ce" for name in counts.columns]
    return counts

In [None]:
def compute_distance(df:pd.DataFrame, base_latitude:float, base_longitude:float, base_name:str) -> pd.DataFrame:
    """Geodesic distance in kilometres from every row of ``df`` to a fixed point.

    Example reference point -- Tokyo: 35.681753 139.764708

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``latitude`` and ``longitude`` columns.
    base_latitude, base_longitude : float
        Coordinates of the reference point.
    base_name : str
        Label of the reference point; the output column is ``dist_<base_name>``.

    Returns
    -------
    pd.DataFrame
        Single float column on a fresh 0..n-1 index. An empty ``df`` yields
        an empty frame that still carries the column.
    """
    base_point = (base_latitude, base_longitude)
    distances = [
        geodesic(base_point, (latitude, longitude)).km
        for latitude, longitude in zip(df["latitude"], df["longitude"])
    ]
    # Building the frame from a dict names the column even when there are no rows.
    return pd.DataFrame({f"dist_{base_name}": distances}, dtype=float)

def nearest_station_to_name(df:pd.DataFrame, nearest_station_cols:List[str]) -> pd.Series:
    """Prefix each listing name with its comma-joined nearest-station names.

    Note: this overwrites ``df["name"]`` in place and returns that column.
    """
    joined_stations = df[nearest_station_cols].apply(",".join, axis=1)
    prefixed = joined_stations + "," + df["name"]
    df["name"] = prefixed
    return df["name"]

def gen_nearest_station(df:pd.DataFrame, station_name, station_latlon) -> pd.DataFrame:
    """Look up the nearest stations for every row of ``df``.

    Each row's (latitude, longitude) is passed to ``closest``. The result has
    one ``nearest_station_<i>`` column per returned station name plus
    ``nearest_station_dist``, the distance value ``closest`` reports.
    """
    points = df[["latitude", "longitude"]].to_dict(orient="records")
    lookups = [closest(station_name, station_latlon, point) for point in points]

    name_df = pd.DataFrame([names for names, _ in lookups])
    name_df.columns = [f"nearest_station_{i}" for i in range(name_df.shape[1])]

    dist_df = pd.DataFrame({"nearest_station_dist": [metres for _, metres in lookups]})

    return pd.concat([name_df, dist_df], axis=1)

In [None]:
def agg_encoding(df: pd.DataFrame, pk: List[str], agg_funcs, suffix=""):
    """Group-wise statistics broadcast back onto every row of ``df``.

    ``df`` is grouped by ``pk`` and aggregated with ``agg_funcs`` (passed
    straight to ``groupby(...).agg``). The per-group values are then
    left-merged back onto ``df``, so the result has one row per input row,
    in input order, on a fresh 0..n-1 index.

    Column naming
    -------------
    Each output column is ``<prefix><column>_<func>`` (or ``<prefix><column>``
    when the aggregation yields flat column labels). ``<prefix>`` is ``suffix``
    verbatim when it is non-empty -- despite its name it is *prepended* --
    and ``"/".join(pk) + "_"`` otherwise.

    Example
    -------
    |    | city   |   target |
    |---:|:-------|---------:|
    |  0 | tokyo  |        0 |
    |  1 | nagoya |        1 |
    |  2 | tokyo  |        1 |

    agg_encoding(df, ["city"], {"target": ["mean"]})

    |    |   city_target_mean |
    |---:|-------------------:|
    |  0 |                0.5 |
    |  1 |                1.0 |
    |  2 |                0.5 |
    """
    agg_pvs = df.groupby(pk).agg(agg_funcs)

    def _flatten(label):
        # (column, func) tuples come from list-valued specs; a spec such as
        # {"col": "mean"} produces plain string labels, which must not be
        # joined character by character.
        if isinstance(label, tuple):
            return '_'.join(str(level) for level in label).strip()
        return str(label).strip()

    prefix = suffix if suffix != "" else "/".join(pk) + "_"
    rename_columns = [prefix + _flatten(label) for label in agg_pvs.columns.values]

    agg_pvs.columns = rename_columns
    agg_pvs = agg_pvs.reset_index()

    return pd.merge(df, agg_pvs, on=pk, how="left")[rename_columns]

def math_encoding(df:pd.DataFrame, cols:List[str]) -> pd.DataFrame:
    """Return the square of each column in ``cols``, named ``<col>**2``."""
    squared = [(df[name] ** 2).rename(name + "**2") for name in cols]
    return pd.concat(squared, axis=1)

In [None]:
# Peek at the two review-count columns of the combined train+test frame.
whole_df[["number_of_reviews", "reviews_per_month"]]

In [None]:
def label_encoding(df:pd.DataFrame, columns:List[str]) -> pd.DataFrame:
    """Label-encode ``columns`` of ``df`` with racoon's ``LabelEncoder``.

    ``columns`` is a required list of column names. (It was previously written
    as ``columns=List[str]``, which made the typing object itself the default
    value rather than an annotation.)
    """
    return LabelEncoder(output_suffix="").fit_transform(df[columns])

def make_features(df:pd.DataFrame):
    """Assemble the model feature matrix from the combined train+test frame.

    The input is copied, a KMeans cluster id over (latitude, longitude) is
    added, and every encoder listed in ``enc_funcs`` is run in order. Their
    outputs are concatenated column-wise behind a handful of raw columns.

    Rows whose ``y`` is not null are treated as the training rows; the
    ``log1p`` of their target feeds the target encoder.

    Experiments tried and currently disabled: nearest-station features
    (``gen_nearest_station`` / ``nearest_station_to_name``), aggregates keyed
    on ``room_type`` alone or on ``latlon_cluster``, and text embeddings via
    TruncatedSVD or ``xlm-roberta-large``.
    """
    # preprocess
    frame = df.copy()
    coords = frame[["latitude","longitude"]]
    frame["latlon_cluster"] = KMeans(n_clusters=100, random_state=42).fit_predict(coords)

    has_target = frame["y"].notnull()
    target = frame.loc[has_target, "y"].values

    # Raw columns passed through unchanged; they lead the output frame.
    raw_columns = ["number_of_reviews", "minimum_nights", "availability_365", "reviews_per_month", "latlon_cluster", "latitude", "longitude"]
    pieces = [frame[raw_columns]]

    # (name, latitude, longitude) of the reference points used for distance features.
    landmarks = [
        ("tokyo", 35.681753, 139.764708),
        ("shinjuku", 35.688690, 139.698812),
        ("shibuya", 35.658700, 139.700872),
        ("skytree", 35.710430, 139.809332),
    ]

    # Group keys for the aggregate statistics.
    group_keys = [["neighbourhood"], ["neighbourhood", "room_type"]]

    def stat_spec():
        # Fresh spec per encoder entry: which statistics to compute per column.
        return {
            "minimum_nights":["mean", "std"],
            "availability_365":["mean", "std"],
            "number_of_reviews":["mean", "max", "std"],
        }

    def text_embedding(model_name):
        # PCA-reduced transformer embedding of the lower-cased "name" column.
        return (transformer_encoding, {"df":frame, "decompose_func": PCA(n_components=30, random_state=42), "col":"name", "lower":True, "model_name":model_name})

    # Order matters: it fixes the column order of the resulting feature matrix.
    enc_funcs = [
        (label_encoding, {"df":frame, "columns":["neighbourhood", "room_type"]}),
        (count_encoding, {"df":frame, "cols":["neighbourhood", "minimum_nights"]}),
        (datetime_encodeing, {"df":frame, "col":"last_review"}),
        *[
            (compute_distance, {"df":frame, "base_latitude":lat, "base_longitude":lon, "base_name":name})
            for name, lat, lon in landmarks
        ],
        *[
            (agg_encoding, {"df":frame, "pk":pk, "agg_funcs":stat_spec()})
            for pk in group_keys
        ],
        (math_encoding, {"df":frame, "cols":["minimum_nights", "availability_365", "number_of_reviews"]}),
        (target_encoding, {"df":frame, "cols":["neighbourhood", "latlon_cluster"], "target":np.log1p(target)}),
        *[
            (tfidf_encoding, {"df":frame, "decompose_func": reducer})
            for reducer in (
                TruncatedSVD(n_components=25, random_state=42),
                NMF(n_components=25, random_state=42),
            )
        ],
        text_embedding("paraphrase-multilingual-MiniLM-L12-v2"),
        text_embedding("bert-base-multilingual-uncased"),
    ]

    for func, params in tqdm(enc_funcs, desc="Generate Features..."):
        print(f"Exec: {func.__name__}")
        pieces.append(func(**params))

    return pd.concat(pieces, axis=1)

In [None]:
# Build features on the combined frame so train and test share every encoder fit.
feature_df = make_features(whole_df)
feature_df

In [None]:
# The model is trained on log1p(y); predictions are mapped back with expm1 later on.
targets = train_df["y"]
log_targets = np.log1p(targets)

# whole_df was built as train followed by test, so a positional split recovers both parts.
n_train = len(train_df)
train_features = feature_df.iloc[:n_train].values
test_features = feature_df.iloc[n_train:].values

In [None]:
def load_cv(path:str):
    """Unpickle and return the object stored at ``path``.

    Note: ``pickle.load`` can execute arbitrary code, so only load files you trust.
    """
    with open(path, mode="rb") as fp:
        cv_obj = pickle.load(fp)
    return cv_obj

In [None]:
# Load the pickled cross-validation object; it is handed to TableDataset as `cv`.
cv = load_cv("../input/fold/kfold.pkl")

In [None]:
# Bundle features, targets and folds for the racoon runners.
# The training target is log1p(y), so model outputs live on the log scale.
ds = TableDataset(
    train_features=train_features,
    train_targets=log_targets,
    test_features=test_features,
    cv=cv,
    type_of_target="regression"
)

In [None]:
# Display the dataset's repr as a quick sanity check.
ds

In [None]:
# model_set = ModelSet(
#     model=RandomForestRegressor(n_estimators=1000, max_depth=4)
# )
# model_set

# LightGBM regressor with early stopping supplied through fit-time callbacks.
model_set_lgb = ModelSet(
    model=lgbm.LGBMRegressor(
        objective='regression',
        n_estimators=10000,
        max_depth=8,
        importance_type="gain",
        random_state=42,
        colsample_bytree=0.4,
        # `subsample` is the recognised parameter name; the previous spelling
        # `subsamples` is unknown to LightGBM.
        # NOTE: row subsampling only takes effect when subsample_freq > 0,
        # which is still disabled below.
        subsample=0.7,
        # subsample_freq=3,
    ),
    fit_params={
        "callbacks": [
            lgbm.early_stopping(stopping_rounds=200),
            lgbm.log_evaluation(period=200),
            ]
        }
)

# XGBoost regressor. NOTE(review): this model set is defined here but is commented
# out in the train_eval call below, so it is not trained.
model_set_xgb = ModelSet(
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=10000,
        colsample_bytree=0.8,
        subsample=0.8,
        verbosity=0,
        random_state=42,
        importance_type="gain",
    ),
    fit_params={
        "callbacks":[
            # xgb.callback.EarlyStopping(rounds=50),
        ],
        # NOTE(review): early stopping is passed as a fit argument here rather than
        # as a callback -- confirm this is supported by the installed xgboost version.
        "early_stopping_rounds":100,
        "verbose":False,
    }
)

# CatBoost regressor: a very large iteration budget combined with early stopping
# and use_best_model=True.
model_set_cat = ModelSet(
    model = cbt.CatBoostRegressor(
        iterations=100000,
        loss_function='RMSE',
        use_best_model=True,
        random_seed=42,
        learning_rate=0.1,
        verbose=500,
    ),
    fit_params={
        "early_stopping_rounds":200,
    }
)

In [None]:
# Runner setup. The metric is MSE computed against the log1p targets held by `ds`.
stacked_runner = StackedRunner(
    runner=BaseRunner(
        table_dataset=ds,
        metric_func=mean_squared_error,
    )
)

In [None]:
# Train and evaluate the LightGBM and CatBoost model sets (XGBoost is disabled).
stacked_runner.train_eval([
    model_set_lgb,
    # model_set_xgb,
    model_set_cat,
])

In [None]:
# Inspect the evaluation results collected by the runner.
stacked_runner.eval_result

In [None]:
# RMSLE on the original target scale for the OOF predictions of result #2.
# NOTE(review): only two model sets are trained, so index 2 is presumably the stacked model -- confirm.
print(f"{rmsle(targets, np.expm1((stacked_runner.eval_result[2].oof))):.5f}")

In [None]:
# RMSLE of a simple average (in log space) of the OOF predictions of results #0 and #2.
print(f"{rmsle(targets, np.expm1(((stacked_runner.eval_result[0].oof+stacked_runner.eval_result[2].oof)/2))):.5f}")

In [None]:
# Persist the out-of-fold predictions (one column per eval_result entry),
# mapped back to the original target scale with expm1.
oof_val_pred_path = "../output/exp007/oof_val_pred.pkl"
# Create the experiment directory up front so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(oof_val_pred_path), exist_ok=True)

# Build the array before opening the file, so a failure here cannot leave an empty pickle behind.
preds = np.array([stacked_runner.eval_result[0].oof, stacked_runner.eval_result[1].oof, stacked_runner.eval_result[2].oof]).T
preds = np.expm1(preds)

with open(oof_val_pred_path, "wb") as f:
    pickle.dump(preds, f)

In [None]:
# Persist the test-set predictions (one column per eval_result entry),
# mapped back to the original target scale with expm1.
oof_test_pred_path = "../output/exp007/oof_test_pred.pkl"
# Create the experiment directory up front so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(oof_test_pred_path), exist_ok=True)

# Build the array before opening the file, so a failure here cannot leave an empty pickle behind.
preds = np.array([stacked_runner.eval_result[0].test_probas, stacked_runner.eval_result[1].test_probas, stacked_runner.eval_result[2].test_probas]).T
preds = np.expm1(preds)

with open(oof_test_pred_path, "wb") as f:
    pickle.dump(preds, f)

In [None]:
# emsamble_preds = np.array([
#     stacked_runner.eval_result[0].oof,
#     stacked_runner.eval_result[1].oof,
#     stacked_runner.eval_result[2].oof,
# ]).mean(axis=0)

# rmsle(targets, np.expm1(emsamble_preds))

In [None]:
# One row of feature importances per trained model of the first model set.
importances = np.array([t.feature_importances_ for t in stacked_runner.train_models[0]])

def plot_importance(importances:np.ndarray, col_name:List[str], top_n:int=50):
    """Box-plot each feature's importance across models, most important first.

    Parameters
    ----------
    importances : np.ndarray
        2-D array with one row per model and one column per feature.
    col_name : sequence of str
        Feature names, in the column order of ``importances``.
    top_n : int, default 50
        Number of features (ranked by mean importance) to show.
    """
    importance_df = pd.DataFrame(importances, columns=list(col_name))

    # Rank features by mean importance over the models.
    sort_col = importance_df.mean(axis=0).sort_values(ascending=False).index
    temp = pd.melt(importance_df)

    # Apply the theme before creating the figure so the whole figure picks it up.
    sns.set_theme(style="whitegrid")
    fig, ax = plt.subplots(figsize=(8, 10))
    sns.boxplot(
        data=temp,
        x="value",
        y="variable",
        order=sort_col[:top_n],
        ax=ax,
    )
    ax.set(title=f"Top {top_n} feature importances", xlabel="importance", ylabel="feature")

plot_importance(importances, feature_df.columns)

In [None]:
# Final test prediction: average results #0 and #1 in log space, then map back with expm1.
# NOTE(review): the OOF blend scored earlier averages results #0 and #2, whereas this
# averages #0 and #1 -- confirm which pair is intended.
test_preds = np.expm1((stacked_runner.eval_result[0].test_probas+stacked_runner.eval_result[1].test_probas)/2)

In [None]:
# Load the submission file that will receive the predictions.
sub_df = pd.read_csv("../input/submission.csv")

In [None]:
# Assumes the rows of the submission file are in the same order as test_df -- TODO confirm.
sub_df['y'] = test_preds

In [None]:
# Write the submission without the index column.
sub_df.to_csv("../output/watanabe_exp007_k.csv", index=False)