<a href="https://colab.research.google.com/github/kyochanpy/Kaggle_Indoor_Location_Navigation/blob/main/note_books/lgbm_29.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import shutil
import scipy.stats as stats
from pathlib import Path
import glob
from tqdm import tqdm


from sklearn.model_selection import KFold
import lightgbm as lgb

import psutil
import random
import os
import time
import sys
import math
from contextlib import contextmanager

from sklearn.preprocessing import LabelEncoder

In [2]:
# Mount Google Drive (Colab only); later cells read inputs from and move outputs to
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Cross-validation and reproducibility settings used throughout the notebook.
SEED = 42        # seed passed to set_seed(), KFold and the LightGBM parameters
N_SPLITS = 10    # number of KFold splits used for every training file


In [4]:
# Directory that receives the per-fold score log; created up front if missing.
LOG_PATH = Path("./log/")
os.makedirs(LOG_PATH, exist_ok=True)


In [5]:
@contextmanager
def timer(name: str):
    """Context manager that reports wall time and process memory for a code block.

    On exit (also when the wrapped block raises) one line is printed to stderr:
    the current value of the first ``memory_info()`` field in GiB, the signed
    change of that value since entry, the elapsed seconds and ``name``.

    Args:
        name: Label appended to the report line.
    """
    gib = 2. ** 30
    started = time.time()
    proc = psutil.Process(os.getpid())
    mem_before = proc.memory_info()[0] / gib
    try:
        yield
    finally:
        mem_after = proc.memory_info()[0] / gib
        diff = mem_after - mem_before
        # The sign is printed separately so the magnitude can be formatted unsigned.
        sign = '+' if diff >= 0 else '-'
        magnitude = math.fabs(diff)
        elapsed = time.time() - started
        print(f"[{mem_after:.1f}GB({sign}{magnitude:.1f}GB): {elapsed:.3f}sec] {name}", file=sys.stderr)


In [6]:
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all three.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    
def comp_metric(xhat, yhat, x, y):
    """Mean Euclidean distance between predicted and true 2-D positions.

    Args:
        xhat, yhat: Arrays of predicted coordinates.
        x, y: Arrays of true coordinates.

    Returns:
        Sum of the per-row distances divided by ``xhat.shape[0]``.
    """
    err_x = xhat - x
    err_y = yhat - y
    distances = np.sqrt(np.power(err_x, 2) + np.power(err_y, 2))
    n_rows = xhat.shape[0]
    return distances.sum() / n_rows


def score_log(df: pd.DataFrame, num_files: int, nam_file: str, data_shape: tuple, n_fold: int, seed: int, mpe: float):
    """Append one score record to ``df`` and rewrite the score log CSV.

    Args:
        df: Log accumulated so far (may be an empty DataFrame).
        num_files: Position of the file in the training file list.
        nam_file: Base name of the training file.
        data_shape: Shape of the feature frame the score was computed on.
        n_fold: Fold number the score belongs to.
        seed: Random seed of the run.
        mpe: Mean position error to record.

    Returns:
        The log with the new row appended. The whole log is also written to
        ``LOG_PATH / "log_score.csv"`` (without the index) on every call.
    """
    score_dict = {'n_files': num_files, 'file_name': nam_file, 'shape': data_shape, 'fold': n_fold, 'seed': seed, 'score': mpe}
    # A one-element list of dicts is a documented DataFrame constructor input, so no
    # inspection suppression is needed; ignore_index keeps row labels unique
    # instead of every appended row being labelled 0.
    df = pd.concat([df, pd.DataFrame([score_dict])], ignore_index=True)
    df.to_csv(LOG_PATH / "log_score.csv", index=False)
    return df

In [7]:
# Seed the global RNGs once, before any data is split or any model is trained.
set_seed(SEED)


In [8]:
# Locate the train/test feature CSVs on Drive. Both lists are sorted because the
# training loop pairs train_files[i] with test_files[i] by position.
feature_dir = "/content/drive/MyDrive/all_data"
train_files = sorted(glob.glob(os.path.join(feature_dir, '*_train.csv')))
test_files = sorted(glob.glob(os.path.join(feature_dir, '*_test.csv')))
# Sample submission: its index and columns define the layout of the final predictions.
subm = pd.read_csv('/content/drive/MyDrive/sample_submission.csv', index_col=0)

In [9]:
# Hyper-parameters shared by the x- and y-coordinate LightGBM regressors.
lgb_params = dict(
    objective='root_mean_squared_error',
    boosting_type='gbdt',
    n_estimators=50000,       # upper bound only; the fit calls use early stopping
    learning_rate=0.1,
    num_leaves=90,
    colsample_bytree=0.4,     # fraction of columns sampled per tree
    subsample=0.6,            # fraction of rows sampled ...
    subsample_freq=2,         # ... re-drawn every 2 iterations
    bagging_seed=SEED,
    reg_alpha=8,
    reg_lambda=2,
    random_state=SEED,
    n_jobs=-1,
)

# 共通関数

In [10]:
def get_beacon_features(input_df):
    """Return the ``beacon_rssi`` and ``beacon_distance`` columns of ``input_df``."""
    beacon_columns = ['beacon_rssi', 'beacon_distance']
    return input_df.loc[:, beacon_columns]

def get_mag_features(input_df):
    """Return the ``mag_y`` and ``mag_z`` columns of ``input_df``."""
    mag_columns = ['mag_y', 'mag_z']
    return input_df.loc[:, mag_columns]

def get_le_macaddr_features(input_df):
    """Label-encode the ``beacon_mac_addr`` column.

    Values are cast to ``str`` and encoded with a ``LabelEncoder`` fitted on this
    frame only.

    NOTE(review): because the encoder is fitted per call, the same MAC address can
    receive different codes in the train and the test frame — confirm that this is
    acceptable for the model.

    Args:
        input_df: DataFrame with a ``beacon_mac_addr`` column.

    Returns:
        DataFrame with the single column ``LE_beacon_mac_addr``, carrying
        ``input_df``'s index so that a column-wise concat with other features
        aligns row by row.
    """
    mac_as_text = input_df['beacon_mac_addr'].astype(str)
    codes = LabelEncoder().fit_transform(mac_as_text)
    return pd.DataFrame({"LE_beacon_mac_addr": codes}, index=input_df.index)


In [16]:
def get_fixed_rotation_vector(input_df):
    """Shift the rotation-vector components element-wise.

    * ``rotation_x``: ``x + 0.5`` where ``x <= -0.5``, ``x - 0.5`` where
      ``x >= 0.5``, unchanged otherwise.
    * ``rotation_y`` / ``rotation_z``: ``v + 1`` where ``v <= 0``, unchanged
      otherwise.

    Args:
        input_df: DataFrame with columns ``rotation_x``, ``rotation_y``,
            ``rotation_z``.

    Returns:
        DataFrame with columns ``fixed_rotation_x``, ``fixed_rotation_y`` and
        ``fixed_rotation_z``. It keeps ``input_df``'s index so that a column-wise
        concat with the source frame aligns even when the index is not 0..n-1.
    """
    rot_x = input_df['rotation_x'].to_numpy()
    rot_y = input_df['rotation_y'].to_numpy()
    rot_z = input_df['rotation_z'].to_numpy()

    # Vectorised form of the former per-element if/elif loops.
    fixed_x = np.where(rot_x <= -0.5, rot_x + 0.5,
                       np.where(rot_x >= 0.5, rot_x - 0.5, rot_x))
    fixed_y = np.where(rot_y <= 0, rot_y + 1, rot_y)
    fixed_z = np.where(rot_z <= 0, rot_z + 1, rot_z)

    return pd.DataFrame(
        {
            'fixed_rotation_x': fixed_x,
            'fixed_rotation_y': fixed_y,
            'fixed_rotation_z': fixed_z,
        },
        index=input_df.index,
    )
    
def get_deg_rotation_vector(input_df):
    """Convert rotation-vector components to shifted arcsine angles in degrees.

    Each component ``v`` is mapped to ``degrees(asin(v))`` and then shifted:

    * x: ``+45`` where the angle is ``<= -45``, ``-45`` where it is ``>= 45``.
    * y / z: ``+90`` where the angle is ``<= 0``.

    Args:
        input_df: DataFrame with columns ``rotation_x``, ``rotation_y``,
            ``rotation_z``.

    Returns:
        DataFrame with columns ``deg_rotation_x``, ``deg_rotation_y`` and
        ``deg_rotation_z``. It keeps ``input_df``'s index so that a column-wise
        concat with the source frame aligns even when the index is not 0..n-1.

    Raises:
        ValueError: If a component lies outside ``[-1, 1]`` (from ``math.asin``).
    """
    def _asin_degrees(column):
        # math.asin is kept on purpose: it raises on out-of-domain input instead
        # of silently producing NaN, and keeps results identical to the loop form.
        raw = input_df[column].to_numpy()
        return np.array([math.degrees(math.asin(v)) for v in raw], dtype=float)

    deg_x = _asin_degrees('rotation_x')
    deg_y = _asin_degrees('rotation_y')
    deg_z = _asin_degrees('rotation_z')

    deg_x = np.where(deg_x <= -45, deg_x + 45,
                     np.where(deg_x >= 45, deg_x - 45, deg_x))
    deg_y = np.where(deg_y <= 0, deg_y + 90, deg_y)
    deg_z = np.where(deg_z <= 0, deg_z + 90, deg_z)

    return pd.DataFrame(
        {
            'deg_rotation_x': deg_x,
            'deg_rotation_y': deg_y,
            'deg_rotation_z': deg_z,
        },
        index=input_df.index,
    )

def get_mag_rotation_features(input_df):
    """Build ``ud_mag_rotation_zz`` from ``mag_z`` and the z rotation features.

    Steps, row by row:
      1. ``fixed_deg_rotation_zz = fixed_rotation_z * deg_rotation_z``
      2. ``mag_rotation_zz = mag_z * fixed_deg_rotation_zz * 0.05``
      3. ``ud_mag_rotation_zz = |mag_rotation_zz - max(mag_rotation_zz)|``

    Args:
        input_df: DataFrame with ``mag_z`` and the ``rotation_*`` columns needed by
            ``get_fixed_rotation_vector`` / ``get_deg_rotation_vector``.

    Returns:
        DataFrame with the single column ``ud_mag_rotation_zz`` and ``input_df``'s
        index.
    """
    # Helper outputs are consumed positionally (row i of the helper belongs to
    # row i of input_df), so the result does not depend on index alignment.
    fixed_z = get_fixed_rotation_vector(input_df)['fixed_rotation_z'].to_numpy()
    deg_z = get_deg_rotation_vector(input_df)['deg_rotation_z'].to_numpy()
    mag_z = input_df['mag_z'].to_numpy()

    fixed_deg_zz = fixed_z * deg_z
    # A Series is used so that .max() keeps pandas' NaN-skipping semantics.
    mag_rotation_zz = pd.Series(mag_z * fixed_deg_zz * 0.05, index=input_df.index)
    ud_mag_rotation_zz = (mag_rotation_zz - mag_rotation_zz.max()).abs()

    return pd.DataFrame({'ud_mag_rotation_zz': ud_mag_rotation_zz.to_numpy()},
                        index=input_df.index)

def get_distance_features(input_df):
    """Derive the ``work_distance_*`` features from acceleration and ``mag_z``.

    * ``work_distance_y = |acc_y * ud_mag_rotation_zz * mag_z * 0.01|``
    * ``work_distance_z = |acc_z * ud_mag_rotation_zz * mag_z * 0.01|``
    * ``work_distance_yz = |work_distance_y + work_distance_z| * 0.2``

    Args:
        input_df: DataFrame with ``acc_y``, ``acc_z``, ``mag_z`` and the columns
            required by ``get_mag_rotation_features``.

    Returns:
        DataFrame with the three ``work_distance_*`` columns and ``input_df``'s
        index.
    """
    # The helper output is consumed positionally, so the result does not depend on
    # the helper's index matching input_df's index.
    ud_zz = get_mag_rotation_features(input_df)['ud_mag_rotation_zz'].to_numpy()
    acc_y = input_df['acc_y'].to_numpy()
    acc_z = input_df['acc_z'].to_numpy()
    mag_z = input_df['mag_z'].to_numpy()

    work_y = np.abs(acc_y * ud_zz * mag_z * 0.01)
    work_z = np.abs(acc_z * ud_zz * mag_z * 0.01)
    work_yz = np.abs(work_y + work_z) * 0.2

    return pd.DataFrame(
        {
            'work_distance_y': work_y,
            'work_distance_z': work_z,
            'work_distance_yz': work_yz,
        },
        index=input_df.index,
    )

def get_sum_sensor_features(input_df):
    """Sum the ten sensor columns of the x axis and of the y axis.

    For each axis the acc / acc_unc / acc_unc*2 / gyro / gyro_unc / gyro_unc*2 /
    mag / mag_unc / mag_unc*2 / rotation columns are added in that order.

    Returns:
        DataFrame with ``sum_sensor_x``, ``sum_sensor_y`` and their sum
        ``sum_sensor_xy``.
    """
    def _axis_total(axis):
        names = [f'acc_{axis}', f'acc_unc_{axis}', f'acc_unc_{axis}2',
                 f'gyro_{axis}', f'gyro_unc_{axis}', f'gyro_unc_{axis}2',
                 f'mag_{axis}', f'mag_unc_{axis}', f'mag_unc_{axis}2',
                 f'rotation_{axis}']
        running = input_df[names[0]]
        for name in names[1:]:
            running = running + input_df[name]
        return running

    totals = pd.DataFrame()
    totals['sum_sensor_x'] = _axis_total('x')
    totals['sum_sensor_y'] = _axis_total('y')
    totals['sum_sensor_xy'] = totals['sum_sensor_x'] + totals['sum_sensor_y']
    return totals

def get_fixed_sum_features(input_df):
    """Fold the per-axis sensor sums around their 25 % quantile.

    For each of the x sum, the y sum and their total, with ``q`` the column's
    0.25 quantile, the feature is ``|value - q| + |q|``. Each axis sum adds the
    acc, acc_unc, gyro, gyro_unc, mag, mag_unc and rotation columns.

    Args:
        input_df: DataFrame with the seven sensor columns for both ``x`` and ``y``.

    Returns:
        DataFrame with ``fixed_sum_sensor_x``, ``fixed_sum_sensor_y`` and
        ``fixed_sum_sensor_xy``. It keeps ``input_df``'s index so that a
        column-wise concat with other features aligns row by row.
    """
    def _axis_sum(axis):
        return (input_df[f'acc_{axis}'] + input_df[f'acc_unc_{axis}']
                + input_df[f'gyro_{axis}'] + input_df[f'gyro_unc_{axis}']
                + input_df[f'mag_{axis}'] + input_df[f'mag_unc_{axis}']
                + input_df[f'rotation_{axis}'])

    def _fold_around_q25(series):
        # Vectorised form of the former per-element loop.
        q25 = series.quantile(q=0.25)
        return ((series - q25).abs() + abs(q25)).to_numpy()

    sum_x = _axis_sum('x')
    sum_y = _axis_sum('y')
    sum_xy = sum_x + sum_y

    return pd.DataFrame(
        {
            'fixed_sum_sensor_x': _fold_around_q25(sum_x),
            'fixed_sum_sensor_y': _fold_around_q25(sum_y),
            'fixed_sum_sensor_xy': _fold_around_q25(sum_xy),
        },
        index=input_df.index,
    )

# train用関数

In [17]:
def get_target_train(input_df):
    """Return the training targets and keys: ``x``, ``y``, ``path``, ``timestamp``."""
    target_columns = ['x', 'y', 'path', 'timestamp']
    return input_df.loc[:, target_columns]

def get_wifi_train_festures(input_df):
    """Return the positional column slice ``1:-42`` of a training frame.

    The first column and the trailing 42 columns are dropped.
    NOTE(review): presumably the remaining columns are the wifi features — the
    slice is purely positional, so confirm against the CSV layout.
    """
    first_kept, n_trailing_dropped = 1, 42
    return input_df.iloc[:, first_kept:-n_trailing_dropped]

# test用関数

In [18]:
def get_target_test(input_df):
    """Return the test-row keys: ``site_path_timestamp``, ``path``, ``timestamp``."""
    key_columns = ['site_path_timestamp', 'path', 'timestamp']
    return input_df.loc[:, key_columns]

def get_wifi_test_festures(input_df):
    """Return the positional column slice ``1:-41`` of a test frame.

    The first column and the trailing 41 columns are dropped.
    NOTE(review): presumably the remaining columns are the wifi features — the
    slice is purely positional, so confirm against the CSV layout.
    """
    first_kept, n_trailing_dropped = 1, 41
    return input_df.iloc[:, first_kept:-n_trailing_dropped]


In [19]:
def get_process_funcs_train():
    """Return the feature builders for training frames, in output column order.

    ``get_target_train`` is last, so its four columns end up as the final four
    columns of the frame built by ``to_feature``.
    """
    return [
        get_wifi_train_festures,
        get_beacon_features,
        get_le_macaddr_features,
        get_mag_rotation_features,
        get_mag_features,
        get_fixed_rotation_vector,
        get_deg_rotation_vector,
        get_distance_features,
        get_sum_sensor_features,
        get_fixed_sum_features,
        get_target_train,
    ]

def get_process_funcs_test():
    """Return the feature builders for test frames, in output column order.

    ``get_target_test`` is last, so its three columns end up as the final three
    columns of the frame built by ``to_feature``.
    """
    return [
        get_wifi_test_festures,
        get_beacon_features,
        get_le_macaddr_features,
        get_mag_rotation_features,
        get_mag_features,
        get_fixed_rotation_vector,
        get_deg_rotation_vector,
        get_distance_features,
        get_sum_sensor_features,
        get_fixed_sum_features,
        get_target_test,
    ]

def to_feature(input_df, funcs):
    """Apply every feature builder to ``input_df`` and join the results column-wise.

    Args:
        input_df: Source frame handed unchanged to each builder.
        funcs: Iterable of callables ``f(input_df) -> DataFrame``; each result must
            have as many rows as ``input_df``.

    Returns:
        The builders' outputs concatenated along ``axis=1`` in ``funcs`` order
        (an empty DataFrame when ``funcs`` is empty).

    Raises:
        AssertionError: If a builder returns a different number of rows; the
            message is the builder's name.
    """
    frames = []
    for func in tqdm(funcs, total=len(funcs)):
        _df = func(input_df)
        assert len(_df) == len(input_df), func.__name__
        frames.append(_df)
    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per builder.
    return pd.concat(frames, axis=1)

In [None]:
# Train one pair of LightGBM regressors (x and y) per training file with K-fold CV,
# log the fold / out-of-fold scores and collect fold-averaged test predictions.
score_df = pd.DataFrame()
oof = list()
predictions = list()
for n_files, file in enumerate(train_files):
    # Train and test files are paired purely by position in the two sorted lists.
    # NOTE(review): nothing verifies that both files belong to the same site — confirm
    # that the file naming guarantees it.
    data = pd.read_csv(file, index_col=0).rename({'f':'floor'}, axis=1).fillna(0)
    test_data = pd.read_csv(test_files[n_files], index_col=0).fillna(0)

    data = to_feature(data, get_process_funcs_train())
    test_data = to_feature(test_data, get_process_funcs_test())

    print(data.shape)
    print(test_data.shape)

    # Loop-invariant test inputs, computed once per file instead of once per fold.
    # The last three test columns are the keys appended by get_target_test.
    X_test = test_data.iloc[:, :-3]
    # NOTE(review): the test frame is not renamed f -> floor and get_target_test does
    # not emit 'floor', so this column has to come through the wifi column slice (and
    # is then also part of X_test) — confirm.
    preds_f = test_data['floor'].values

    oof_x, oof_y = np.zeros(data.shape[0]), np.zeros(data.shape[0])
    preds_x, preds_y = 0, 0

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    # The last four training columns are x, y, path, timestamp (get_target_train):
    # column -4 is the x target, column -3 the y target.
    for fold, (trn_idx, val_idx) in enumerate(kf.split(data.iloc[:, :-4])):
        X_train = data.iloc[trn_idx, :-4]
        y_trainx = data.iloc[trn_idx, -4]
        y_trainy = data.iloc[trn_idx, -3]

        X_valid = data.iloc[val_idx, :-4]
        y_validx = data.iloc[val_idx, -4]
        y_validy = data.iloc[val_idx, -3]

        # NOTE(review): `verbose` / `early_stopping_rounds` as fit() arguments were
        # removed in LightGBM 4.0 in favour of callbacks — confirm the installed version.
        modelx = lgb.LGBMRegressor(**lgb_params)
        with timer("fit X"):
            modelx.fit(X_train, y_trainx,
                       eval_set=[(X_valid, y_validx)],
                       eval_metric='rmse',
                       verbose=False,
                       early_stopping_rounds=20
                       )

        modely = lgb.LGBMRegressor(**lgb_params)
        with timer("fit Y"):
            modely.fit(X_train, y_trainy,
                       eval_set=[(X_valid, y_validy)],
                       eval_metric='rmse',
                       verbose=False,
                       early_stopping_rounds=20
                       )

        oof_x[val_idx] = modelx.predict(X_valid)
        oof_y[val_idx] = modely.predict(X_valid)

        # Test predictions are averaged over the folds.
        preds_x += modelx.predict(X_test) / N_SPLITS
        preds_y += modely.predict(X_test) / N_SPLITS

        score = comp_metric(oof_x[val_idx], oof_y[val_idx], y_validx.to_numpy(), y_validy.to_numpy())
        print(f"fold {fold}: mean position error {score}")
        score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, fold, SEED, score)

    print("*+"*40)
    print(f"file #{n_files}, shape={data.shape}, name={os.path.basename(file)}")
    score = comp_metric(oof_x, oof_y,
                        data.iloc[:, -4].to_numpy(), data.iloc[:, -3].to_numpy())
    oof.append(score)
    print(f"mean position error {score}")
    print("*+"*40)
    # Fold number 999 marks the out-of-fold score over the whole file.
    score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, 999, SEED, score)

    test_preds = pd.DataFrame(np.stack((preds_f, preds_x, preds_y))).T
    test_preds.columns = subm.columns
    test_preds.index = test_data["site_path_timestamp"]
    test_preds["floor"] = test_preds["floor"].astype(int)
    predictions.append(test_preds)

  9%|▉         | 1/11 [00:00<00:01,  9.83it/s]

In [None]:
# Display the feature frame left over from the last iteration of the training loop.
data

In [None]:
# Stack the per-file predictions and order the rows like the sample submission.
# Rows of subm.index without a prediction come out as NaN.
all_preds = pd.concat(predictions).reindex(subm.index)

In [None]:
# Display the assembled predictions; NaN rows are submission ids without a prediction.
all_preds

Unnamed: 0_level_0,floor,x,y
site_path_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,0.0,93.958514,99.233139
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000009017,0.0,91.940152,102.341637
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000015326,0.0,89.112067,105.326688
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000018763,0.0,89.678749,105.679917
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000022328,0.0,90.620852,110.508695
...,...,...,...
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000082589,,,
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000085758,,,
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000090895,,,
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000096899,,,


In [None]:
# Rebuild the submission frame in sample-submission order and write it to the
# working directory.
all_preds = pd.concat(predictions).reindex(subm.index)
all_preds.to_csv('submission_lgbm_28.csv')

In [None]:
# Move the submission file from the Colab working directory to Google Drive.
!mv /content/submission_lgbm_28.csv /content/drive/MyDrive

In [None]:
# Load separately prepared floor predictions; they overwrite all_preds['floor'] below.
floor = pd.read_csv('/content/drive/MyDrive/only_accurate_floor.csv')

In [None]:
# Positional overwrite (.values ignores the index).
# NOTE(review): assumes the floor CSV has the same row order and length as all_preds — confirm.
all_preds['floor'] = floor['floor'].values

In [None]:
# Write the predictions with the overridden floor column as a second submission file.
all_preds.to_csv('submission_lgbm_27.csv')

In [None]:
# Move the second submission file from the Colab working directory to Google Drive.
!mv /content/submission_lgbm_27.csv /content/drive/MyDrive

In [None]:
pd.