<a href="https://colab.research.google.com/github/kyochanpy/Kaggle_Indoor_Location_Navigation/blob/main/note_books/lgbm_23_all_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import numpy as np
import pandas as pd
import shutil
import scipy.stats as stats
from pathlib import Path
import glob
from tqdm import tqdm


from sklearn.model_selection import KFold
import lightgbm as lgb

import psutil
import random
import os
import time
import sys
import math
from contextlib import contextmanager

from sklearn.preprocessing import LabelEncoder

In [2]:
N_SPLITS = 10
SEED = 618


In [3]:
LOG_PATH = Path("./log/")
LOG_PATH.mkdir(parents=True, exist_ok=True)


In [4]:
@contextmanager
def timer(name: str):
    t0 = time.time()
    p = psutil.Process(os.getpid())
    m0 = p.memory_info()[0] / 2. ** 30
    try:
        yield
    finally:
        m1 = p.memory_info()[0] / 2. ** 30
        delta = m1 - m0
        sign = '+' if delta >= 0 else '-'
        delta = math.fabs(delta)
        print(f"[{m1:.1f}GB({sign}{delta:.1f}GB): {time.time() - t0:.3f}sec] {name}", file=sys.stderr)


In [5]:
def set_seed(seed=527):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

    
def comp_metric(xhat, yhat, x, y):
    intermediate = np.sqrt(np.power(xhat-x, 2) + np.power(yhat-y, 2))
    return intermediate.sum()/xhat.shape[0]


def score_log(df: pd.DataFrame, num_files: int, nam_file: str, data_shape: tuple, n_fold: int, seed: int, mpe: float):
    score_dict = {'n_files': num_files, 'file_name': nam_file, 'shape': data_shape, 'fold': n_fold, 'seed': seed, 'score': mpe}
    # noinspection PyTypeChecker
    df = pd.concat([df, pd.DataFrame.from_dict([score_dict])])
    df.to_csv(LOG_PATH / f"log_score.csv", index=False)
    return df

# **特徴量関数**

In [6]:
'timestamp',
       'path', 'beacon_timestamp', 'beacon_uuid', 'beacon_major_id',
       'beacon_minor_id', 'beacon_tx_power', 'beacon_rssi', 'beacon_distance',
       'beacon_mac_addr', 'beacon_timestamp2', 'accelerometer_timestamp',
       'accelerometer_x', 'accelerometer_y', 'accelerometer_z',
       'accelerometer_uncalibrated_timestamp', 'accelerometer_uncalibrated_x',
       'accelerometer_uncalibrated_y', 'accelerometer_uncalibrated_z',
       'accelerometer_uncalibrated_x2', 'accelerometer_uncalibrated_y2',
       'accelerometer_uncalibrated_z2', 'gyroscope_timestamp', 'gyroscope_x',
       'gyroscope_y', 'gyroscope_z', 'gyroscope_uncalibrated_timestamp',
       'gyroscope_uncalibrated_x', 'gyroscope_uncalibrated_y',
       'gyroscope_uncalibrated_z', 'gyroscope_uncalibrated_x2',
       'gyroscope_uncalibrated_y2', 'gyroscope_uncalibrated_z2',
       'magnetic_field_timestamp', 'magnetic_field_x', 'magnetic_field_y',
       'magnetic_field_z', 'magnetic_field_uncalibrated_timestamp',
       'magnetic_field_uncalibrated_x', 'magnetic_field_uncalibrated_y',
       'magnetic_field_uncalibrated_z', 'magnetic_field_uncalibrated_x2',
       'magnetic_field_uncalibrated_y2', 'magnetic_field_uncalibrated_z2',
       'rotation_vector_timestamp', 'rotation_vector_x', 'rotation_vector_y',
       'rotation_vector_z'

IndentationError: ignored

In [10]:
#train用

def get_numerical_features_train(input_df):
    input_df_ = input_df['path']
    output_df = input_df.drop(['Unnamed: 0.1', 'Unnamed: 0.1.1', 'timestamp',
       'path', 'beacon_timestamp', 'beacon_uuid', 'beacon_major_id',
       'beacon_minor_id', 'beacon_tx_power',
       'beacon_mac_addr', 'beacon_timestamp2', 'accelerometer_timestamp',
       'accelerometer_x', 'accelerometer_y', 'accelerometer_z',
       'accelerometer_uncalibrated_timestamp', 'accelerometer_uncalibrated_x',
       'accelerometer_uncalibrated_y', 'accelerometer_uncalibrated_z',
       'accelerometer_uncalibrated_x2', 'accelerometer_uncalibrated_y2',
       'accelerometer_uncalibrated_z2', 'gyroscope_timestamp','gyroscope_z', 'gyroscope_uncalibrated_timestamp',
       'gyroscope_uncalibrated_z', 'gyroscope_uncalibrated_x2',
       'gyroscope_uncalibrated_y2', 'gyroscope_uncalibrated_z2',
       'magnetic_field_timestamp', 'magnetic_field_x', 'magnetic_field_y',
       'magnetic_field_uncalibrated_timestamp',
       'magnetic_field_uncalibrated_x', 'magnetic_field_uncalibrated_x2',
       'magnetic_field_uncalibrated_y2', 'magnetic_field_uncalibrated_z2',
       'rotation_vector_timestamp',
       'rotation_vector_z'
], axis=1)
    output_df['path'] = input_df_
    return output_df


def get_sum_sensor_features(input_df):
    output_df = pd.DataFrame()
    output_df['sum_sensor_x'] = input_df['accelerometer_x'] + input_df['accelerometer_uncalibrated_x'] + input_df['accelerometer_uncalibrated_x2'] + input_df['gyroscope_x'] + input_df['gyroscope_uncalibrated_x'] + input_df['accelerometer_uncalibrated_x'] + input_df['gyroscope_uncalibrated_x2'] + input_df['magnetic_field_x'] + input_df['magnetic_field_uncalibrated_x'] + input_df['magnetic_field_uncalibrated_x2'] + input_df['rotation_vector_x']
    output_df['sum_sensor_y'] = input_df['accelerometer_y'] + input_df['accelerometer_uncalibrated_y'] + input_df['accelerometer_uncalibrated_y2'] + input_df['gyroscope_y'] + input_df['gyroscope_uncalibrated_y'] + input_df['accelerometer_uncalibrated_y'] + input_df['gyroscope_uncalibrated_y2'] + input_df['magnetic_field_y'] + input_df['magnetic_field_uncalibrated_y'] + input_df['magnetic_field_uncalibrated_y2'] + input_df['rotation_vector_y']
    output_df['sum_sensor_z'] = input_df['accelerometer_x'] + input_df['accelerometer_uncalibrated_x'] + input_df['accelerometer_uncalibrated_x2'] + input_df['gyroscope_z'] + input_df['gyroscope_uncalibrated_z'] + input_df['accelerometer_uncalibrated_z'] + input_df['gyroscope_uncalibrated_z2'] + input_df['magnetic_field_z'] + input_df['magnetic_field_uncalibrated_z'] + input_df['magnetic_field_uncalibrated_z2'] + input_df['rotation_vector_z']
    output_df['sum_all_sensor'] = output_df['sum_sensor_x'] + output_df['sum_sensor_y'] + output_df['sum_sensor_z']
    return output_df


def get_le_features(input_df):
    _input_df = input_df.astype(str)
    cat_cols = ['beacon_mac_addr']
    output_df = pd.DataFrame()
    for c in cat_cols:
        le = LabelEncoder()
        le.fit(_input_df[c])
        output_df["LE_"+c] = le.transform(_input_df[c])
    return output_df





In [11]:
#test用
def get_numerical_features_test(input_df):
    input_df_ = input_df['site_path_timestamp']
    output_df = input_df.drop(['Unnamed: 0.1', 'Unnamed: 0.1.1',
                               'timestamp',
       'beacon_timestamp', 'beacon_uuid', 'beacon_major_id',
       'beacon_minor_id', 'beacon_tx_power',
       'beacon_mac_addr', 'beacon_timestamp2', 'accelerometer_timestamp',
       'accelerometer_x', 'accelerometer_y', 'accelerometer_z',
       'accelerometer_uncalibrated_timestamp', 'accelerometer_uncalibrated_x',
       'accelerometer_uncalibrated_y', 'accelerometer_uncalibrated_z',
       'accelerometer_uncalibrated_x2', 'accelerometer_uncalibrated_y2',
       'accelerometer_uncalibrated_z2', 'gyroscope_timestamp','gyroscope_z', 'gyroscope_uncalibrated_timestamp',
       'gyroscope_uncalibrated_z', 'gyroscope_uncalibrated_x2',
       'gyroscope_uncalibrated_y2', 'gyroscope_uncalibrated_z2',
       'magnetic_field_timestamp', 'magnetic_field_x', 'magnetic_field_y',
       'magnetic_field_uncalibrated_timestamp',
       'magnetic_field_uncalibrated_x', 'magnetic_field_uncalibrated_x2',
       'magnetic_field_uncalibrated_y2', 'magnetic_field_uncalibrated_z2',
       'rotation_vector_timestamp',
       'rotation_vector_z','site_path_timestamp'
                               ], axis=1)
    output_df['site_path_timestamp'] = input_df_
    return output_df


In [12]:
def get_process_funcs_train():
    funcs = [
        get_sum_sensor_features,
        get_le_features,
        get_numerical_features_train
    ]
    return funcs

def get_process_funcs_test():
    funcs = [
        get_sum_sensor_features,
        get_le_features,
        get_numerical_features_test
    ]
    return funcs

def to_feature(input_df, funcs):
    output_df = pd.DataFrame()
    for func in tqdm(funcs, total=len(funcs)):
        _df = func(input_df)
        assert len(_df) == len(input_df), func.__name__
        output_df = pd.concat([output_df, _df], axis=1)

    return output_df

In [13]:
feature_dir = "/content/drive/MyDrive/all_data_floor"
train_files = sorted(glob.glob(os.path.join(f'{feature_dir}','*_train.csv')))
test_files = sorted(glob.glob(os.path.join(f'{feature_dir}','*_test.csv')))
subm = pd.read_csv('/content/drive/MyDrive/sample_submission.csv', index_col=0)
floor_accurate = pd.read_csv('/content/drive/MyDrive/submission_floor_accurate.csv').drop(['x', 'y'], axis=1)

In [14]:
lgb_params = {'objective': 'root_mean_squared_error',
              'boosting_type': 'gbdt',
              'n_estimators': 50000,
              'learning_rate': 0.1,
              'num_leaves': 90,
              'colsample_bytree': 0.4,
              'subsample': 0.6,
              'subsample_freq': 2,
              'bagging_seed': SEED,
              'reg_alpha': 8,
              'reg_lambda': 2,
              'random_state': SEED,
              'n_jobs': -1
              }

In [None]:
score_df = pd.DataFrame()
oof = list()
predictions = list()
for n_files, file in enumerate(train_files):
    data = pd.read_csv(file, index_col=0).fillna(0)
    test_data = pd.read_csv(test_files[n_files], index_col=0).fillna(0)

    # all featrues
    process_funcs_train = get_process_funcs_train()
    process_funcs_test = get_process_funcs_test()
    data = to_feature(data, process_funcs_train).rename(columns={'f': 'floor'})
    test_data = to_feature(test_data, process_funcs_test)

    print(data.shape)
    print(test_data.shape)

    oof_x, oof_y = np.zeros(data.shape[0]), np.zeros(data.shape[0])
    preds_x, preds_y = 0, 0

    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    for fold, (trn_idx, val_idx) in enumerate(kf.split(data.iloc[:, :-3])):
        X_train = data.iloc[trn_idx, :-3]
        y_trainx = data.iloc[trn_idx, -3]
        y_trainy = data.iloc[trn_idx, -2]

        X_valid = data.iloc[val_idx, :-3]
        y_validx = data.iloc[val_idx, -3]
        y_validy = data.iloc[val_idx, -2]
        
        modelx = lgb.LGBMRegressor(**lgb_params)
        with timer("fit X"):
            modelx.fit(X_train, y_trainx,
                       eval_set=[(X_valid, y_validx)],
                       eval_metric='rmse',
                       verbose=False,
                       early_stopping_rounds=20
                       )

        modely = lgb.LGBMRegressor(**lgb_params)
        with timer("fit Y"):
            modely.fit(X_train, y_trainy,
                       eval_set=[(X_valid, y_validy)],
                       eval_metric='rmse',
                       verbose=False,
                       early_stopping_rounds=20
                       )
            
        oof_x[val_idx] = modelx.predict(X_valid)
        oof_y[val_idx] = modely.predict(X_valid)

        preds_x += modelx.predict(test_data.iloc[:, :-1]) / N_SPLITS
        preds_y += modely.predict(test_data.iloc[:, :-1]) / N_SPLITS

        score = comp_metric(oof_x[val_idx], oof_y[val_idx], y_validx.to_numpy(), y_validy.to_numpy())
        print(f"fold {fold}: mean position error {score}")
        score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, fold, SEED, score)

    print("*+"*40)
    print(f"file #{n_files}, shape={data.shape}, name={os.path.basename(file)}")
    score = comp_metric(oof_x, oof_y, data.iloc[:, -3].to_numpy(), data.iloc[:, -2].to_numpy())
    oof.append(score)
    print(f"mean position error {score}")
    print("*+"*40)
    score_df = score_log(score_df, n_files, os.path.basename(file), data.shape, 999, SEED, score)

    test_preds = pd.DataFrame(np.stack((preds_x, preds_y))).T
    test_preds['floor'] = test_data['floor'].values
    test_preds.columns = subm.columns
    test_preds.index = test_data["site_path_timestamp"]
    predictions.append(test_preds)

  interactivity=interactivity, compiler=compiler, result=result)
100%|██████████| 3/3 [00:29<00:00,  9.92s/it]
100%|██████████| 3/3 [00:00<00:00,  3.12it/s]


(9296, 3417)
(299, 3415)


[2.9GB(+0.2GB): 25.118sec] fit X
[3.1GB(+0.1GB): 19.952sec] fit Y


fold 0: mean position error 2.697205819380121


[3.1GB(+0.0GB): 20.302sec] fit X
[3.1GB(+0.0GB): 36.688sec] fit Y


fold 1: mean position error 2.741916588185573


[3.1GB(+0.0GB): 28.038sec] fit X
[3.1GB(+0.0GB): 24.762sec] fit Y


fold 2: mean position error 2.930265024970512


[3.1GB(+0.0GB): 28.363sec] fit X
[3.1GB(+0.0GB): 24.347sec] fit Y


fold 3: mean position error 2.7826057039180614


[3.1GB(+0.0GB): 25.087sec] fit X
[3.1GB(+0.0GB): 25.199sec] fit Y


fold 4: mean position error 2.8741907136040883


[3.1GB(+0.0GB): 29.938sec] fit X
[3.1GB(-0.0GB): 21.729sec] fit Y


fold 5: mean position error 2.8128564932672457


[3.1GB(+0.0GB): 21.514sec] fit X
[3.1GB(+0.0GB): 23.949sec] fit Y


fold 6: mean position error 2.913757989581229


[3.1GB(+0.0GB): 24.351sec] fit X
[3.1GB(+0.0GB): 26.782sec] fit Y


fold 7: mean position error 2.8459695620236545


[3.1GB(+0.0GB): 26.937sec] fit X
[3.1GB(+0.0GB): 30.907sec] fit Y


fold 8: mean position error 2.717013416734641


[3.1GB(+0.0GB): 24.010sec] fit X
[3.1GB(+0.0GB): 25.693sec] fit Y


fold 9: mean position error 2.9271718399439837
*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+
file #0, shape=(9296, 3417), name=5a0546857ecc773753327266_train.csv
mean position error 2.8242838337121547
*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+*+


  interactivity=interactivity, compiler=compiler, result=result)
100%|██████████| 3/3 [00:28<00:00,  9.41s/it]
100%|██████████| 3/3 [00:00<00:00, 34.80it/s]


(9737, 3083)
(26, 3081)


[3.2GB(+0.0GB): 23.658sec] fit X
[3.2GB(+0.0GB): 19.930sec] fit Y


fold 0: mean position error 3.4266885644117986


[3.2GB(+0.0GB): 27.815sec] fit X
[3.2GB(+0.0GB): 24.837sec] fit Y


fold 1: mean position error 3.3894008292406324


[3.2GB(+0.0GB): 24.013sec] fit X
[3.2GB(+0.0GB): 38.081sec] fit Y


fold 2: mean position error 3.3460630365809956


[3.2GB(+0.0GB): 28.825sec] fit X
[3.2GB(+0.0GB): 19.989sec] fit Y


fold 3: mean position error 3.4188461396190695


[3.2GB(+0.0GB): 30.553sec] fit X


In [None]:
all_preds = pd.concat(predictions)
all_preds = all_preds.reindex(subm.index)
floor_accurate = floor_accurate.reindex(subm.index)
all_preds['floor'] = floor_accurate['floor'].values
all_preds.to_csv('submission_lgbm_23.csv')

In [None]:
shutil.move('submission_lgbm_wifi_23.csv', '/content/drive/MyDrive')

In [None]:
all_preds['floor'] = floor_accurate['floor']


In [None]:
all_preds

Unnamed: 0_level_0,x,y,floor
site_path_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,88.812457,101.054509,0
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000009017,88.334100,101.459640,0
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000015326,86.407735,105.915363,0
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000018763,86.373000,106.192623,0
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000022328,88.410066,109.426687,0
...,...,...,...
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000082589,213.660124,92.248068,5
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000085758,215.185774,94.339885,5
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000090895,202.804718,107.437614,5
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000096899,203.883893,109.724543,5


In [None]:
floor_accurate

Unnamed: 0,site_path_timestamp,floor
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,0
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,0
...,...,...
10128,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5
10129,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5
10130,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5
10131,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5


In [None]:
all_preds

Unnamed: 0_level_0,x,y,floor
site_path_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000000009,88.812457,101.054509,
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000009017,88.334100,101.459640,
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000015326,86.407735,105.915363,
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000018763,86.373000,106.192623,
5a0546857ecc773753327266_046cfa46be49fc10834815c6_0000000022328,88.410066,109.426687,
...,...,...,...
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000082589,213.660124,92.248068,
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000085758,215.185774,94.339885,
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000090895,202.804718,107.437614,
5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f412_0000000096899,203.883893,109.724543,


In [None]:
a = pd.read_csv('/content/drive/MyDrive/submission_lstm_in_floor_08_before_post.csv')
a

Unnamed: 0,site_path_timestamp,floor,x,y
0,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,,
1,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,,
2,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,,
3,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,,
4,5a0546857ecc773753327266_046cfa46be49fc1083481...,0,,
...,...,...,...,...
10128,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,,
10129,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,,
10130,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,,
10131,5dc8cea7659e181adb076a3f_fd64de8c4a2fc5ebb0e9f...,5,,
