In [2]:
import os
import sys
import glob
import yaml
import pickle
import random

import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns 
sns.set()
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [47]:
# Load the floor predictions (presumably out-of-fold, going by the file name — confirm).
floor = pd.read_csv("../input/oof_preds_floor.csv")
# Keep only the text before the first '.' in each `path` value.
floor['path'] = floor['path'].str.split('.').str[0]
floor

Unnamed: 0,path,floor_pred,floor_actual
0,5d10a1699c50c70008fe8979,F3,F3
1,5d10a16cf9037900086afc3f,F3,F3
2,5d11942cffe23f0008604e2e,F3,F3
3,5d119434ffe23f0008604e34,F3,F3
4,5d11943dffe23f0008604e3a,F3,F3
...,...,...,...
10826,5dd7af4b9191710006b56823,F2,F2
10827,5dd7b2a8c5b77e0006b16ae4,F1,F1
10828,5dd7b4bbc5b77e0006b16afb,B1,B1
10829,5dd7bea99191710006b568a0,B1,B1


In [48]:
# Flag every path whose predicted floor matches its actual floor.
# One vectorised, element-wise comparison replaces the row-by-row iterrows() loop:
# it produces the same True/False flag per row without building an intermediate
# Python list or leaving loop variables behind in the kernel namespace.
floor['is_known_floor'] = floor['floor_pred'].eq(floor['floor_actual'])
floor

Unnamed: 0,path,floor_pred,floor_actual,is_known_floor
0,5d10a1699c50c70008fe8979,F3,F3,True
1,5d10a16cf9037900086afc3f,F3,F3,True
2,5d11942cffe23f0008604e2e,F3,F3,True
3,5d119434ffe23f0008604e34,F3,F3,True
4,5d11943dffe23f0008604e3a,F3,F3,True
...,...,...,...,...
10826,5dd7af4b9191710006b56823,F2,F2,True
10827,5dd7b2a8c5b77e0006b16ae4,F1,F1,True
10828,5dd7b4bbc5b77e0006b16afb,B1,B1,True
10829,5dd7bea99191710006b568a0,B1,B1,True


In [49]:
# NOTE: pickle.load can execute arbitrary code while unpickling; only open files you trust.
with open('../input/2kaido_wifi_dataset_v4/train_10000_7.pkl', 'rb') as pkl_file:
    # Only the site / floor / path columns are needed here, de-duplicated.
    train_df = pickle.load(pkl_file)[['site_id', 'floor_str', 'path']].drop_duplicates()
train_df

Unnamed: 0,site_id,floor_str,path
0,5a0546857ecc773753327266,B1,5e1580d1f4c3420006d520e4
19,5a0546857ecc773753327266,B1,5e1580bb1506f2000638fc62
38,5a0546857ecc773753327266,B1,5e158ef31506f2000638fd1d
61,5a0546857ecc773753327266,B1,5e158ed7f4c3420006d5216a
86,5a0546857ecc773753327266,B1,5e158efe1506f2000638fd25
...,...,...,...
240758,5dc8cea7659e181adb076a3f,F7,5dcd5c9323759900063d590a
240790,5dc8cea7659e181adb076a3f,F7,5dcd5c88a4dbe7000630b084
240823,5dc8cea7659e181adb076a3f,F7,5dcd5c95a4dbe7000630b090
240835,5dc8cea7659e181adb076a3f,F7,5dcfb844878f3300066c70ee


In [50]:
# Attach the per-path correctness flag to the site/floor table.
# `path` is the only column the two frames share, so this is an inner join on it:
# paths missing from either side are dropped.
floor_df = pd.merge(
    train_df,
    floor[['path', 'is_known_floor']],
    on='path',
    how='inner',
)
floor_df

Unnamed: 0,site_id,floor_str,path,is_known_floor
0,5a0546857ecc773753327266,B1,5e1580d1f4c3420006d520e4,True
1,5a0546857ecc773753327266,B1,5e1580bb1506f2000638fc62,True
2,5a0546857ecc773753327266,B1,5e158ef31506f2000638fd1d,True
3,5a0546857ecc773753327266,B1,5e158ed7f4c3420006d5216a,True
4,5a0546857ecc773753327266,B1,5e158efe1506f2000638fd25,True
...,...,...,...,...
10507,5dc8cea7659e181adb076a3f,F7,5dcd5c9323759900063d590a,True
10508,5dc8cea7659e181adb076a3f,F7,5dcd5c88a4dbe7000630b084,True
10509,5dc8cea7659e181adb076a3f,F7,5dcd5c95a4dbe7000630b090,True
10510,5dc8cea7659e181adb076a3f,F7,5dcfb844878f3300066c70ee,True


In [51]:
# A (site, floor) pair is treated as 100% identifiable only when the floor prediction
# was correct for every path on that floor, i.e. the count of correct paths equals
# the number of paths in the group. Keys have the form "<site_id>_<floor_str>".
site_floor_dict = {
    f"{site_id}_{floor_str}": bool(
        paths_on_floor['is_known_floor'].sum() == len(paths_on_floor)
    )
    for (site_id, floor_str), paths_on_floor in floor_df.groupby(['site_id', 'floor_str'])
}

In [53]:
# Display the "<site_id>_<floor_str>" -> fully-identifiable flag mapping for inspection.
site_floor_dict

{'5a0546857ecc773753327266_B1': True,
 '5a0546857ecc773753327266_F1': True,
 '5a0546857ecc773753327266_F2': True,
 '5a0546857ecc773753327266_F3': False,
 '5a0546857ecc773753327266_F4': False,
 '5c3c44b80379370013e0fd2b_B1': True,
 '5c3c44b80379370013e0fd2b_F1': True,
 '5c3c44b80379370013e0fd2b_F2': True,
 '5c3c44b80379370013e0fd2b_F3': False,
 '5c3c44b80379370013e0fd2b_F4': False,
 '5c3c44b80379370013e0fd2b_F5': False,
 '5d27075f03f801723c2e360f_B1': True,
 '5d27075f03f801723c2e360f_F1': True,
 '5d27075f03f801723c2e360f_F2': True,
 '5d27075f03f801723c2e360f_F3': False,
 '5d27075f03f801723c2e360f_F4': False,
 '5d27075f03f801723c2e360f_F5': True,
 '5d27096c03f801723c31e5e0_B1': True,
 '5d27096c03f801723c31e5e0_F1': True,
 '5d27096c03f801723c31e5e0_F2': False,
 '5d27096c03f801723c31e5e0_F3': False,
 '5d27096c03f801723c31e5e0_F4': True,
 '5d27096c03f801723c31e5e0_F5': True,
 '5d27096c03f801723c31e5e0_F6': True,
 '5d27097f03f801723c320d97_B1': True,
 '5d27097f03f801723c320d97_B2': True,
 '5

In [58]:
# Fraction of (site, floor) pairs flagged as fully identifiable (True counts as 1).
sum(site_floor_dict.values()) / len(site_floor_dict)

0.5182481751824818

## 約半数（約52%）のfloorは完全に特定可能（floor内の全pathでfloor予測が正解）

In [60]:
# Tabulate the per-floor flags: one row per "<site_id>_<floor_str>" key.
output_df = pd.DataFrame({
    'site_floor': list(site_floor_dict.keys()),
    'is_known_floor': list(site_floor_dict.values()),
})
output_df

Unnamed: 0,site_floor,is_known_floor
0,5a0546857ecc773753327266_B1,True
1,5a0546857ecc773753327266_F1,True
2,5a0546857ecc773753327266_F2,True
3,5a0546857ecc773753327266_F3,False
4,5a0546857ecc773753327266_F4,False
...,...,...
132,5dc8cea7659e181adb076a3f_F3,False
133,5dc8cea7659e181adb076a3f_F4,False
134,5dc8cea7659e181adb076a3f_F5,True
135,5dc8cea7659e181adb076a3f_F6,False


In [61]:
# Save the per-floor flags as CSV, omitting the DataFrame index.
known_floor_csv = '../input/is_known_floor.csv'
output_df.to_csv(known_floor_csv, index=False)