### Wifi features

This is the code to generate the wifi features available in [this dataset](https://www.kaggle.com/devinanzelmo/indoor-navigation-and-location-wifi-features). Using these features can get a score below 14. For an example notebook using them, see [this notebook](https://www.kaggle.com/devinanzelmo/wifi-features-lightgbm-starter). It uses only waypoint, wifi and timestamp data to generate the solution. See this [forum post](https://www.kaggle.com/c/indoor-location-navigation/discussion/215445) for an outline of this solution method and ways to improve it.

There are `break` statements inserted into the loops which need to be removed to get this to run. Right now the data is written to the current working directory. This takes 2-4 hours to run, depending on the hard drive etc. There is a lot of room for speeding up feature generation. 

**Update:** I added one line that creates a column for the path filename; this allows for GroupKFold cross-validation. 


In [1]:
import pandas as pd
import numpy as np
import glob
import os
import gc
import json 

In [2]:
# Root directory that holds the `train2` data folder; the glob patterns in
# the cells below are built relative to it.
base_path = './'

In [3]:
# Every entry directly under <base_path>/train2 is treated as one building.
folders = sorted(glob.glob(os.path.join(base_path, 'train2', '*')))

# The building id is the last path component. Taking the basename (instead of
# stripping the literal './train2/' prefix) keeps this correct when base_path
# is changed to something other than './'.
used_buildings = [os.path.basename(folder) for folder in folders]
    

In [4]:
# Notebook display of the building ids collected above.
used_buildings

['5cd56b6be2acfd2d33b59d1f']

In [5]:
# Maps a floor directory name to an integer floor index, with the first
# above-ground floor ("F1" / "1F") at 0 and basements negative.
# NOTE(review): "L1" maps to -4, i.e. below "B3" -- confirm this is intended.
floor_map = {"L1": -4, "B3": -3, "B2": -2, "B1": -1}
# Floors 1..9 appear under two spellings: "F<n>" and "<n>F".
floor_map.update({"F%d" % level: level - 1 for level in range(1, 10)})
floor_map.update({"%dF" % level: level - 1 for level in range(1, 10)})

In [6]:
# Per building, pick the wifi BSSIDs that will become feature columns and
# sample which wifi scan timestamps belong to the training split.
#
#   bssid[building]            -> BSSIDs occurring more than MIN_BSSID_COUNT
#                                 times among the training rows
#   trainingtimeDict[building] -> the sampled training timestamps (strings)
#
# NOTE: `sample` is not seeded here, so the train/test split differs from
# run to run.
from random import sample

# A BSSID is kept only if it occurs more than this many times
# (this number can be experimented with).
MIN_BSSID_COUNT = 500
# Fraction of the unique wifi timestamps that go into the training split.
TRAIN_FRACTION = 0.75

bssid = dict()
trainingtimeDict = dict()

for building in used_buildings:
    folders = sorted(glob.glob(os.path.join(base_path, 'train2', building, '*')))
    print(building)
    print(folders)

    # Collect every TYPE_WIFI record of the building, across floors and paths.
    wifi = list()
    for folder in folders:
        for file in glob.glob(os.path.join(folder, "*.txt")):
            with open(file) as f:
                for line in f:
                    tmp = line.strip().split()
                    # Blank or one-token lines carry no record type; skip
                    # them instead of failing on tmp[1].
                    if len(tmp) > 1 and tmp[1] == "TYPE_WIFI":
                        wifi.append(tmp)

    if not wifi:
        # No wifi rows: there is nothing to count or to sample from.
        bssid[building] = []
        trainingtimeDict[building] = []
        continue

    # Column 0 is the record timestamp, column 3 the BSSID.
    df = pd.DataFrame(wifi)

    # Sample the training share of the unique wifi timestamps.
    dftime = list(df[0].unique())
    trainingTime = sample(dftime, int(len(dftime) * TRAIN_FRACTION))

    # Only wifi rows whose timestamp is in the training split are counted.
    df_train = df[df[0].isin(trainingTime)]
    value_counts = df_train[3].value_counts()

    top_bssid = value_counts[value_counts > MIN_BSSID_COUNT].index.tolist()
    bssid[building] = top_bssid
    trainingtimeDict[building] = trainingTime

    # Release the per-building buffers before the next building is read.
    del wifi, df, df_train
    gc.collect()



# generate all the training data
#
# For every building this
#   1. re-reads all path files and collects waypoint and wifi records,
#   2. labels each wifi scan (one timestamp on one path) with the waypoint of
#      the same path that is closest in time,
#   3. splits the scans into train/test using the timestamps sampled earlier,
#   4. keeps, per waypoint, only the scan closest in time, and writes
#      <building>_train_filtered.csv and <building>_test_filtered.csv to the
#      current working directory.
building_dfs = dict()


def _nearest_wifi_per_waypoint(features):
    """Return, for each waypoint timestamp, the single wifi row closest in time.

    `features` needs the columns 'timestamp' (wifi scan time) and
    'waypointtimestamp' (time of the waypoint the scan was assigned to).
    On ties the first row of the group is kept.
    """
    kept = []
    # Group by a scalar key, not a one-element list: with a list pandas >= 2.0
    # yields 1-tuples as group keys and int() on the key would fail.
    for waypoint_time, group in features.groupby('waypointtimestamp'):
        gaps = (pd.to_numeric(group['timestamp']) - int(waypoint_time)).abs()
        kept.append(group.iloc[[int(gaps.to_numpy().argmin())]])
    return pd.concat(kept)


for building in used_buildings:
    folders = sorted(glob.glob(os.path.join(base_path, 'train2', building, '*')))
    dfs_train = list()
    dfs_test = list()
    index = sorted(bssid[building])
    # A set gives constant-time membership tests in the per-scan loop below.
    trainingTime = set(trainingtimeDict[building])

    print(building)

    wifi = list()
    waypoint = list()
    for folder in folders:
        floor = floor_map[os.path.basename(folder)]
        for file in glob.glob(os.path.join(folder, "*.txt")):
            # Path id = file name up to its first dot.
            path_id = os.path.basename(file).split('.')[0]
            with open(file) as f:
                for raw_line in f:
                    line = raw_line.strip().split()
                    if len(line) < 2:
                        # Blank or one-token line: no record type to inspect.
                        continue
                    if line[1] == "TYPE_WAYPOINT":
                        # -> [timestamp, type, x, y, floor, path]
                        line.append(floor)
                        line.append(path_id)
                        waypoint.append(line)
                    elif line[1] == "TYPE_WIFI":
                        # The path id is expected to land in column 7, i.e.
                        # wifi records are assumed to have 7 fields.
                        line.append(path_id)
                        wifi.append(line)

    if not wifi:
        print(building, "has no wifi records; no CSV written")
        continue

    # all wifi records of the building
    df_wifi = pd.DataFrame(wifi)

    # Waypoints bucketed by path, in file order, with the timestamp parsed
    # once. This replaces a scan over every waypoint for every wifi group.
    waypoints_by_path = dict()
    for wp in waypoint:
        waypoints_by_path.setdefault(wp[5], []).append((int(wp[0]), wp))

    skipped = 0
    # each group is one (timestamp, path) wifi scan
    for (wifi_time, path_id), wifi_record in df_wifi.groupby([0, 7]):
        candidates = waypoints_by_path.get(path_id)
        if not candidates:
            # No waypoint on this path: there is no valid position label.
            # (Previously such a scan was labelled with the first waypoint
            # of some other path.)
            skipped += 1
            continue

        # Waypoint of the same path closest in time; min() keeps the first
        # one on ties, as np.argmin did.
        wifi_ts = int(wifi_time)
        nearest_wp = min(candidates, key=lambda c: abs(c[0] - wifi_ts))[1]

        # Pivot the scan into one row: a column per selected BSSID holding
        # the value of column 4, -999 where the BSSID was not seen.
        wifi_record = wifi_record.drop_duplicates(subset=3)
        tmp = wifi_record.iloc[:, 3:5]
        feat = tmp.set_index(3).reindex(index).replace(np.nan, -999).T
        feat["x"] = float(nearest_wp[2])
        feat["y"] = float(nearest_wp[3])
        feat["f"] = nearest_wp[4]
        feat["path"] = path_id  # useful for crossvalidation
        feat['timestamp'] = wifi_time
        feat['waypointtimestamp'] = nearest_wp[0]

        # scans at a sampled training timestamp -> train, the rest -> test
        if wifi_time in trainingTime:
            dfs_train.append(feat)
        else:
            dfs_test.append(feat)

    if skipped:
        print(building, "skipped", skipped, "wifi scans without a waypoint on their path")

    if not dfs_train or not dfs_test:
        # pd.concat raises on an empty list; report instead of crashing.
        print(building, "has an empty train or test split; no CSV written")
        continue

    building_df_train = pd.concat(dfs_train)
    building_df_test = pd.concat(dfs_test)

    # Keep one wifi scan per waypoint, then drop the helper column. drop()
    # returns a new frame, so the result has to be assigned; the test split
    # uses the same column name as the train split.
    df_train_filtered = _nearest_wifi_per_waypoint(building_df_train)
    df_train_filtered = df_train_filtered.drop(columns='waypointtimestamp')
    df_train_filtered.to_csv(building + "_train_filtered.csv")

    # do the same thing for the testing set
    df_test_filtered = _nearest_wifi_per_waypoint(building_df_test)
    df_test_filtered = df_test_filtered.drop(columns='waypointtimestamp')
    df_test_filtered.to_csv(building + "_test_filtered.csv")


    


5cd56b6be2acfd2d33b59d1f
['./train2/5cd56b6be2acfd2d33b59d1f/F1', './train2/5cd56b6be2acfd2d33b59d1f/F2', './train2/5cd56b6be2acfd2d33b59d1f/F3']
5cd56b6be2acfd2d33b59d1f
1560842747286
3
x                                 27.5081
y                                 63.6128
f                                       2
path             5d0896415125450008037c76
timestamp                   1560842746728
wifitimestamp               1560842747286
Name: 4, dtype: object
0
1560842746728
1560842758558
3
x                                 25.1819
y                                 68.4099
f                                       2
path             5d0896415125450008037c76
timestamp                   1560842756732
wifitimestamp               1560842758558
Name: 4, dtype: object
3
x                                 25.1819
y                                 68.4099
f                                       2
path             5d0896415125450008037c76
timestamp                   1560842758171
wifitimestamp      

1560846504599
1560846516583
3
x                                 64.7223
y                                 10.1716
f                                       0
path             5d08a1515125450008037d85
timestamp                   1560846511871
wifitimestamp               1560846516583
Name: 4, dtype: object
0
1560846511871
1560846523484
3
x                                 66.1582
y                                 7.75464
f                                       0
path             5d08a1515125450008037d85
timestamp                   1560846522094
wifitimestamp               1560846523484
Name: 4, dtype: object
3
x                                 66.1582
y                                 7.75464
f                                       0
path             5d08a1515125450008037d85
timestamp                   1560846525049
wifitimestamp               1560846523484
Name: 4, dtype: object
3
x                                 66.1582
y                                 7.75464
f                        