In [59]:
import re
from pathlib import Path
from typing import List, Dict, Tuple
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

In [60]:
datasets = {}
names = ['win_a', 'win_e', 'mac_s', 'mac_j']
for name in names:
    data_path = Path.cwd() / 'data' / 'processed' / f'{name}.csv'
    datasets[name] = pd.read_csv(data_path)

In [61]:
datasets['win_a'].describe()

Unnamed: 0,dBm,Channel
count,2754.0,2754.0
mean,-83.099129,43.067538
std,8.763189,50.296532
min,-96.0,1.0
25%,-89.0,6.0
50%,-85.5,11.0
75%,-80.0,64.0
max,-56.5,161.0


In [62]:
datasets['win'] = pd.concat([datasets['win_a'], datasets['win_e']])
datasets['win']

Unnamed: 0,Location,SSID,Network type,Authentication,Encryption,BSSID,Signal,dBm,Radio type,Channel,Basic rates (Mbps),Other rates (Mbps)
0,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:41:c1,81%,-59.5,802.11n,6,12 24,36 48 54
1,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,38:17:c3:0c:7f:e1,33%,-83.5,802.11n,6,12 24,36 48 54
2,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:41:91,16%,-92.0,802.11ac,100,12 24,36 48 54
3,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,f0:5c:19:ba:bc:d0,22%,-89.0,802.11ac,161,12 24,36 48 54
4,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:44:a1,29%,-85.5,802.11n,11,12 24,36 48 54
...,...,...,...,...,...,...,...,...,...,...,...,...
819,MToilet,eduroam,,,,b4:5d:50:fe:41:d2,10%,-95.0,802.11ac,52,12 24,36 48 54
820,MToilet,eduroam,,,,b0:b8:67:63:50:b1,40%,-80.0,802.11ac,153,12 24,36 48 54
821,MToilet,eduroam,,,,18:64:72:5e:a9:72,100%,-50.0,802.11ac,149,12 24,36 48 54
822,MToilet,eduroam,,,,b0:b8:67:63:30:91,40%,-80.0,802.11ac,149,12 24,36 48 54


In [63]:
datasets['win']['Location'].unique()

array(['Stair2', 'GSR2-4', 'GSR2-3/2', 'GSR2-1', 'PrintingRoom',
       'Stairs1', 'FToilet', 'LW2.1b', 'Lift2', 'Lift1', 'MToilet',
       'Walkway', 'CommonArea', 'Stairs3', 'SR2-4b', 'SR2-4a', 'SR2-3b',
       'GSR2-6', 'SR2-3a', 'SR2-2a', 'SR2-2b', 'SR2-1b', 'SR2-1a',
       'Stairs2', 'LW2.1a'], dtype=object)

In [64]:
datasets.pop('win_a')
datasets.pop('win_e')

Unnamed: 0,Location,SSID,Network type,Authentication,Encryption,BSSID,Signal,dBm,Radio type,Channel,Basic rates (Mbps),Other rates (Mbps)
0,GSR2-6,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b0:b8:67:63:7f:32,100%,-50.0,802.11ac,108,12 24,36 48 54
1,GSR2-6,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:41:91,30%,-85.0,802.11ac,100,12 24,36 48 54
2,GSR2-6,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fd:bd:51,40%,-80.0,802.11ac,64,12 24,36 48 54
3,GSR2-6,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:41:d1,15%,-92.5,802.11ac,52,12 24,36 48 54
4,GSR2-6,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:44:b1,14%,-93.0,802.11ac,44,12 24,36 48 54
...,...,...,...,...,...,...,...,...,...,...,...,...
819,MToilet,eduroam,,,,b4:5d:50:fe:41:d2,10%,-95.0,802.11ac,52,12 24,36 48 54
820,MToilet,eduroam,,,,b0:b8:67:63:50:b1,40%,-80.0,802.11ac,153,12 24,36 48 54
821,MToilet,eduroam,,,,18:64:72:5e:a9:72,100%,-50.0,802.11ac,149,12 24,36 48 54
822,MToilet,eduroam,,,,b0:b8:67:63:30:91,40%,-80.0,802.11ac,149,12 24,36 48 54


In [65]:
datasets['mac_j']

Unnamed: 0,location,network_name,ssid_hex,bssid,channel_number,band,bandwidth,phy,rssi,security_type,encryption,age
0,CommonArea,,,<redacted>,6,2GHz,20MHz,b/g/n,-92,WPA/WPA2,"aes_ccm, ccmp, tkip",0ms
1,CommonArea,CS168-2138,43533136382d32313338,<redacted>,4,2GHz,20MHz,b/g,-86,WPA/WPA2,,0ms
2,CommonArea,CS168-2160,43533136382d32313630,<redacted>,2,2GHz,20MHz,b/g,-76,WPA/WPA2,,0ms
3,CommonArea,dlink-M920-dd51,646c696e6b2d4d3932302d64643531,<redacted>,8,2GHz,20MHz,b/g/n,-91,WPA/WPA2,aes_ccm,0ms
4,CommonArea,E250-0528,453235302d30353238,<redacted>,4,2GHz,20MHz,b/g,-84,WPA/WPA2,,0ms
...,...,...,...,...,...,...,...,...,...,...,...,...
149,Stairs3,WLAN-SMU,574c414e2d534d55,<redacted>,6,2GHz,20MHz,g/n,-68,WPA/WPA2,aes_ccm,0ms
150,Stairs3,WLAN-SMU,574c414e2d534d55,<redacted>,136,5GHz,40MHz,a/n/ac,-69,WPA/WPA2,aes_ccm,0ms
151,Stairs3,WLAN-SMU,574c414e2d534d55,<redacted>,1,2GHz,20MHz,g/n,-83,WPA/WPA2,aes_ccm,0ms
152,Stairs3,WLAN-SMU,574c414e2d534d55,<redacted>,161,5GHz,40MHz,a/n/ac,-68,WPA/WPA2,aes_ccm,0ms


In [66]:
datasets.pop('mac_j')
datasets

{'mac_s':     location        network_name      ssid              bssid  \
 0    FToilet  BayView Hotel BH 1   BayView  02:0d:3c:21:99:0d   
 1    FToilet             eduroam   eduroam  18:64:72:4c:9a:62   
 2    FToilet             eduroam   eduroam  18:64:72:4c:9a:72   
 3    FToilet             eduroam   eduroam  18:64:72:5e:a9:62   
 4    FToilet             eduroam   eduroam  18:64:72:5e:a9:72   
 ..       ...                 ...       ...                ...   
 382  Stairs1            WLAN-SMU  WLAN-SMU  b0:b8:67:63:6b:82   
 383  Stairs1            WLAN-SMU  WLAN-SMU  b0:b8:67:63:6b:92   
 384  Stairs1            WLAN-SMU  WLAN-SMU  b4:5d:50:fd:b0:91   
 385  Stairs1            WLAN-SMU  WLAN-SMU  b4:5d:50:fd:b4:c1   
 386  Stairs1            WLAN-SMU  WLAN-SMU  b4:5d:50:fd:b4:d1   
 
        security_type   band  bandwidth   phy  rssi   bi  age  
 0               open  5g153         20     a   -92  100  0ms  
 1    wpa2-enterprise    2g1         20     n   -74  100  0ms  
 2   

In [67]:
datasets['mac_s']

Unnamed: 0,location,network_name,ssid,bssid,security_type,band,bandwidth,phy,rssi,bi,age
0,FToilet,BayView Hotel BH 1,BayView,02:0d:3c:21:99:0d,open,5g153,20,a,-92,100,0ms
1,FToilet,eduroam,eduroam,18:64:72:4c:9a:62,wpa2-enterprise,2g1,20,n,-74,100,0ms
2,FToilet,eduroam,eduroam,18:64:72:4c:9a:72,wpa2-enterprise,5g36,40,n/ac,-74,100,0ms
3,FToilet,eduroam,eduroam,18:64:72:5e:a9:62,wpa2-enterprise,2g1,20,n,-87,100,0ms
4,FToilet,eduroam,eduroam,18:64:72:5e:a9:72,wpa2-enterprise,5g149,40,n/ac,-86,100,0ms
...,...,...,...,...,...,...,...,...,...,...,...
382,Stairs1,WLAN-SMU,WLAN-SMU,b0:b8:67:63:6b:82,wpa2-enterprise,2g6,20,n,-69,100,0ms
383,Stairs1,WLAN-SMU,WLAN-SMU,b0:b8:67:63:6b:92,wpa2-enterprise,5g60,40,n/ac,-61,100,0ms
384,Stairs1,WLAN-SMU,WLAN-SMU,b4:5d:50:fd:b0:91,wpa2-enterprise,5g48,40,n/ac,-89,100,0ms
385,Stairs1,WLAN-SMU,WLAN-SMU,b4:5d:50:fd:b4:c1,wpa2-enterprise,2g1,20,n,-86,100,0ms


In [68]:
def standardize_columns(df, is_dataset1=True):
    if is_dataset1:
        # Dataset 1 column renaming
        column_mapping = {
            'Radio type': 'radio_type',  # Change 'Radio type' to 'radio_type'
            'Basic rates (Mbps)': 'basic_rates_mbps',
            'Other rates (Mbps)': 'other_rates_mbps'
        }
        df = df.rename(columns=column_mapping)
        # Convert remaining columns to lowercase
        df.columns = df.columns.str.lower()
        return df
    else:
        # Create mapping for dataset 2 columns to match dataset 1
        column_mapping = {
            'location': 'location',
            'network_name': 'ssid',  # We'll use network_name as SSID
            'ssid': 'ssid_orig',    # Keep original ssid as backup
            'bssid': 'bssid',
            'security_type': 'authentication',
            'band': 'band',         # Keep band for channel extraction
            'bandwidth': 'bandwidth',
            'phy': 'radio_type',    # Map phy to radio_type
            'rssi': 'dbm',          # We'll convert rssi to dBm
            'bi': 'basic_rates_mbps',
            'age': 'other_rates_mbps'
        }
        return df.rename(columns=column_mapping)

In [69]:
def standardize_values(df, is_dataset1=True):
    df_std = df.copy()
    
    if not is_dataset1:
        # Convert security_type to match dataset1 authentication format
        df_std['authentication'] = df_std['authentication'].map({
            'open': 'Open',
            'wpa2-enterprise': 'WPA2-Enterprise'
        })
        
        # Add network type column
        df_std['network type'] = 'Infrastructure'
        
        # Add encryption column
        df_std['encryption'] = 'CCMP'  # Default for WPA2-Enterprise
        df_std.loc[df_std['authentication'] == 'Open', 'encryption'] = None
        
        # Convert rssi to dBm and signal percentage
        df_std['dbm'] = df_std['dbm'].astype(str).str.replace('-', '').astype(float) * -1
        
        # Calculate approximate signal percentage
        def rssi_to_signal(dbm):
            if pd.isna(dbm):
                return None
            # Approximate conversion: -50 dBm ≈ 100%, -100 dBm ≈ 0%
            signal = 2 * (100 + dbm)
            return min(max(signal, 0), 100)  # Clamp between 0 and 100
        
        df_std['signal'] = df_std['dbm'].apply(rssi_to_signal).apply(lambda x: f"{int(x)}%" if pd.notnull(x) else None)
        
        # Extract channel from band column
        def extract_channel(band):
            if pd.isna(band):
                return None
            # Extract channel number after 'g' in the band string
            # Example: '5g140' -> '140'
            match = re.search(r'g(\d+)', str(band))
            return match.group(1) if match else None
        
        df_std['channel'] = df_std['band'].apply(extract_channel)
        
        # Set basic and other rates to None as they're not available in dataset2
        df_std['basic_rates_mbps'] = None
        df_std['other_rates_mbps'] = None
        
        # Convert radio type
        phy_mapping = {
            'n': '802.11n',
            'a': '802.11a',
            'n/ac': '802.11ac'
        }
        df_std['radio_type'] = df_std['radio_type'].map(phy_mapping)
    
    # Final column ordering to match dataset1 with new column names
    columns_order = [
        'location', 'ssid', 'network type', 'authentication', 'encryption',
        'bssid', 'signal', 'dbm', 'radio_type', 'channel',
        'basic_rates_mbps', 'other_rates_mbps'
    ]
    
    # Select only the columns we want in the final output
    df_std = df_std[columns_order]
    
    return df_std

In [70]:
datasets_std = {}
for name, df in datasets.items():
    is_dataset1 = name == 'win'
    df_std = standardize_columns(df, is_dataset1)
    df_std = standardize_values(df_std, is_dataset1)
    datasets_std[name] = df_std

In [71]:
datasets_std['win']

Unnamed: 0,location,ssid,network type,authentication,encryption,bssid,signal,dbm,radio_type,channel,basic_rates_mbps,other_rates_mbps
0,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:41:c1,81%,-59.5,802.11n,6,12 24,36 48 54
1,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,38:17:c3:0c:7f:e1,33%,-83.5,802.11n,6,12 24,36 48 54
2,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:41:91,16%,-92.0,802.11ac,100,12 24,36 48 54
3,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,f0:5c:19:ba:bc:d0,22%,-89.0,802.11ac,161,12 24,36 48 54
4,Stair2,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fe:44:a1,29%,-85.5,802.11n,11,12 24,36 48 54
...,...,...,...,...,...,...,...,...,...,...,...,...
819,MToilet,eduroam,,,,b4:5d:50:fe:41:d2,10%,-95.0,802.11ac,52,12 24,36 48 54
820,MToilet,eduroam,,,,b0:b8:67:63:50:b1,40%,-80.0,802.11ac,153,12 24,36 48 54
821,MToilet,eduroam,,,,18:64:72:5e:a9:72,100%,-50.0,802.11ac,149,12 24,36 48 54
822,MToilet,eduroam,,,,b0:b8:67:63:30:91,40%,-80.0,802.11ac,149,12 24,36 48 54


In [72]:
datasets_std['mac_s']

Unnamed: 0,location,ssid,network type,authentication,encryption,bssid,signal,dbm,radio_type,channel,basic_rates_mbps,other_rates_mbps
0,FToilet,BayView Hotel BH 1,Infrastructure,Open,,02:0d:3c:21:99:0d,16%,-92.0,802.11a,153,,
1,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:4c:9a:62,52%,-74.0,802.11n,1,,
2,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:4c:9a:72,52%,-74.0,802.11ac,36,,
3,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:5e:a9:62,26%,-87.0,802.11n,1,,
4,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:5e:a9:72,28%,-86.0,802.11ac,149,,
...,...,...,...,...,...,...,...,...,...,...,...,...
382,Stairs1,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b0:b8:67:63:6b:82,62%,-69.0,802.11n,6,,
383,Stairs1,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b0:b8:67:63:6b:92,78%,-61.0,802.11ac,60,,
384,Stairs1,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fd:b0:91,22%,-89.0,802.11ac,48,,
385,Stairs1,WLAN-SMU,Infrastructure,WPA2-Enterprise,CCMP,b4:5d:50:fd:b4:c1,28%,-86.0,802.11n,1,,


In [73]:
def get_unique_string_counts_by_column(df: pd.DataFrame) -> pd.DataFrame:
    unique_counts_by_column = {}
    for col in df.columns:
        if df[col].dtype == 'object':
            unique_counts_by_column[col] = df[col].nunique()
    return pd.DataFrame(list(unique_counts_by_column.items()), columns=['Column', 'Unique String Count'])


In [74]:
get_unique_string_counts_by_column(datasets_std['win'])

Unnamed: 0,Column,Unique String Count
0,location,25
1,ssid,109
2,network type,1
3,authentication,2
4,encryption,1
5,bssid,328
6,signal,48
7,radio_type,5
8,basic_rates_mbps,8
9,other_rates_mbps,7


In [75]:
get_unique_string_counts_by_column(datasets_std['mac_s'])

Unnamed: 0,Column,Unique String Count
0,location,6
1,ssid,23
2,network type,1
3,authentication,2
4,encryption,1
5,bssid,139
6,signal,43
7,radio_type,3
8,channel,27
9,basic_rates_mbps,0


In [76]:
def merge_datasets(datasets: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    # Merge all datasets into a single DataFrame
    merged_df = pd.concat(datasets.values(), ignore_index=True)
    return merged_df

In [77]:
merged_df = merge_datasets(datasets_std)
merged_df

Unnamed: 0,location,ssid,network type,authentication,encryption,bssid,signal,dbm,radio_type,channel,basic_rates_mbps,other_rates_mbps
0,FToilet,BayView Hotel BH 1,Infrastructure,Open,,02:0d:3c:21:99:0d,16%,-92.0,802.11a,153,,
1,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:4c:9a:62,52%,-74.0,802.11n,1,,
2,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:4c:9a:72,52%,-74.0,802.11ac,36,,
3,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:5e:a9:62,26%,-87.0,802.11n,1,,
4,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:5e:a9:72,28%,-86.0,802.11ac,149,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3960,MToilet,eduroam,,,,b4:5d:50:fe:41:d2,10%,-95.0,802.11ac,52,12 24,36 48 54
3961,MToilet,eduroam,,,,b0:b8:67:63:50:b1,40%,-80.0,802.11ac,153,12 24,36 48 54
3962,MToilet,eduroam,,,,18:64:72:5e:a9:72,100%,-50.0,802.11ac,149,12 24,36 48 54
3963,MToilet,eduroam,,,,b0:b8:67:63:30:91,40%,-80.0,802.11ac,149,12 24,36 48 54


In [78]:
merged_df.drop(columns=['signal', 'basic_rates_mbps', 'other_rates_mbps'], inplace=True)
merged_df

Unnamed: 0,location,ssid,network type,authentication,encryption,bssid,dbm,radio_type,channel
0,FToilet,BayView Hotel BH 1,Infrastructure,Open,,02:0d:3c:21:99:0d,-92.0,802.11a,153
1,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:4c:9a:62,-74.0,802.11n,1
2,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:4c:9a:72,-74.0,802.11ac,36
3,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:5e:a9:62,-87.0,802.11n,1
4,FToilet,eduroam,Infrastructure,WPA2-Enterprise,CCMP,18:64:72:5e:a9:72,-86.0,802.11ac,149
...,...,...,...,...,...,...,...,...,...
3960,MToilet,eduroam,,,,b4:5d:50:fe:41:d2,-95.0,802.11ac,52
3961,MToilet,eduroam,,,,b0:b8:67:63:50:b1,-80.0,802.11ac,153
3962,MToilet,eduroam,,,,18:64:72:5e:a9:72,-50.0,802.11ac,149
3963,MToilet,eduroam,,,,b0:b8:67:63:30:91,-80.0,802.11ac,149


In [79]:
merged_df.columns

Index(['location', 'ssid', 'network type', 'authentication', 'encryption',
       'bssid', 'dbm', 'radio_type', 'channel'],
      dtype='object')

In [80]:
def list_columns_and_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame(df.dtypes, columns=['Data Type']).reset_index().rename(columns={'index': 'Column'})

In [81]:
list_columns_and_dtypes(merged_df)

Unnamed: 0,Column,Data Type
0,location,object
1,ssid,object
2,network type,object
3,authentication,object
4,encryption,object
5,bssid,object
6,dbm,float64
7,radio_type,object
8,channel,object


In [82]:
merged_df['location'].unique()

array(['FToilet', 'GSR2-1', 'GSR2-3/2', 'GSR2-4', 'PrintingRoom',
       'Stairs1', 'Stair2', 'LW2.1b', 'Lift2', 'Lift1', 'MToilet',
       'Walkway', 'CommonArea', 'Stairs3', 'SR2-4b', 'SR2-4a', 'SR2-3b',
       'GSR2-6', 'SR2-3a', 'SR2-2a', 'SR2-2b', 'SR2-1b', 'SR2-1a',
       'Stairs2', 'LW2.1a'], dtype=object)

In [83]:
output_path = Path.cwd() / 'data' / 'processed' / 'merged_dataset.csv'
merged_df.to_csv(output_path, index=False)

In [84]:
def train_test_split_and_save(df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42, output_dir: str = 'data/processed') -> None:
    # Split the DataFrame into training and testing sets
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    
    # Define output paths
    output_dir_path = Path(output_dir)
    output_dir_path.mkdir(parents=True, exist_ok=True)
    train_output_path = output_dir_path / 'train_dataset.csv'
    test_output_path = output_dir_path / 'test_dataset.csv'
    
    # Save the training and testing sets to CSV files
    train_df.to_csv(train_output_path, index=False)
    test_df.to_csv(test_output_path, index=False)

In [85]:
train_test_split_and_save(merged_df)

In [86]:
merged_df = merged_df.dropna(subset=['ssid'])
filtered_df = merged_df[merged_df['ssid'].str.contains('SMU')]
filtered_df

Unnamed: 0,location,ssid,network type,authentication,encryption,bssid,dbm,radio_type,channel
22,FToilet,SMU_Visitor,Infrastructure,Open,,18:64:72:4c:9a:61,-75.0,802.11n,1
23,FToilet,SMU_Visitor,Infrastructure,Open,,18:64:72:4c:9a:71,-74.0,802.11ac,36
24,FToilet,SMU_Visitor,Infrastructure,Open,,18:64:72:5e:a9:61,-87.0,802.11n,1
25,FToilet,SMU_Visitor,Infrastructure,Open,,18:64:72:5e:a9:71,-86.0,802.11ac,149
26,FToilet,SMU_Visitor,Infrastructure,Open,,38:17:c3:0c:7f:f0,-92.0,802.11ac,161
...,...,...,...,...,...,...,...,...,...
3944,MToilet,SMU_Visitor,,,,b0:b8:67:63:57:50,-95.0,802.11ac,56
3945,MToilet,SMU_Visitor,,,,b4:5d:50:fe:41:d0,-92.5,802.11ac,52
3946,MToilet,SMU_Visitor,,,,b0:b8:67:63:50:b0,-80.0,802.11ac,153
3947,MToilet,SMU_Visitor,,,,18:64:72:5e:a9:71,-50.0,802.11ac,149


In [87]:
filtered_df['ssid'].unique()

array(['SMU_Visitor', 'WLAN-SMU', 'SMU_178902', 'SMU_5G_178902',
       'Pronto Arigato SMU (Guest)'], dtype=object)

In [88]:
filtered_df.to_csv('data/processed/filtered_dataset.csv', index=False)

In [89]:
def get_row_count_by_location(df: pd.DataFrame) -> pd.DataFrame:
    location_counts = df['location'].value_counts().reset_index()
    location_counts.columns = ['Location', 'Row Count']
    return location_counts

In [90]:
get_row_count_by_location(merged_df)

Unnamed: 0,Location,Row Count
0,GSR2-4,287
1,GSR2-3/2,236
2,SR2-3b,231
3,SR2-4b,225
4,GSR2-6,217
5,SR2-4a,212
6,GSR2-1,208
7,FToilet,206
8,CommonArea,184
9,SR2-3a,175


In [91]:
merged_df.describe()

Unnamed: 0,dbm
count,3870.0
mean,-81.421059
std,11.037573
min,-97.5
25%,-89.0
50%,-85.0
75%,-77.0
max,-37.0
