In [65]:
import pandas as pd

In [None]:
def assign_destination(destination: str) -> int:
    """Flag whether a packet is addressed to the broadcast destination.

    Args:
        destination: Value of the packet's destination field.

    Returns:
        1 if ``destination`` equals ``"Broadcast"``, otherwise 0.
    """
    return 1 if destination == "Broadcast" else 0


def check_null_value(string_to_check: str) -> str:
    """Replace a missing value with an empty string.

    Args:
        string_to_check: Cell value that may be missing (as judged by ``pd.isna``).

    Returns:
        ``""`` when the value is missing, otherwise the value unchanged.
    """
    return "" if pd.isna(string_to_check) else string_to_check


def assign_device(device: str, source: str) -> str:
    """Collapse the phone sub-labels into the single label ``"Phone"``.

    Args:
        device: Device label to normalise.
        source: Not used by this function; kept for interface compatibility.

    Returns:
        ``"Phone"`` for ``"Phone"``, ``"Phone Scan"`` and ``"Phone Google"``;
        every other label is returned unchanged.
    """
    phone_labels = ("Phone", "Phone Scan", "Phone Google")
    return "Phone" if device in phone_labels else device


def encode_company_id(company: str, source: str, service_data: str) -> str:
    """Normalise the company identifier of a packet.

    Args:
        company: Raw company identifier; may be missing.
        source: Source value of the packet.
        service_data: Raw service data of the packet; may be missing.

    Returns:
        * ``"Unknown"`` for a six-character identifier starting with ``"0x"``
          (defined but unnamed identifiers, e.g. ``0x34f5``).
        * ``"Samsung Electronics Co. Ltd."`` when the company is missing and
          either the source is ``"Anonymous"`` or the service data starts
          with ``"4a17235"``.
        * ``"Undefined"`` for any other missing company.
        * The company unchanged in all remaining cases.
    """
    # Defined, but not named company identifiers f.e. 0x34f5
    if str(company).startswith("0x") and len(company) == 6:
        return "Unknown"

    # A company that is present is passed through untouched.
    if not pd.isna(company):
        return company

    # Missing company: the Galaxy S22 "Anonymous" source and packets whose
    # service data starts with "4a17235" are attributed to Samsung.
    # NOTE(review): the reason for the service-data prefix rule is not documented.
    if source == "Anonymous" or str(service_data).startswith("4a17235"):
        return "Samsung Electronics Co. Ltd."

    return "Undefined"


def extract_time(delta_string: str) -> int:
    """Parse a timing cell into an integer.

    The literal text ``\\302\\265s`` (the octal-escaped UTF-8 bytes of "µ"
    followed by "s") is stripped before the remainder is converted.

    Args:
        delta_string: Timing cell such as ``"150\\302\\265s"``; may be missing.

    Returns:
        The numeric part as ``int``, or 0 when the cell is missing.

    Raises:
        ValueError: If the remaining text is not a valid integer.
    """
    if pd.isna(delta_string):
        return 0

    numeric_text = delta_string.replace("\\302\\265s", "")
    return int(numeric_text)


def check_company_id_existence(company_id: str) -> int:
    """Flag whether an encoded company identifier is present.

    Args:
        company_id: Encoded company identifier.

    Returns:
        0 if ``company_id`` equals ``"Undefined"``, otherwise 1.
    """
    return 0 if company_id == "Undefined" else 1


def extract_length(entry_to_exctract: str) -> int:
    """Return the length of a cell value, treating missing values as empty.

    Args:
        entry_to_exctract: Cell value; may be missing (as judged by ``pd.isna``).

    Returns:
        0 for a missing value, otherwise ``len()`` of the value.
    """
    return 0 if pd.isna(entry_to_exctract) else len(entry_to_exctract)


def is_adv_channel(channel: int) -> bool:
    """Tell whether a channel index is one of 37, 38 or 39.

    Args:
        channel: Channel index of the packet.

    Returns:
        ``True`` for channels 37, 38 and 39 (treated here as the advertising
        channels), ``False`` otherwise.
    """
    return channel in (37, 38, 39)


def check_entry_existence(entry_to_check: str) -> int:
    """Flag whether a cell holds a value.

    Args:
        entry_to_check: Cell value; may be missing (as judged by ``pd.isna``).

    Returns:
        0 for a missing value, otherwise 1.
    """
    return 0 if pd.isna(entry_to_check) else 1

In [67]:
# Read the input CSV file and rename some columns
def read_data_to_df(file_name: str) -> pd.DataFrame:
    """Load the raw capture CSV and normalise its column names.

    The file is read with ISO-8859-1 encoding. Afterwards the following
    literal substring replacements are applied to every column name, in order:

    1. ``"Packet time (start to end)"``  -> ``"packet_start_end"``
    2. ``"Delta time (end to start)"``   -> ``"delta_end_start"``
    3. ``"Delta time (start to start)"`` -> ``"delta_start_start"``
    4. ``" "``                           -> ``"_"``
    5. ``"Test"``                        -> ``"Device"``

    Args:
        file_name: Path of the CSV file to read.

    Returns:
        The loaded dataframe with renamed columns.
    """
    dataset_df = pd.read_csv(file_name, encoding='ISO-8859-1', low_memory=False)

    # The timing columns must be renamed before blanks become underscores,
    # otherwise their patterns (which contain blanks) would no longer match.
    literal_replacements = (
        ("Packet time (start to end)", "packet_start_end"),
        ("Delta time (end to start)", "delta_end_start"),
        ("Delta time (start to start)", "delta_start_start"),
        (" ", "_"),
        ("Test", "Device"),
    )

    renamed_columns = dataset_df.columns
    for old_text, new_text in literal_replacements:
        # regex=False: the patterns contain parentheses and must be matched
        # literally. Without it, pandas versions that default to regex matching
        # treat "(...)" as a capture group and silently skip the rename.
        renamed_columns = renamed_columns.str.replace(old_text, new_text, regex=False)

    dataset_df.columns = renamed_columns
    return dataset_df

def initialize_source_dictionaries(all_unique_sources: list) -> None:
    """Create a fresh statistics record for every source.

    One entry per source is written into the module-level
    ``source_dictionaries``; an existing entry for the same source is replaced.
    Sources whose string form is ``"nan"`` (missing values) are skipped.

    Args:
        all_unique_sources: Iterable of source identifiers.
    """
    for source in all_unique_sources:
        if str(source) == "nan":
            continue

        source_dictionaries[source] = {
            "count": 0,
            "malformed_count": 0,
            "highest_rssi": 0,
            "lowest_rssi": -100,
            # CHANGED: added average rssi for improved filtering
            "average_rssi": 0,
            # -1 marks "no packet recorded yet"
            "first_occurence": -1,
            "last_occurence": 0,
            "device": "empty",
            "sub_device": "",
        }

In [68]:
# Assign values for each row with either simple transformations or direct value usage
def fill_labelled_columns(packet_data: tuple, device: str, sub_device: str) -> dict:
    """Build the output row dictionary for one packet.

    Args:
        packet_data: One row of the raw dataframe as produced by
            ``DataFrame.itertuples`` (fields are accessed by attribute).
        device: Device label currently assigned to the packet's source.
        sub_device: Sub-label currently assigned to the packet's source.

    Returns:
        A dictionary with the transformed columns of the packet. A row whose
        device label resolves to ``"Phone"`` but whose encoded company id is
        not listed in the module-level ``phone_companies`` gets both
        ``labelled_device`` and ``sublabel_device`` set to ``"empty"``.
    """
    # Encode the company id once and reuse it for both dependent columns.
    company_id = encode_company_id(packet_data.Company_ID, packet_data.Source, packet_data.Service_Data)

    row_data = {
        "time": packet_data.Time,
        "source": packet_data.Source,
        "destination": packet_data.Destination,
        "is_broadcast": assign_destination(packet_data.Destination),
        "length": packet_data.Length,
        "info": packet_data.Info,
        "rssi": packet_data.RSSI,
        "company_id": company_id,
        "has_company_id": check_company_id_existence(company_id),
        "channel": packet_data.Channel_Index,
        "is_adv_channel": is_adv_channel(packet_data.Channel_Index),
        "device_name": check_null_value(packet_data.Device_Name),
        "uuid16": check_null_value(packet_data.UUID_16),
        "has_uuid16": check_entry_existence(packet_data.UUID_16),
        "len_uuid16": extract_length(packet_data.UUID_16),
        "uuid128": check_null_value(packet_data.UUID128),
        "has_uuid128": check_entry_existence(packet_data.UUID128),
        "data": check_null_value(packet_data.Data),
        "len_data": extract_length(packet_data.Data),
        "ad_type": check_null_value(packet_data.Type),
        # Length of the Type field itself (previously measured the Data field by mistake).
        "len_ad_type": extract_length(packet_data.Type),
        "service_data": check_null_value(packet_data.Service_Data),
        "len_service_data": extract_length(packet_data.Service_Data),
        "crc": check_null_value(packet_data.CRC),
        "labelled_device": assign_device(device, packet_data.Source),
        "sublabel_device": sub_device,
        "time_start_end": extract_time(packet_data.packet_start_end),
        "delta_end_start": extract_time(packet_data.delta_end_start),
        "delta_start_start": extract_time(packet_data.delta_start_start)
    }

    # To ensure labeling quality, phone-based labels which do not have a valid company_id are labeled empty
    # Valid company_ids include: ["Samsung Electronics Co. Ltd.", "Undefined", "Unknown"]
    # Invalid company_ids include: Any other company such as "Apple", "Conneqtech B.V." etc.
    if row_data["labelled_device"] == "Phone" and row_data["company_id"] not in phone_companies:
        row_data["labelled_device"] = "empty"
        row_data["sublabel_device"] = "empty"

    return row_data


In [69]:
# Main raw dataset input parsing method
def parse_dataframe(input_dataframe: pd.DataFrame):
    """Walk the raw capture once, update per-source statistics and label rows.

    For every row with a valid source the module-level ``source_dictionaries``
    entry of that source is updated (packet counters, first/last occurrence,
    RSSI extremes, mean RSSI and device labels). Every row that is neither
    source-less nor malformed is transformed with ``fill_labelled_columns`` and
    appended to the module-level ``final_list``.

    The label rules are evaluated in a fixed order and a later match overwrites
    an earlier one. The label handed to ``fill_labelled_columns`` is the label
    the source carries at the time the row is processed.

    Args:
        input_dataframe: Raw dataframe as returned by ``read_data_to_df``.
            Every non-missing source must already have an entry in
            ``source_dictionaries``.

    Returns:
        Tuple ``(nan_discarded_counter, malformed_discarded_counter)``: the
        number of rows skipped because the source is missing, and the number
        skipped because ``Info`` contains ``"Malformed Packet"``.
    """
    # Define some helper variables for tracking of outliers
    nan_discarded_counter = 0
    malformed_discarded_counter = 0

    # Mean RSSI per source, computed over all rows of the source (this happens
    # before malformed rows are filtered out below). A "dBm" suffix is removed
    # and unparsable values are ignored by the mean.
    dict_mean_rssi = (
        pd.to_numeric(
            input_dataframe["RSSI"].astype(str).str.replace("dBm", "", regex=False).str.strip(),
            errors="coerce"
        )
        .groupby(input_dataframe["Source"])
        .mean()
        .to_dict()
    )

    for row in input_dataframe.itertuples(index=False):
        # Skip the source "nan" and keep track of packets
        if str(row.Source) == "nan":
            nan_discarded_counter += 1
            continue

        source_stats = source_dictionaries[row.Source]

        # Any source != "nan" is valid, therefore we can increase the counter of source packets here
        source_stats["count"] += 1

        # Skip malformed packets and keep track of occurrences
        if "Malformed Packet" in row.Info:
            source_stats["malformed_count"] += 1
            malformed_discarded_counter += 1
            continue

        current_rssi = row.RSSI

        # Track the first and last time the source was seen
        is_first_valid_packet = source_stats["first_occurence"] == -1
        if is_first_valid_packet:
            source_stats["first_occurence"] = row.Time
        if row.Time > source_stats["last_occurence"]:
            source_stats["last_occurence"] = row.Time

        # Track the RSSI extremes. Both are seeded from the first valid packet
        # (or re-seeded while the stored value is still missing), so the
        # placeholder start values never take part in a comparison.
        # "highest_rssi" is the maximum, "lowest_rssi" the minimum observed value.
        if (is_first_valid_packet or pd.isna(source_stats["highest_rssi"])
                or current_rssi > source_stats["highest_rssi"]):
            source_stats["highest_rssi"] = current_rssi
        if (is_first_valid_packet or pd.isna(source_stats["lowest_rssi"])
                or current_rssi < source_stats["lowest_rssi"]):
            source_stats["lowest_rssi"] = current_rssi

        # CHANGED: inserted average rssi
        if source_stats["average_rssi"] == 0:
            source_stats["average_rssi"] = dict_mean_rssi[row.Source]

        # Label conditions for the smart tag
        if row.Device_Name == "Smart Tag":
            source_stats["device"] = "Smart Tag"
            source_stats["sub_device"] = "Smart Tag"
        if row.UUID_16 == "Samsung Electronics Co., Ltd.,Samsung Electronics Co., Ltd.":
            source_stats["device"] = "Smart Tag"
            source_stats["sub_device"] = "Smart Tag"

        # Label conditions for the phone
        if row.UUID_16 == "Google LLC" and row.Length in (63, 42, 120):
            source_stats["device"] = "Phone"
            source_stats["sub_device"] = "Phone Google"
        if row.Info == "ADV_EXT_IND" and row.Length == 39:
            source_stats["device"] = "Phone"
            source_stats["sub_device"] = "Phone Phone"
        if row.UUID_16 == "Samsung Electronics Co. Ltd." and row.Length == 57:
            source_stats["device"] = "Phone"
            source_stats["sub_device"] = "Phone Phone"
        if row.Info == "SCAN_REQ" and row.Length == 38 and row.Destination in target_group_rsp_scrs:
            source_stats["device"] = "Phone"
            source_stats["sub_device"] = "Phone Scan"

        # Label conditions for the buds
        if row.Source == "dc:b9:92:eb:f6:5d":
            source_stats["device"] = "Buds"
            source_stats["sub_device"] = "Buds"

        # After assigning all values, create a transformed row for the final list of rows
        row_data = fill_labelled_columns(row, source_stats["device"], source_stats["sub_device"])
        final_list.append(row_data)

    return nan_discarded_counter, malformed_discarded_counter

In [70]:
def find_empty_and_malformed_sources():
    """Classify the sources of ``source_dictionaries`` by their valid packets.

    A source is "malformed-only" when its packet count equals its malformed
    count. A source is "empty" when it has at least one non-malformed packet
    and its device label is still ``"empty"``. The size of both groups is
    printed.

    Returns:
        Tuple ``(only_malformed_sources, empty_sources)`` of two lists of
        source identifiers, in the iteration order of ``source_dictionaries``.
    """
    only_malformed_sources = []
    empty_sources = []

    for source, stats in source_dictionaries.items():
        valid_packets = stats["count"] - stats["malformed_count"]

        if valid_packets == 0:
            # If there are only malformed packets, a source is "malformed-only"
            only_malformed_sources.append(source)
        elif stats["device"] == "empty":
            # Not part of the target group, but contains at least 1 valid packet
            empty_sources.append(source)

    print(f"Only Malformed Sources: {len(only_malformed_sources)}")
    print(f"Empty Sources: {len(empty_sources)}")

    return only_malformed_sources, empty_sources

# Reassign sources due to non-deterministic BLE packet contents
# --> Extensively described in the thesis
def reassign_sources():
    """Give "empty" rows the label their source ended up with.

    Makes a single pass over the module-level ``final_list``. Every row
    labelled ``"empty"`` whose company id is listed in ``phone_companies`` has
    its ``labelled_device`` overwritten in place with the ``device`` stored for
    its source in ``source_dictionaries``. Only ``labelled_device`` is changed.

    Returns:
        Tuple ``(empty_occurrences, new_final_list)``: the number of rows still
        labelled ``"empty"`` afterwards, and a new list holding the same row
        dictionaries in the same order as ``final_list``.
    """
    for row_dict in final_list:
        is_relabel_candidate = (
            row_dict["labelled_device"] == "empty"
            and row_dict["company_id"] in phone_companies
        )
        if is_relabel_candidate:
            row_dict["labelled_device"] = source_dictionaries[row_dict["source"]]["device"]

    # No row is dropped; the returned list is a shallow copy of final_list.
    new_final_list = list(final_list)
    empty_occurrences = sum(
        1 for row_dict in new_final_list if row_dict["labelled_device"] == "empty"
    )

    return empty_occurrences, new_final_list


In [71]:
# Create the final dataframe based on the given list of row dictionaries
def create_labelled_dataframe(entry_list: list) -> pd.DataFrame:
    """Turn a list of row dictionaries into a dataframe.

    The column order is taken from the keys of the first entry of
    ``entry_list`` itself, so the function does not depend on module state.

    Args:
        entry_list: Row dictionaries, e.g. as produced by ``fill_labelled_columns``.

    Returns:
        A dataframe with one row per entry, or an empty dataframe when
        ``entry_list`` is empty.
    """
    if not entry_list:
        return pd.DataFrame()

    column_order = list(entry_list[0].keys())
    return pd.DataFrame(data=entry_list, columns=column_order)

# Write the dataframe to a file
def write_new_dataframe(file_to_write: str, df_to_write: pd.DataFrame):
    """Store a dataframe as a UTF-8 CSV file without the index column.

    A confirmation line naming the target file is printed afterwards.

    Args:
        file_to_write: Path of the CSV file to create or overwrite.
        df_to_write: Dataframe to store.
    """
    df_to_write.to_csv(path_or_buf=file_to_write, index=False, encoding='utf-8')
    print(f"New CSV written as: {file_to_write}")

# Extract information about the sources of a label
def info_extractor(df_input: pd.DataFrame, wanted_label: str):
    """Collect timing and packet-count information per source of one sub-label.

    Only rows whose ``sublabel_device`` equals ``wanted_label`` are considered.

    Args:
        df_input: Labelled dataframe with the columns ``source``, ``time`` and
            ``sublabel_device``.
        wanted_label: Sub-label to collect sources for.

    Returns:
        Dictionary keyed by source. Each value holds ``"first"`` (time of the
        first matching row in dataframe order), ``"last"`` (largest time among
        the matching rows) and ``"count"`` (the packet count stored for the
        source in the module-level ``source_dictionaries``).
    """
    seen_sources = {}

    for row in df_input.itertuples(index=False):
        if row.sublabel_device != wanted_label:
            continue

        entry = seen_sources.get(row.source)
        if entry is None:
            seen_sources[row.source] = {
                "first": row.time,
                "last": row.time,
                "count": source_dictionaries[row.source]["count"],
            }
        elif entry["last"] < row.time:
            entry["last"] = row.time

    return seen_sources

def assign_majority_label(input_df: pd.DataFrame):
    """Give every row of a source the most frequent label of that source.

    The ``labelled_device`` column of ``input_df`` is overwritten in place.
    When several labels are equally frequent, the first entry returned by
    ``Series.mode()`` is used.

    Args:
        input_df: Dataframe with the columns ``source`` and ``labelled_device``.

    Returns:
        The same dataframe object that was passed in.
    """
    def _most_frequent(labels):
        return labels.mode()[0]

    # Majority label per source, then broadcast back onto the individual rows
    majority_by_source = input_df.groupby('source')['labelled_device'].agg(_most_frequent)
    input_df['labelled_device'] = input_df['source'].map(majority_by_source)

    return input_df


In [72]:
##############################
#        CODE FIELD 0        #
##############################

# Global variables
# Transformed row dictionaries; parse_dataframe appends one entry per kept packet
final_list = []
# Per-source statistics and labels, keyed by the value of the Source column
# (entries are created by initialize_source_dictionaries)
source_dictionaries = {}
# Company ids accepted for phone-labelled rows
# (checked in fill_labelled_columns and reassign_sources)
phone_companies = ["Samsung Electronics Co. Ltd.", "Undefined", "Unknown"]
# NOTE(review): not referenced anywhere in the visible code
undefined_list = []

In [None]:
##############################
#        CODE FIELD 1        #
##############################

# Setup all lists, dictionaries and the labeled dataframe
# Available datasets: samsung_isolated_group2_6h_raw, samsung_isolated_group3_6h_raw, samsung_uni_group3_2h_raw
path_to_file = "../data/samsung/samsung_uni_group3_2h_raw.csv"
raw_dataframe = read_data_to_df(path_to_file)
all_sources = raw_dataframe["Source"].unique()

# Sources that sent a SCAN_RSP while carrying the device name "Smart Tag" or "Buds FE";
# parse_dataframe uses them to recognise SCAN_REQ packets addressed to the target group
is_scan_response = raw_dataframe["Info"] == "SCAN_RSP"
has_target_device_name = raw_dataframe["Device_Name"].isin(["Smart Tag", "Buds FE"])
target_group_rsp_scrs = raw_dataframe.loc[is_scan_response & has_target_device_name, "Source"].unique()

# Build the per-source records, label every packet and collect the labelled rows
initialize_source_dictionaries(all_sources)
nan_counter, malformed_counter = parse_dataframe(raw_dataframe)
malformed_list, empty_list = find_empty_and_malformed_sources()
empty_counter, filtered_list = reassign_sources()
labelled_dataframe = create_labelled_dataframe(filtered_list)
relevant_sources = labelled_dataframe["source"].unique()

Only Malformed Sources: 117308
Empty Sources: 165632


In [None]:
##############################
#        CODE FIELD 2        #
##############################

# Each source is filtered according to its average RSSI value and count and re-labeled if needed
# RSSI limits per dataset (keyed by the file name without extension) and per sub-label.
# Note that "samsung_isolated_group3_6h_raw" defines no limit for "Phone Phone",
# so that sub-label is not filtered for this dataset.
rssi_limits_per_dataset =  {"samsung_isolated_group2_6h_raw": {"Phone Google": -49,
                                                        "Phone Scan": -33,
                                                        "Phone Phone": -45,
                                                        "Smart Tag": -39},
                            "samsung_isolated_group3_6h_raw": {"Phone Google": -36,
                                                             "Phone Scan": -17,
                                                             "Smart Tag": -24},
                            "samsung_uni_group3_2h_raw": {"Phone Google": -32,
                                                        "Phone Scan": -18,
                                                        "Phone Phone": -27,
                                                        "Smart Tag": -20}
                            }

# Select the limits of the dataset that is currently loaded (raises KeyError for an unknown file name)
rssi_limits = rssi_limits_per_dataset[path_to_file.split("/")[-1].split(".")[0]]

for label in rssi_limits.keys():
    # All sources that have at least one row with this sub-label
    sources = info_extractor(labelled_dataframe, label)
    # Sources of this sub-label that pass the count and RSSI filter
    actual_target_group_sources = []
    for source in sources.keys():
        # NOTE(review): first, last, max_rssi and low_rssi are assigned but not used below
        first = round(sources[source]["first"], 3)
        last = round(sources[source]["last"], 3)
        # Packet count as stored in source_dictionaries (taken over by info_extractor)
        count = sources[source]["count"]
        max_rssi = source_dictionaries[source]["highest_rssi"]
        low_rssi = source_dictionaries[source]["lowest_rssi"]
        avg_rssi = source_dictionaries[source]["average_rssi"]

        # A source is only kept when its final sub_device still equals the label,
        # it has more than 30 packets and its average RSSI reaches the limit.
        # NOTE(review): "Phone Google" uses a strict ">" while the other labels use ">=" -
        # confirm whether this difference is intended.

        # Find Phone Google labeled sources
        if label == "Phone Google" and source_dictionaries[source]["sub_device"] == label:
            if count > 30 and avg_rssi > rssi_limits["Phone Google"]: 
                actual_target_group_sources.append(source)

        # Find Phone Scan labeled sources
        if label == "Phone Scan" and source_dictionaries[source]["sub_device"] == label:
            if count > 30 and avg_rssi >= rssi_limits["Phone Scan"]:
                actual_target_group_sources.append(source)

        # Find Phone Phone labeled sources
        if label == "Phone Phone" and source_dictionaries[source]["sub_device"] == label:
            if count > 30 and avg_rssi >= rssi_limits["Phone Phone"]:
                actual_target_group_sources.append(source)
        
        
        # Find Smart Tag labeled sources
        if label == "Smart Tag" and source_dictionaries[source]["sub_device"] == label:
            if count > 30 and avg_rssi >= rssi_limits["Smart Tag"]:
                actual_target_group_sources.append(source)

    # Reassign the labels based on the above conditions
    # Every source with at least one row of this sub-label that did not pass the filter
    # gets ALL of its rows set to 'empty'. Only 'labelled_device' is changed;
    # 'sublabel_device' keeps its value.
    current_target_group_sources = labelled_dataframe[labelled_dataframe['sublabel_device'] == label]["source"]
    diff_sources = set(current_target_group_sources) - set(actual_target_group_sources)
    labelled_dataframe.loc[labelled_dataframe['source'].isin(diff_sources), 'labelled_device'] = 'empty'

# Print information about dataframe length
print("--------------------------------------------------")
print("FINAL DATAFRAME\n")
print(f"Length Raw Dataframe: {len(raw_dataframe)}")
print(f"Length Final Dataframe: {len(labelled_dataframe)}")
print(f"Removed: {malformed_counter} (malformed), {nan_counter} (NaN), Diff: {len(raw_dataframe)-len(labelled_dataframe)}")

# Assign the majority label to impure sources (for example a phone source may have 3000 "Phone" labels and 1 "empty" labels) --> apply majority label
labelled_dataframe = assign_majority_label(labelled_dataframe)


--------------------------------------------------
FINAL DATAFRAME

Length Raw Dataframe: 3632931
Length Final Dataframe: 3340540
Removed: 261153 (malformed), 31238 (NaN), Diff: 292391


In [76]:
##############################
#        CODE FIELD 3        #
##############################

# Number of packets and distinct sources per device label, largest label first
label_summary = (
    labelled_dataframe
    .groupby("labelled_device")
    .agg(packets=("labelled_device", "size"), sources=("source", "nunique"))
    .sort_values(by="packets", ascending=False)
)
print("\n", label_summary)

# Check if there are still impure sources which have multiple labels
# After assigning the majority label this output should be empty
labels_per_source = labelled_dataframe.groupby('source')['labelled_device'].nunique()
multi_label_sources = labels_per_source[labels_per_source > 1].index

for source in multi_label_sources:
    rows_of_source = labelled_dataframe["source"] == source
    print("\n", labelled_dataframe.loc[rows_of_source, "labelled_device"].value_counts())


                  packets  sources
labelled_device                  
empty            3078478   165907
Phone             203347       78
Buds               52708        1
Smart Tag           6007       17


In [None]:
def group_by_src(df):
    """Summarise a labelled dataframe per source.

    Args:
        df: Dataframe with the columns ``source``, ``rssi``, ``length`` and
            ``labelled_device``.

    Returns:
        A dataframe with one row per source, sorted by packet count in
        descending order and re-indexed from 0. Columns: ``source``, ``count``
        (non-missing RSSI values), ``RSSI_min``, ``RSSI_max``, ``RSSI_avg``,
        ``length_counts`` (dict mapping packet length to its frequency) and
        ``label`` (the distinct device labels of the source).
    """
    def _length_histogram(lengths):
        return lengths.value_counts().to_dict()

    def _distinct_labels(labels):
        return labels.unique()

    per_source = df.groupby("source").agg(
        count=("rssi", "count"),
        RSSI_min=("rssi", "min"),
        RSSI_max=("rssi", "max"),
        RSSI_avg=("rssi", "mean"),
        length_counts=("length", _length_histogram),
        label=("labelled_device", _distinct_labels),
    )

    ranked = per_source.reset_index().sort_values(by="count", ascending=False)
    return ranked.reset_index(drop=True)

In [None]:
##############################
#        CODE FIELD 4        #
##############################

# Dataset name without directory and extension, e.g. "samsung_uni_group3_2h_raw"
dataset_name = path_to_file.split("/")[-1].split(".")[0]

# remove noise entries from isolated dataset
if dataset_name.split("_")[1] == "isolated":
    print(group_by_src(labelled_dataframe[labelled_dataframe["labelled_device"] == "empty"]).to_string())
    # remove rows with empty device label
    to_drop = (labelled_dataframe["labelled_device"] == "empty")
    labelled_dataframe = labelled_dataframe.drop(labelled_dataframe[to_drop].index)
    # Computed outside the f-string: reusing double quotes inside a double-quoted
    # f-string is a SyntaxError before Python 3.12.
    remaining_empty = len(labelled_dataframe[labelled_dataframe["labelled_device"] == "empty"])
    print(f"Number of entries with label 'empty': {remaining_empty}")

# save labelled dataframe as csv
# The file name is the dataset name without its last "_"-separated part (e.g. "_raw"),
# followed by "_labeled". It is written next to the input data; the previous
# target "D../data/..." carried a stray leading "D".
new_file_name = "_".join(dataset_name.split("_")[:-1])
labelled_dataframe.to_csv(f"../data/samsung/{new_file_name}_labeled.csv", index=False)