In [None]:
# TODO NOTE: this script does not reproduce the original results because the device IDs were removed

import pandas as pd
from utils import create_df_from_dictionary

###################################################
###   category label gpt identified devices     ###
###################################################

## input file locations
# list of devices identified by GPT (parquet)
file_gpt_identified_devices = '../../Device Identification/GPT Identified Device/gpt_identified_devices.parquet'
# coarse-grained category mapping (csv)
file_category_mapping_gpt = '../../Device Identification/Device Category Mapping/Mapping Files/categories_mapping_gpt.csv'
# fine-grained category mapping comments (csv)
file_category_mapping_comments = '../../Device Identification/Device Category Mapping/Mapping Files/categories_mapping_comments.csv'

# both mapping files are trimmed to the same three columns, in this order
mapping_columns = ['src_category', 'generic_category', 'src_vendor']
# mapping-file column name -> matching column name in the GPT device list
mapping_to_device_columns = {'src_vendor': 'gpt_clean_vendor', 'src_category': 'gpt_clean_type'}

# load the two mapping tables, then the device list
gpt_category_mapping_file = pd.read_csv(file_category_mapping_gpt)
gpt_category_mapping_file = gpt_category_mapping_file[mapping_columns]
gpt_category_comment_file = pd.read_csv(file_category_mapping_comments)
gpt_category_comment_file = gpt_category_comment_file[mapping_columns]
gpt_device_file = pd.read_parquet(file_gpt_identified_devices)

# combine the coarse mapping with the fine-grained comments
# NOTE(review): create_df_from_dictionary lives in utils and is not visible here;
# presumably the comment file overrides/extends the coarse mapping - confirm.
device_generic_category_mapped = create_df_from_dictionary(
    gpt_category_mapping_file,
    gpt_category_comment_file,
)
# rename the key columns in place so they line up with the device list
device_generic_category_mapped.rename(columns=mapping_to_device_columns, inplace=True)

# attach the generic category to every device; the left join keeps devices
# that have no entry in the mapping (their generic_category stays empty)
gpt_device_category_merged = gpt_device_file.merge(
    device_generic_category_mapped,
    how='left',
    on=['gpt_clean_vendor', 'gpt_clean_type'],
)

# todo save to file 
# file_result_location = '../../Device Identification/Device Category Mapping/gpt_device_id_generic_category.csv'
# gpt_device_category_merged.to_csv(file_result_location, index=False)


###################################################
### category label manually  identified devices ###
###################################################


# load all manually identified device files
directory = '../../Device Identification/Manual Device Identification/'
file_finger_bank_inspection = directory + 'finger_banks_manual_inspection.csv'
file_netdisco_inspection = directory + 'netdisco_manual_inspection.csv'
file_user_agent_inspection = directory + 'user_agent_manual_inspection.csv'
# low_memory=False: parse the file in one pass rather than in chunks
finger_bank_inspection = pd.read_csv(file_finger_bank_inspection, low_memory=False)
netdisco_inspection = pd.read_csv(file_netdisco_inspection)
user_agent_inspection = pd.read_csv(file_user_agent_inspection)

# columns kept from every manual inspection file
manual_label_columns = ['device_id', 'vendor_name', 'device_category', 'inspected']

# clean files by removing unknown/misc devices
# (the placeholder category is 'Unknown' for fingerbank / user agent and 'Misc' for netdisco)
finger_bank_inspection = finger_bank_inspection.loc[
    finger_bank_inspection['device_category'] != 'Unknown', manual_label_columns
]
netdisco_inspection = netdisco_inspection.loc[
    netdisco_inspection['device_category'] != 'Misc', manual_label_columns
]
user_agent_inspection = user_agent_inspection.loc[
    user_agent_inspection['device_category'] != 'Unknown', manual_label_columns
]

# concat all files and clean
manual_labels = pd.concat(
    [finger_bank_inspection, netdisco_inspection, user_agent_inspection],
    ignore_index=True,
)

# Keep one label per device_id. keep='last' combined with the concat order means
# a user-agent label wins over a netdisco label, which wins over a fingerbank label.
# Rows WITHOUT a device_id are never treated as duplicates of each other: pandas
# considers missing values equal, so deduplicating on the raw column would collapse
# every ID-less row into a single one (which is what happens once IDs are removed).
has_device_id = manual_labels['device_id'].notna()
superseded = has_device_id & manual_labels.duplicated(subset=['device_id'], keep='last')
# .copy() so the column assignment below works on an independent frame
manual_labels = manual_labels[~superseded].copy()

# normalise category spelling, then drop rows that have no vendor
manual_labels['device_category'] = manual_labels['device_category'].str.lower()
manual_labels = manual_labels[manual_labels['vendor_name'].notna()]

## read mapping file
manual_category_mapping = pd.read_csv('../../Device Identification/Device Category Mapping/Mapping Files/category_mapping_manual.csv')[['device_category', 'generic_category']]

# label categories; the left join keeps devices whose category is missing from the mapping
manual_device_category_merged = pd.merge(manual_labels, manual_category_mapping, how='left', on=['device_category'])

# todo save in file 
# file_location = '../../Device Identification/Device Category Mapping/manual_device_category_merged.csv'
# manual_device_category_merged.to_csv(file_location, index=False)