This script maps device vendors to remote endpoint organizations.
Use the iot-py38 interpreter to run this script.

In [None]:
import numpy as np
import pandas as pd
from utils import get_only_sld, unique_domain2org, unique_org_name
from utils import get_sld, get_tld_plus_1, get_device_connections, print_statistics, mapping_first_party

# Paths of the input tables and of the resulting mapping file
file_cleaned_flow = '../../Endpoint Mapping Data/Cleaned Flow/cleaned_flow_stat.csv'
file_combined_domain2org = '../../Endpoint Mapping Data/Domain Data/First Party Mapping/domain2org_all_possible_sources.csv'
file_vendor_orgs = '../../Device Identification/Vendor Data/vendor_org_subsidiary.xlsx'
output_file_location = '../../Endpoint Mapping Data/Domain Data/first_party_mapping.csv'

# Cleaned network flows
cleaned_flow = pd.read_csv(file_cleaned_flow)

# Domain -> organization table: blank out missing values and keep only the
# first row seen for every remote hostname
domain2org = (
    pd.read_csv(file_combined_domain2org)
    .replace(np.nan, '')
    .drop_duplicates(subset=['remote_hostname'])
)

# Parent/subsidiary information of the vendor organizations, without the
# 'device_name' and 'comment' columns
vendor_orgs = pd.read_excel(file_vendor_orgs).drop(columns=['device_name', 'comment'])


def _combined_org_for(record):
    """Combine the organization info of one domain across all sources."""
    return unique_domain2org(
        record.remote_hostname,
        record.python_whois_org,
        record.bash_whois_org,
        record.bash_openssl_org,
        record.copyright_org,
        record.netify_org,
        record.xclusive_org,
    )


def _related_orgs_for(record):
    """Collect the organizations related to one vendor (parent/subsidiaries)."""
    return unique_org_name(
        record.device_vendor,
        record.vendor_organization,
        record.parent_company,
        record.subsidiaries,
    )


# One combined organization value per domain
domain2org['combined_org'] = domain2org.apply(_combined_org_for, axis=1)

# Related organizations per vendor, considering parent-subsidiary relationship
vendor_orgs['related_orgs'] = vendor_orgs.apply(_related_orgs_for, axis=1)


## First Party Mapping
def mapping_parties(vendor_name, remote_hostname):
    """Classify a (vendor, remote hostname) pair via ``mapping_first_party``.

    Looks up the vendor's ``related_orgs`` entry in ``vendor_orgs`` and the
    hostname's ``combined_org`` entry in ``domain2org`` and hands both to
    ``mapping_first_party``.

    Args:
        vendor_name: Value matched against ``vendor_orgs['device_vendor']``.
        remote_hostname: Value matched against
            ``domain2org['remote_hostname']``.

    Returns:
        The result of ``mapping_first_party``, or ``-1`` when either lookup
        or the mapping itself fails.
    """
    try:
        # Series.item() raises ValueError unless exactly one row matches, so
        # unknown or ambiguous vendors/hostnames end up in the fallback below.
        related_orgs = vendor_orgs['related_orgs'][
            vendor_orgs['device_vendor'] == vendor_name].item()
        combined_org = domain2org['combined_org'][
            domain2org['remote_hostname'] == remote_hostname].item()
        party_type = mapping_first_party(related_orgs, combined_org)
    except Exception:
        # Still best-effort, but unlike a bare ``except`` this lets
        # KeyboardInterrupt/SystemExit propagate so a long run can be stopped.
        party_type = -1

    return party_type


# find vendor name of a device,
# either from manual identification (lower-cased) or, when that is missing,
# from gpt identification.
# pd.notna is used instead of the `x == x` NaN trick so that None and pd.NA
# are treated as missing as well.
cleaned_flow['super_vendor'] = cleaned_flow.apply(
    lambda row: row.vendor_name.lower() if pd.notna(row.vendor_name) else row.gpt_clean_vendor,
    axis=1)

# get unique vendor and domain pairs
super_vendor_domain = cleaned_flow[['super_vendor', 'domain']].drop_duplicates()


# todo remove the following statement to map all the domain vendor pair
# The sample size is capped at the number of available pairs because
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement).
super_vendor_domain = super_vendor_domain.sample(
    min(100, len(super_vendor_domain)), random_state=42)


# map vendor and domain pair; mapping_parties yields -1 for pairs it cannot resolve
super_vendor_domain['first_party'] = super_vendor_domain.apply(
    lambda row: mapping_parties(row.super_vendor, row.domain), axis=1)

# todo uncomment to save results to file 
# super_vendor_domain.to_csv(output_file_location, index=False)