### Get organization and identify party
Our goal is to identify which organization owns each contacted domain, and to classify that organization as a first party, a support party, or a third party.




In [1]:
import ipaddress
import pandas as pd
import numpy as np
import os 
import csv
import json
import time
import subprocess
import tldextract

def get_sld(domain):
    """Return the registrable domain (``<domain>.<suffix>``) of *domain*.

    The host name is split with ``tldextract``. Empty components are left
    out of the result, so inputs without a public suffix (IP literals,
    ``localhost``) are returned without a dangling trailing dot, and a bare
    public suffix is returned without a leading dot.
    """
    ext = tldextract.extract(domain)
    # Join only the parts that are present; a plain f-string would yield
    # "localhost." or ".co.uk" when one side is empty.
    return ".".join(part for part in (ext.domain, ext.suffix) if part)

def is_local_address(ip_str):
    """Return True when *ip_str* is an address that should be treated as local.

    IPv6: loopback, unspecified, multicast, link-local and unique-local
    (``fc00::/7``) addresses are local.
    IPv4: private and multicast addresses, the limited broadcast address
    ``255.255.255.255`` and the entries of ``LOCAL_IPS`` are local.

    Anything that cannot be parsed as an IP address yields False.
    """
    # Public addresses explicitly treated as local
    # (presumably testbed hosts -- TODO confirm).
    LOCAL_IPS = ('129.10.227.248', '129.10.227.207')

    try:
        ip = ipaddress.ip_address(ip_str)
    except ValueError:
        return False

    if isinstance(ip, ipaddress.IPv6Address):
        # Use the parsed address rather than comparing against the literal
        # strings "::" / "::1", so non-canonical spellings such as
        # "0:0:0:0:0:0:0:1" are recognised as well.
        return (
            ip.is_loopback
            or ip.is_unspecified
            or ip.is_multicast
            or ip.is_link_local
            or (ip.is_private and ip in ipaddress.IPv6Network("fc00::/7"))
        )

    return (
        ip.is_private
        or ip.is_multicast
        or ip == ipaddress.IPv4Address("255.255.255.255")
        or ip_str in LOCAL_IPS
    )

def check_in_network(network_prefix, ip):
    """Tell whether *ip* lies inside the IPv6 network *network_prefix*.

    Returns False when either argument is None. The prefix is parsed
    non-strictly, so host bits set in the prefix are tolerated.
    """
    if network_prefix is None or ip is None:
        return False
    return ip in ipaddress.IPv6Network(network_prefix, strict=False)

def is_ipv6(address: str) -> bool:
    """Return True only if *address* parses as an IPv6 address.

    IPv4 addresses and unparsable values both give False.
    """
    try:
        parsed = ipaddress.ip_address(address)
    except ValueError:
        return False
    return isinstance(parsed, ipaddress.IPv6Address)
    
def is_valid_ip(i):
    """Return True if *i* can be parsed as an IPv4 or IPv6 address."""
    try:
        ipaddress.ip_address(i)
    except ValueError:
        valid = False
    else:
        valid = True
    return valid

def get_whois_data(domain, timeout=60):
    """Run the ``whois`` command-line tool for *domain* and return its output.

    Args:
        domain: Domain name or IP address to query.
        timeout: Seconds to wait for ``whois`` before giving up; ``None``
            waits indefinitely.

    Returns:
        The decoded standard output of ``whois``, or ``None`` when the query
        is unusable, the tool cannot be started, or it times out.
    """
    # Nothing to query.
    if not domain:
        return None
    # A leading '-' would be parsed by whois as a command-line option
    # instead of a query, so refuse it.
    if isinstance(domain, str) and domain.startswith('-'):
        return None

    try:
        result = subprocess.run(
            ['whois', domain],
            stdout=subprocess.PIPE,
            timeout=timeout,
        )
    except (OSError, ValueError, TypeError, subprocess.SubprocessError):
        # Missing binary, invalid argument, or timeout: best effort, no data.
        return None

    # WHOIS servers do not always answer in UTF-8; replace undecodable bytes
    # instead of discarding the whole response.
    return result.stdout.decode('utf-8', errors='replace')

def privacy_protected_org_name(org:str):
    """Return True if *org* looks like a privacy/proxy placeholder.

    The comparison is case-insensitive and substring based.
    """
    privacy_markers = (
        'data protected',
        'domains by proxy',
        'redacted',
        'not disclosed',
        'protection',
        'privacy',
    )
    lowered = org.lower()
    return any(marker in lowered for marker in privacy_markers)
    

def get_organization_command_line(domain):
    """Extract organization names for *domain* from its WHOIS record.

    Every WHOIS line containing ``Organization`` or ``OrgName`` is inspected;
    the text after the first colon is taken as the organization name. Empty
    values and privacy/proxy placeholders are skipped.

    Returns:
        A non-empty list of organization names in the order they appear, or
        ``None`` when no WHOIS data or no usable name is available.
    """
    whois_data = get_whois_data(domain)
    if not whois_data:
        return None

    org_list = []
    for line in whois_data.split("\n"):
        if 'Organization' not in line and 'OrgName' not in line:
            continue
        # Split on the first colon only, so values that themselves contain
        # a colon (e.g. URLs) are not truncated.
        _, separator, value = line.partition(":")
        if not separator:
            continue
        org = value.strip()
        if not org or privacy_protected_org_name(org):
            continue
        org_list.append(org)

    return org_list or None




def read_input(file):
    """Load the ``hosts`` column of the CSV *file* as a NumPy array.

    Missing values in the column are replaced with empty strings.
    """
    frame = pd.read_csv(file)
    host_column = frame['hosts'].fillna('')
    return np.array(host_column.values)


def read_first_party(file):
    """Read the device -> first-party keyword mapping from *file*.

    Each non-blank line is whitespace separated: the first token is the
    device name, the remaining tokens are its first-party keywords.

    Returns:
        dict mapping device name to a (possibly empty) list of keywords.
        If a device appears more than once, the last line wins.
    """
    device_first_party = {}

    with open(file, 'r') as f:
        for line in f:
            tokens = line.split()
            # Lines read from a file keep their newline and are therefore
            # always truthy; test the tokens so blank lines are skipped
            # instead of raising IndexError.
            if not tokens:
                continue
            device, *first_party = tokens
            device_first_party[device] = first_party
    return device_first_party


def process_single(input_string, device_first_party, domain_org_dict):
    """Resolve the owning organization of a host and classify its party type.

    Args:
        input_string: Domain name or IP address contacted by the device.
        device_first_party: Iterable of first-party keywords for the device.
        domain_org_dict: Cache mapping host -> organization name. It is only
            read here; the caller is responsible for updating it.

    Returns:
        A tuple ``(organization, party, save_org)``:
          * ``organization`` -- the matched organization name (str).
          * ``party`` -- 1 = first party, 2 = support party, 3 = third party.
          * ``save_org`` -- ``None`` when the cache was used, ``1`` when a
            WHOIS lookup was made but returned nothing, otherwise the
            organization name the caller should store in the cache.
    """
    save_org = None

    # Public IP addresses are looked up verbatim; everything else (domain
    # names, local addresses) is normalised to lower case first.
    is_public_ip = is_valid_ip(input_string) and not is_local_address(input_string)
    if not is_public_ip:
        input_string = input_string.lower()

    if input_string in domain_org_dict:
        organization = [domain_org_dict[input_string]]
    else:
        organization = get_organization_command_line(input_string)
        save_org = 1  # marks "a lookup happened"; replaced below on success

    if organization:
        if save_org == 1:
            save_org = organization[0]
            if save_org == 'Charleston Road Registry, Inc.':
                save_org = 'Google Registry'
                # Apply the alias to the working list too, so the result of
                # a fresh lookup matches what a later cached run produces.
                organization[0] = save_org
            print(input_string, organization)
        # The host itself is also matched against the keyword lists.
        # NOTE(review): a fresh lookup matches against every WHOIS
        # organization, a cached one only against the single stored name --
        # confirm whether that asymmetry is intended.
        organization.append(input_string)
    elif input_string in ['amcs-tachyon.com', 'a2z.com', 'fireoscaptiveportal.com']:
        # Hosts with a hard-coded owner when no organization is available.
        organization = ['Amazon Technologies, Inc.']
    else:
        organization = [input_string]

    # Keywords are compared with the lower-cased organization name, so they
    # must be lower case themselves ('Wikimedia' / 'Electric Imp' could
    # never match before).
    support_party_keywords = (
        'aws', 'cloudflare', 'amazon', 'org', 'neu.edu', 'aka', 'digicert',
        'wikimedia', 'cdn', 'akamai', 'cloudfront', 'fastly', 'dns',
        'electric imp',
    )

    # First party: the first keyword (in list order) found in any candidate.
    for first_party in device_first_party:
        keyword = first_party.strip().lower()
        if not keyword:
            # An empty keyword is a substring of everything; ignore it.
            continue
        for org in organization:
            if keyword in org.lower():
                return org, 1, save_org

    # Support party: the first candidate containing any support keyword.
    for org in organization:
        lowered = org.lower()
        if any(keyword in lowered for keyword in support_party_keywords):
            return org, 2, save_org

    # No match: third party, reported under the first candidate name.
    return organization[0], 3, save_org

# Path of the first-party list file; run_get_org_single_exp() passes it to
# read_first_party().
first_party_list = 'first_party_list.txt'


In [None]:
def run_get_org_single_exp(device_domain_dict, output_file, cache_file='domain_org_dict.json'):
    """Resolve organization and party for every device/domain pair of one run.

    Args:
        device_domain_dict: Mapping device name -> iterable of contacted hosts.
        output_file: CSV path receiving ``device, domain, org, party`` rows.
        cache_file: JSON file holding the host -> organization cache. It is
            loaded if present and rewritten after processing.

    Devices missing from the first-party list are reported with ``Error:``
    and skipped.
    """
    if os.path.isfile(cache_file):
        with open(cache_file, 'r') as f:
            domain_org_dict = json.load(f)
    else:
        domain_org_dict = {}

    device_first_party = read_first_party(first_party_list)
    for device in device_domain_dict:
        if device not in device_first_party:
            print('Error:', device)

    tmp_output = []
    try:
        for device_name, domains in device_domain_dict.items():
            if device_name not in device_first_party:
                continue
            first_parties = device_first_party[device_name]
            for domain in domains:
                organization, party, save_org = process_single(
                    domain, first_parties, domain_org_dict)
                tmp_output.append([device_name, domain, organization, party])
                # save_org is None (cache hit), 1 (lookup found nothing) or
                # the organization name to remember.
                if save_org and save_org != 1:
                    # Store under the same key process_single() looks up:
                    # public IPs verbatim, everything else lower-cased.
                    if is_valid_ip(domain) and not is_local_address(domain):
                        cache_key = domain
                    else:
                        cache_key = domain.lower()
                    domain_org_dict.setdefault(cache_key, save_org)
    finally:
        # Persist whatever was resolved, even if the run is interrupted, so
        # the WHOIS lookups do not have to be repeated.
        with open(cache_file, 'w') as f:
            json.dump(domain_org_dict, f, indent=4)

    print('Saving to:', output_file)
    header = ['device', 'domain', 'org', 'party']
    # newline='' is what the csv module requires to avoid blank rows on
    # platforms that translate line endings.
    with open(output_file, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(tmp_output)
                


# Driver: for every year/experiment combination, load the contacted-domain
# list and write the organization/party CSV into a sibling "domain_org"
# directory.
input_dir = "/home/hutr/iot-longitudinal/output_destination/{device_comparison_set}/{year}{exp}/domain_list"
device_comparison_set = ['common_devices', 'all_devices_remove_duplicates']
years = ['2019', '2021', '2022', '2023', '2024']
exps = ['idle', 'activity', 'power']

for year in years:
    for exp in exps:
        # Only the first comparison set ('common_devices') is processed.
        cur_input_dir = input_dir.format(device_comparison_set=device_comparison_set[0], year=year, exp=exp)
        file = f"{cur_input_dir}/contacted_slds_with_all_dns.json"

        # Load device domain dictionary
        with open(file, 'r') as f:
            device_domain_dict = json.load(f)

        # Remove keys with empty values
        device_domain_dict = {k: v for k, v in device_domain_dict.items() if v}

        out_dir = os.path.join(os.path.dirname(cur_input_dir), 'domain_org')
        # Compute the stem outside the f-string: reusing the enclosing quote
        # character inside an f-string is a SyntaxError before Python 3.12.
        base_name = os.path.basename(file).split('.')[0]
        output_file = os.path.join(out_dir, f'{base_name}_org.csv')
        # Create the directory without going through a shell.
        os.makedirs(out_dir, exist_ok=True)
        run_get_org_single_exp(device_domain_dict, output_file)





Saving to: /home/hutr/iot-longitudinal/output_destination/common_devices/2019idle/domain_org/contacted_slds_with_all_dns_org.csv
