In [None]:
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.2-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.6/97.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.2


In [None]:
# Check TLD
import pandas as pd
import tldextract

# Source dataset; the 'url' column is the only one this cell reads.
csv_file = './drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https.csv'
df = pd.read_csv(csv_file)


def _suffix_of(url):
    """Return the suffix part that tldextract reports for `url`."""
    return tldextract.extract(url).suffix


# One suffix per URL, then the distinct suffixes.
tlds = df['url'].apply(_suffix_of)
unique_tlds = tlds.unique()

# Report the count first, then the suffixes themselves.
print(f"Number of unique TLDs: {len(unique_tlds)}")
print("Unique TLDs:")
print(unique_tlds)


Number of unique TLDs: 835
Unique TLDs:
['online' 'in' 'biz' 'ru' 'com' 'ge' 'ir' 'org' 'site' 'top' 'cloud' 'net'
 'fi' 'cc' 'com.ng' 'edu.ar' 'work' 'com.tr' 'lk' 'co.mz' 'ro' 'store'
 'tk' 'pro' 'sbs' 'icu' 'xyz' 'io' 'my.id' 'gr' 'rs' 'aero' 'me' 'de'
 'club' 'com.br' 'shop' 'ai' 'it' 'hu' 'co.in' 'live' 'ltd' 'biz.id' 'pm'
 'website' 'cz' 'cn' 'eu' 'center' 'com.au' 'org.ar' 'dev' 'fr' 'com.ar'
 'com.pe' 'id' 'com.mx' 'co.nz' 'nl' 'app' 'fund' 'co.uk' 'pl' 'com.ba'
 'ca' 'info' 'com.pa' 'hk' 'co.za' 'si' 'cl' 'edu.mx' 'host' 'news'
 'edu.pe' 'ac.id' 'gov.ph' 'co' 'co.il' 'com.tw' 'com.co' 'net.pl' 'nu'
 'es' 'ch' 'link' 'tn' 'com.lb' 'tech' 'buzz' 'tv' 'az' 'be' 'bio' 'vc'
 'ae' 'pr.gov.br' 'world' 'vn' 'wf' 'ppg.br' 'com.sg' 'go.id' 'com.sv'
 'pw' 'czest.pl' 'us' 'pk' 'edu.vn' 'br' 'uz' 'click' 'space' 'com.vn'
 'com.pk' 'ms' 'com.bd' 'cfd' 'gd' 'one' 'se' 'net.vn' 'com.ua' 'web.id'
 'blog' 'zip' 'foundation' 'my' 'sk' 'lat' 'com.cn' 'page' 'pt' 'mx'
 'ac.ir' 'no' 'mw' 'vip' 'by'

In [None]:
# Certificate Features (it has duplicate issue)
import os
import json
from datetime import datetime
import pandas as pd
from urllib.parse import urlparse
import category_encoders as ce
from tqdm import tqdm

# Directory containing one "<hostname>.json" certificate file per domain
directory = "/Users/herwonowr/Downloads/output"

# Load URLs and labels from dataset_url_https.csv into a DataFrame
dataset_path = "/Users/herwonowr/Downloads/dataset_url_https.csv"
dataset_df = pd.read_csv(dataset_path)
pd.set_option('future.no_silent_downcasting', True)

# For the testing, only load sample data
dataset_df = dataset_df.sample(n=500, random_state=1)

# Map every hostname to its (url, label) pair for quick lookups.
# NOTE(review): when several URLs share a hostname the later one overwrites
# the earlier one, so only one URL per hostname reaches the output.
url_domain_map = {urlparse(url).hostname: (url, label) for url, label in zip(dataset_df['url'], dataset_df['label'])}
domains_set = set(urlparse(url).hostname for url in dataset_df['url'])

# Initialize list to hold data
features_list = []
certificates_list = []

# List the directory once; calling os.listdir() inside the loop re-read the
# whole directory for every domain.
available_json_files = set(os.listdir(directory))

# Collect the certificate entries of every domain that has a JSON file
for domain in tqdm(domains_set, desc="Processing domains"):
    json_filename = f"{domain}.json"
    if json_filename not in available_json_files:
        continue

    with open(os.path.join(directory, json_filename), 'r') as file:
        data = json.load(file)

    if not data:
        # Empty file: the domain has no certificates. No default row is added
        # here on purpose - the feature-extraction loop further down already
        # appends the all-zero row for every domain missing from the
        # certificate table, so adding one here as well produced two identical
        # rows for each empty file.
        continue

    for entry in data:
        # Tag each certificate with the domain whose file it came from
        entry['domain'] = domain
        certificates_list.append(entry)

# One row per certificate entry gathered above
df = pd.DataFrame(certificates_list)

# --- common_name: fit on every certificate, then record the encoded width ---
encoder_common_names = ce.BinaryEncoder(cols=['common_name'])
encoded_common_names_df = encoder_common_names.fit_transform(df['common_name'])
max_common_names_columns = len(encoded_common_names_df.columns)
encoded_common_names_df.columns = [f'tls_common_names_{i}' for i in range(max_common_names_columns)]

# --- issuer_name: same steps with a separate encoder ---
encoder_issuers = ce.BinaryEncoder(cols=['issuer_name'])
encoded_issuers_df = encoder_issuers.fit_transform(df['issuer_name'])
max_issuers_columns = len(encoded_issuers_df.columns)
encoded_issuers_df.columns = [f'tls_issuers_{i}' for i in range(max_issuers_columns)]

# Domains that own at least one row in the certificate table. Built once so the
# loop below does not recompute df['domain'].unique() for every domain.
domains_with_certificates = set(df['domain'].unique())

# Process each domain to extract features
for domain in tqdm(domains_set, desc="Extracting features"):
    if domain not in domains_with_certificates:
        # Handle case where there are no certificates for a domain
        original_url, label = url_domain_map[domain]
        features_list.append({
            "url": original_url,
            "domain": domain,
            "label": label,
            "tls_lifetime": 0,
            "tls_average_inter_arrival_time": 0,
            "tls_number_of_certificates": 0,
            "tls_average_san_list_size": 0,
            "tls_average_validity_period": 0,
        })
        continue

    domain_certs = df[df['domain'] == domain]
    if domain_certs.empty:
        continue

    # Order by issue date (the not_before values are sorted as stored)
    domain_certs = domain_certs.sort_values(by="not_before")

    # Parse every date once and reuse the parsed values below
    issue_dates = [datetime.fromisoformat(value) for value in domain_certs["not_before"]]
    expiry_dates = [datetime.fromisoformat(value) for value in domain_certs["not_after"]]

    # Days from the first issue date to the expiry of the last-issued certificate
    lifetime = (expiry_dates[-1] - issue_dates[0]).days

    # Days between consecutive issue dates; empty when there is one certificate
    inter_arrival_times = [
        (later - earlier).days
        for earlier, later in zip(issue_dates, issue_dates[1:])
    ]
    average_inter_arrival_time = sum(inter_arrival_times) / len(inter_arrival_times) if inter_arrival_times else 0

    number_of_certificates = len(domain_certs)

    # name_value is counted as newline-separated entries
    san_list_sizes = [len(cert.split("\n")) for cert in domain_certs["name_value"]]
    average_san_list_size = sum(san_list_sizes) / len(san_list_sizes)

    issuers = list(set(domain_certs["issuer_name"].tolist()))
    common_names = list(set(domain_certs["common_name"].tolist()))

    # Perform binary encoding for common_names and issuers
    encoded_common_names = encoder_common_names.transform(pd.DataFrame(common_names, columns=['common_name']))
    encoded_issuers = encoder_issuers.transform(pd.DataFrame(issuers, columns=['issuer_name']))

    # Rename column
    encoded_common_names.columns = [f'tls_common_names_{i}' for i in range(max_common_names_columns)]
    encoded_issuers.columns = [f'tls_issuers_{i}' for i in range(max_issuers_columns)]

    # Sum the binary encoded values over the domain's distinct names/issuers
    encoded_common_names_sum = encoded_common_names.sum().to_dict()
    encoded_issuers_sum = encoded_issuers.sum().to_dict()

    # Validity period of each certificate in days
    validity_periods = [
        (expires - issued).days
        for issued, expires in zip(issue_dates, expiry_dates)
    ]
    average_validity_period = sum(validity_periods) / len(validity_periods)

    original_url, label = url_domain_map[domain]
    features = {
        "url": original_url,
        "domain": domain,
        "label": label,
        "tls_lifetime": lifetime,
        "tls_average_inter_arrival_time": average_inter_arrival_time,
        "tls_number_of_certificates": number_of_certificates,
        "tls_average_san_list_size": average_san_list_size,
        "tls_average_validity_period": average_validity_period,
    }

    # Update features with encoded sums
    features.update(encoded_common_names_sum)
    features.update(encoded_issuers_sum)

    # Append the features to the list
    features_list.append(features)

# Rows built without certificates carry no encoded columns. Give every row the
# full set of tls_common_names_* and tls_issuers_* keys, defaulting to 0.0.
encoded_column_names = [f'tls_common_names_{i}' for i in range(max_common_names_columns)]
encoded_column_names += [f'tls_issuers_{i}' for i in range(max_issuers_columns)]

for feature in features_list:
    for col_name in encoded_column_names:
        feature.setdefault(col_name, 0.0)

# Assemble the feature table and write it next to the notebook
features_df = pd.DataFrame(features_list)
features_df.to_csv('dataset_url_https_tls.csv', index=False)

#  'tls_lifetime',
#  'tls_average_inter_arrival_time',
#  'tls_number_of_certificates',
#  'tls_average_san_list_size',
#  'tls_average_validity_period',
#  'tls_number_common_names_distinct',
#  'tls_number_issuers_distinct',

In [None]:
# Fix duplicate and index issue in Certificate features extraction
import pandas as pd

# Load the dataset
dataset_path = "./drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https_tls.csv"
df = pd.read_csv(dataset_path)
pd.set_option('future.no_silent_downcasting', True)

# Drop the 'idx' column when the file has one. errors='ignore' keeps the cell
# runnable on exports without that column, where a plain drop raises KeyError.
df = df.drop(columns=['idx'], errors='ignore')

# Drop rows that are identical in every column
df_cleaned = df.drop_duplicates()

# Save the cleaned dataset to a new file
cleaned_dataset_path = "./drive/MyDrive/Colab Notebooks/iteration-v2/cleaned_dataset_url_https_tls.csv"
df_cleaned.to_csv(cleaned_dataset_path, index=False)

print(f"Cleaned dataset saved to {cleaned_dataset_path}")

In [None]:
# Lexical Features v2
import pandas as pd
import urllib.parse
import re
import math
from collections import Counter
import tldextract
import idna

# Function to calculate Shannon entropy
def shannon_entropy(data):
    """Return the Shannon entropy, in bits, of the symbols in `data`.

    Args:
        data: A sized iterable of hashable symbols, e.g. a string or a list
            of characters.

    Returns:
        The entropy as a float, or the integer 0 when `data` is empty.
    """
    if not data:
        return 0

    total = len(data)
    result = 0
    # Subtract each symbol's p * log2(p) term in Counter order.
    for occurrences in Counter(data).values():
        probability = occurrences / total
        result -= probability * math.log2(probability)
    return result

# Function to extract features from a hostname
def extract_hostname_features(url, hostname):
    """Build the lexical feature row for one hostname.

    Args:
        url: Original URL; stored unchanged under the "url" key.
        hostname: Host part of `url`. The density features divide by its
            length, so it must not be empty.

    Returns:
        dict mapping feature names to values. All features are computed from
        the IDNA-encoded hostname, or from `hostname` itself when IDNA
        encoding raises idna.IDNAError.
    """
    vowels = 'aeiouAEIOU'

    def _stats(values):
        # (sum, mean, max, min) of a list; an empty list yields zeros.
        total = sum(values)
        return total, total / (len(values) or 1), max(values, default=0), min(values, default=0)

    # IDNA-encode the hostname; fall back to the raw value when that fails.
    try:
        ascii_host = idna.encode(hostname).decode('ascii')
    except idna.IDNAError:
        ascii_host = hostname
        uses_punycode = 0
    else:
        # Only consumed by the disabled "uses_punycode" feature at the end.
        uses_punycode = int(ascii_host != hostname)

    parts = tldextract.extract(ascii_host)
    registered = f"{parts.domain}.{parts.suffix}"
    labels = parts.subdomain.split('.') if parts.subdomain else []
    tokens = ascii_host.split('.')

    # Character classes of the whole hostname, collected once.
    host_vowels = [ch for ch in ascii_host if ch in vowels]
    host_consonants = [ch for ch in ascii_host if ch.isalpha() and ch not in vowels]
    host_digits = [ch for ch in ascii_host if ch.isdigit()]
    host_alpha_count = sum(1 for ch in ascii_host if ch.isalpha())
    host_length = len(ascii_host)
    alpha_runs = re.findall(r'[a-zA-Z]+', ascii_host)
    digit_runs = re.findall(r'\d+', ascii_host)

    # Dot-separated token statistics.
    token_lengths = [len(token) for token in tokens]
    token_count = len(tokens)
    token_length_mean = sum(token_lengths) / (token_count or 1)
    if tokens:
        exact_mean = sum(token_lengths) / len(token_lengths)
        token_variance = sum((n - exact_mean) ** 2 for n in token_lengths) / len(token_lengths)
        token_std = math.sqrt(token_variance)
    else:
        token_variance = 0
        token_std = 0

    # Per-subdomain-label measurements, reduced to scalars below.
    digits_per_label = [sum(1 for ch in label if ch.isdigit()) for label in labels]
    alpha_per_label = [sum(1 for ch in label if ch.isalpha()) for label in labels]
    hyphens_per_label = [label.count('-') for label in labels]
    ratio_per_label = [
        sum(1 for ch in label if ch in vowels)
        / (sum(1 for ch in label if ch.isalpha() and ch not in vowels) or 1)
        for label in labels
    ]
    entropy_per_label = [shannon_entropy(label) for label in labels]

    digits_sum, digits_mean, digits_max, digits_min = _stats(digits_per_label)
    alpha_sum, alpha_mean, alpha_max, alpha_min = _stats(alpha_per_label)
    hyphens_sum, hyphens_mean, hyphens_max, hyphens_min = _stats(hyphens_per_label)
    ratio_sum, ratio_mean, ratio_max, ratio_min = _stats(ratio_per_label)
    entropy_sum, entropy_mean, entropy_max, entropy_min = _stats(entropy_per_label)

    features = {
        # Domain related features
        "url": url,
        "domain": ascii_host,
        "length_of_domain": len(registered),
        "number_of_subdomains": len(labels),
        "number_of_dots": ascii_host.count('.'),
        "tld": parts.suffix,

        # Character composition features
        "number_of_digits": len(host_digits),
        "number_of_special_characters": ascii_host.count('-'),
        "number_of_alphabetic_sequences": len(alpha_runs),
        "number_of_vowels": len(host_vowels),
        "number_of_consonants": len(host_consonants),

        # Density and ratio features
        "vowel_to_consonant_ratio": len(host_vowels) / (len(host_consonants) or 1),
        "vowel_density": len(host_vowels) / host_length,
        "consonant_density": len(host_consonants) / host_length,
        "digit_density": len(host_digits) / host_length,

        # Shannon entropy features
        "hostname_entropy": shannon_entropy(ascii_host),
        "shannon_entropy_of_vowels": shannon_entropy(host_vowels),
        "shannon_entropy_of_consonants": shannon_entropy(host_consonants),
        "shannon_entropy_of_digits": shannon_entropy(host_digits),

        # Token (dot-separated segment) features
        "total_number_of_tokens": token_count,
        "maximum_token_length": max(token_lengths, default=0),
        "minimum_token_length": min(token_lengths, default=0),
        "average_token_length": token_length_mean,
        "token_length_variance": token_variance,
        "mean_token_length": token_length_mean,
        "standard_deviation_of_token_length": token_std,

        # Additional lexical features
        "ratio_of_meaningful_syllables": len(host_vowels) / (host_alpha_count or 1),
        "lexical_density": sum(1 for token in tokens if token.isalpha()) / (token_count or 1),

        # Longest sequences
        "longest_alphabetic_sequence": max((len(run) for run in alpha_runs), default=0),
        "longest_digit_sequence": max((len(run) for run in digit_runs), default=0),

        # Per-subdomain measurements reduced to scalars
        "sum_digits_in_subdomains": digits_sum,
        "mean_digits_in_subdomains": digits_mean,
        "max_digits_in_subdomains": digits_max,
        "min_digits_in_subdomains": digits_min,

        "sum_alphabetic_in_subdomains": alpha_sum,
        "mean_alphabetic_in_subdomains": alpha_mean,
        "max_alphabetic_in_subdomains": alpha_max,
        "min_alphabetic_in_subdomains": alpha_min,

        "sum_hyphens_in_subdomains": hyphens_sum,
        "mean_hyphens_in_subdomains": hyphens_mean,
        "max_hyphens_in_subdomains": hyphens_max,
        "min_hyphens_in_subdomains": hyphens_min,

        "sum_vowel_to_consonant_ratio_in_subdomains": ratio_sum,
        "mean_vowel_to_consonant_ratio_in_subdomains": ratio_mean,
        "max_vowel_to_consonant_ratio_in_subdomains": ratio_max,
        "min_vowel_to_consonant_ratio_in_subdomains": ratio_min,

        "sum_entropy_of_subdomains": entropy_sum,
        "mean_entropy_of_subdomains": entropy_mean,
        "max_entropy_of_subdomains": entropy_max,
        "min_entropy_of_subdomains": entropy_min,

        # Additional pattern checks (possibly not used)
        # "uses_punycode": uses_punycode
    }

    return features

# Read the CSV file
csv_file = './drive/MyDrive/Colab Notebooks/iteration-v2/cleaned_dataset_url_https_tls.csv'
df = pd.read_csv(csv_file)
pd.set_option('future.no_silent_downcasting', True)

# 'tld' and 'tld_frequency' are regenerated below; drop any copies that came
# with the file so the merge cannot produce suffixed duplicates of them.
df = df.drop(columns=['tld', 'tld_frequency'], errors='ignore')

# Frequency-encode the TLD of every URL: each URL gets the number of rows that
# share its TLD. Kept as separate Series so df itself is not mutated.
url_tlds = df['url'].apply(lambda url: tldextract.extract(url).suffix)
tld_frequency = url_tlds.value_counts().to_dict()
url_tld_frequency = url_tlds.map(tld_frequency)

# Extract features for each URL; URLs without a hostname are skipped and end up
# with NaN features after the left merge.
features_list = []
for url, frequency in zip(df['url'], url_tld_frequency):
    hostname = urllib.parse.urlparse(url).hostname
    if not hostname:
        continue
    features = extract_hostname_features(url, hostname)
    features['tld_frequency'] = frequency
    features_list.append(features)

# Create a dataframe from the features
features_df = pd.DataFrame(features_list)

# The features depend only on the URL, so keep one feature row per URL.
# Without this, a URL occurring k times in df would match k feature rows for
# each of its k rows and the merge would return k*k rows for it.
url_features = features_df.drop(columns=['domain']).drop_duplicates(subset='url')
merged_df = df.merge(url_features, on='url', how='left')

# Column order: identifiers, then lexical features, then the remaining
# columns of the input file.
leading_columns = ['url', 'domain', 'tld', 'label']
feature_columns = [col for col in features_df.columns if col not in ['url', 'domain', 'tld']]
remaining_columns = [col for col in df.columns if col not in leading_columns]
desired_order = leading_columns + feature_columns + remaining_columns

# Reorder the dataframe
final_df = merged_df[desired_order]

# Save the final features DataFrame to a new CSV file
final_df.to_csv('./drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https_tls_lexical_features.csv', index=False)

print(f"Total dataset: {final_df.shape}")

Total dataset: (64322, 85)


In [None]:
# Extract PDNS
import pandas as pd
import os
import json

# Load the CSV file and extract the 'domain' column
csv_file_path = './drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https_tls_lexical_features.csv'
df = pd.read_csv(csv_file_path)
domain_set = set(df['domain'].tolist())

# Check the number of data in the 'domain' set
num_domains = len(domain_set)
print(f"Number of domains: {num_domains}")

# Initialize the list to hold the final data
final_data = []

# Folder containing JSON files
json_folder_path = 'pdns'

# Set to track domains found in JSON files
json_domain_set = set()

# Process each JSON file in the folder
json_suffix = '.json'
for json_file in os.listdir(json_folder_path):
    if not json_file.endswith(json_suffix):
        continue

    # Strip only the trailing extension. str.replace('.json', '') removes
    # every occurrence, so "api.json.example.com.json" became
    # "api.example.com" and no longer matched its row in the dataset.
    domain = json_file[:-len(json_suffix)]
    json_domain_set.add(domain)

    with open(os.path.join(json_folder_path, json_file), 'r') as file:
        json_content = json.load(file)

    # One record per file: the domain plus whatever keys the JSON provides
    data_entry = {'domain': domain}
    data_entry.update(json_content)
    final_data.append(data_entry)

# Add domains not found in JSON files with all features set to 0
default_features = {
    "pdns_record_age": 0,
    "pdns_number_of_ip": 0,
    "pdns_change_frequency": 0,
    "pdns_unique_rrtype": 0,
    "pdns_unique_rrdata": 0
}

for domain in domain_set - json_domain_set:
    data_entry = {'domain': domain}
    data_entry.update(default_features)
    final_data.append(data_entry)

# Create a DataFrame from the final data and save to a new CSV file
final_df = pd.DataFrame(final_data)
output_csv_path = './drive/MyDrive/Colab Notebooks/iteration-v2/dataset_pdns_features.csv'
final_df.to_csv(output_csv_path, index=False)

print(f'Total rows: {final_df.shape[0]}')
print(f"Final data saved to {output_csv_path}")


In [None]:
# Merge PDNS Features
import pandas as pd

# Inputs: the lexical/TLS feature table and the per-domain PDNS feature table
original_csv_path = './drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https_tls_lexical_features.csv'
pdns_csv_path = './drive/MyDrive/Colab Notebooks/iteration-v2/dataset_pdns_features.csv'
original_df = pd.read_csv(original_csv_path)
pdns_df = pd.read_csv(pdns_csv_path)

# Left join keeps every row of the original table, matched on 'domain'
combined_df = original_df.merge(pdns_df, how='left', on='domain')

# Column order: everything from the original table, then the PDNS columns
# (the join key is not repeated)
original_columns = original_df.columns.tolist()
pdns_columns = [name for name in pdns_df.columns.tolist() if name != 'domain']
combined_columns = [*original_columns, *pdns_columns]
combined_df = combined_df.loc[:, combined_columns]

# Write the combined table
output_csv_path = './drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https_tls_lexical_pdns_features.csv'
combined_df.to_csv(output_csv_path, index=False)

print(f"Combined data saved to {output_csv_path}")

# Warn (after saving) when any fully identical rows exist
duplicates = combined_df.duplicated()
if duplicates.sum() > 0:
    print(f"Warning: There are {duplicates.sum()} duplicate rows in the combined dataset.")

Combined data saved to ./drive/MyDrive/Colab Notebooks/iteration-v2/dataset_url_https_tls_lexical_pdns_features.csv


In [None]:
import pandas as pd
# Show the column names of the feature table as the cell's output.
# NOTE(review): this reads a 'v2_'-prefixed file that no cell in view writes;
# confirm where it is produced.
csv_file = './drive/MyDrive/Colab Notebooks/iteration-v2/v2_dataset_url_https_tls_lexical_pdns_features.csv'
df = pd.read_csv(csv_file)
# df.shape[0]
list(df.columns)

['url',
 'domain',
 'tld',
 'label',
 'length_of_domain',
 'number_of_subdomains',
 'number_of_dots',
 'number_of_digits',
 'number_of_special_characters',
 'number_of_alphabetic_sequences',
 'number_of_vowels',
 'number_of_consonants',
 'vowel_to_consonant_ratio',
 'vowel_density',
 'consonant_density',
 'digit_density',
 'hostname_entropy',
 'shannon_entropy_of_vowels',
 'shannon_entropy_of_consonants',
 'shannon_entropy_of_digits',
 'total_number_of_tokens',
 'maximum_token_length',
 'minimum_token_length',
 'average_token_length',
 'token_length_variance',
 'mean_token_length',
 'standard_deviation_of_token_length',
 'ratio_of_meaningful_syllables',
 'lexical_density',
 'longest_alphabetic_sequence',
 'longest_digit_sequence',
 'sum_digits_in_subdomains',
 'mean_digits_in_subdomains',
 'max_digits_in_subdomains',
 'min_digits_in_subdomains',
 'sum_alphabetic_in_subdomains',
 'mean_alphabetic_in_subdomains',
 'max_alphabetic_in_subdomains',
 'min_alphabetic_in_subdomains',
 'sum_hyp