In [1]:
! python --version

Python 3.10.8


In [2]:
import tensorflow as tf
print(tf.__version__)


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


1.13.1


In [None]:
# This is a Jupyter Notebook version of the given TensorFlow 1.13 training script

import re
import time
import datetime
import os
import pickle
import numpy as np
from tqdm import tqdm
from bisect import bisect_left
import tensorflow as tf
from tensorflow.contrib import learn
from tflearn.data_utils import to_categorical, pad_sequences
from TextCNN import *
from utils import *

# Configuration dictionary to replace argparse parameters
config = {
    "data": {
        "max_len_words": 200,
        "max_len_chars": 200,
        "max_len_subwords": 20,
        "min_word_freq": 1,
        "dev_pct": 0.1,
        "data_dir": 'train_10000.txt',
        "delimit_mode": 1
    },
    "model": {
        "emb_dim": 32,
        "filter_sizes": "3,4,5,6",
        "emb_mode": 5
    },
    "train": {
        "nb_epochs": 5,
        "batch_size": 128,
        "l2_reg_lambda": 0.0,
        "lr": 0.001
    },
    "log": {
        "output_dir": "runs/10000/",
        "print_every": 50,
        "eval_every": 500,
        "checkpoint_every": 500
    }
}

# Print configuration settings
for section, params in config.items():
    for key, value in params.items():
        print(f"{section}.{key}={value}")

# Load data
with h5py.File('../data/phishing_output.h5', 'r') as h5_file:
    # Pull 'urls' and 'labels' from the 'train' dataset
    train_urls = [url.decode('utf-8') for url in h5_file['train/urls'][:]]
    train_labels = h5_file['train/labels'][:]

    # Pull 'urls' and 'labels' from the 'dev' dataset
    dev_urls = [url.decode('utf-8') for url in h5_file['dev/urls'][:]]
    dev_labels = h5_file['dev/labels'][:]

    # Concatenate the 'urls' and 'labels'
    urls = train_urls + dev_urls
    labels = np.concatenate((train_labels, dev_labels))

high_freq_words = None
if config["data"]["min_word_freq"] > 0:
    x1, word_reverse_dict = get_word_vocab(urls, config["data"]["max_len_words"], config["data"]["min_word_freq"])
    high_freq_words = sorted(list(word_reverse_dict.values()))
    print("Number of words with freq >= {}: {}".format(config["data"]["min_word_freq"], len(high_freq_words)))

x, word_reverse_dict = get_word_vocab(urls, config["data"]["max_len_words"])
word_x = get_words(x, word_reverse_dict, config["data"]["delimit_mode"], urls)
ngramed_id_x, ngrams_dict, worded_id_x, words_dict = ngram_id_x(word_x, config["data"]["max_len_subwords"], high_freq_words)

chars_dict = ngrams_dict
chared_id_x = char_id_x(urls, chars_dict, config["data"]["max_len_chars"])

# Split data into positive and negative samples
pos_x = []
neg_x = []
for i in range(len(labels)):
    label = labels[i]
    if label == 1:
        pos_x.append(i)
    else:
        neg_x.append(i)
print("Overall Mal/Ben split: {}/{}".format(len(pos_x), len(neg_x)))
pos_x = np.array(pos_x)
neg_x = np.array(neg_x)

x_train, y_train, x_test, y_test = prep_train_test(pos_x, neg_x, config["data"]["dev_pct"])

x_train_char = get_ngramed_id_x(x_train, ngramed_id_x)
x_test_char = get_ngramed_id_x(x_test, ngramed_id_x)

x_train_word = get_ngramed_id_x(x_train, worded_id_x)
x_test_word = get_ngramed_id_x(x_test, worded_id_x)

x_train_char_seq = get_ngramed_id_x(x_train, chared_id_x)
x_test_char_seq = get_ngramed_id_x(x_test, chared_id_x)

# Training and Evaluation Functions
def train_dev_step(x, y, emb_mode, is_train=True):
    if is_train: 
        p = 0.5
    else: 
        p = 1.0
    if emb_mode == 1: 
        feed_dict = {
            cnn.input_x_char_seq: x[0],
            cnn.input_y: y,
            cnn.dropout_keep_prob: p}  
    elif emb_mode == 2: 
        feed_dict = {
            cnn.input_x_word: x[0],
            cnn.input_y: y,
            cnn.dropout_keep_prob: p}
    elif emb_mode == 3: 
        feed_dict = {
            cnn.input_x_char_seq: x[0],
            cnn.input_x_word: x[1],
            cnn.input_y: y,
            cnn.dropout_keep_prob: p}
    elif emb_mode == 4: 
        feed_dict = {
            cnn.input_x_word: x[0],
            cnn.input_x_char: x[1],
            cnn.input_x_char_pad_idx: x[2],
            cnn.input_y: y,
            cnn.dropout_keep_prob: p}
    elif emb_mode == 5:  
        feed_dict = {
            cnn.input_x_char_seq: x[0],
            cnn.input_x_word: x[1],
            cnn.input_x_char: x[2],
            cnn.input_x_char_pad_idx: x[3],
            cnn.input_y: y,
            cnn.dropout_keep_prob: p}
    if is_train:
        _, step, loss, acc = sess.run([train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)
    else: 
        step, loss, acc = sess.run([global_step, cnn.loss, cnn.accuracy], feed_dict)
    return step, loss, acc

def make_batches(x_train_char_seq, x_train_word, x_train_char, y_train, batch_size, nb_epochs, shuffle=False):
    if config["model"]["emb_mode"] == 1:  
        batch_data = list(zip(x_train_char_seq, y_train))
    elif config["model"]["emb_mode"] == 2:  
        batch_data = list(zip(x_train_word, y_train))
    elif config["model"]["emb_mode"] == 3:  
        batch_data = list(zip(x_train_char_seq, x_train_word, y_train))
    elif config["model"]["emb_mode"] == 4:
         batch_data = list(zip(x_train_char, x_train_word, y_train))
    elif config["model"]["emb_mode"] == 5:  
        batch_data = list(zip(x_train_char, x_train_word, x_train_char_seq, y_train))
    batches = batch_iter(batch_data, batch_size, nb_epochs, shuffle)

    if nb_epochs > 1: 
        nb_batches_per_epoch = int(len(batch_data)/batch_size)
        if len(batch_data)%batch_size != 0:
            nb_batches_per_epoch += 1
        nb_batches = int(nb_batches_per_epoch * nb_epochs)
        return batches, nb_batches_per_epoch, nb_batches
    else:
        return batches 

def prep_batches(batch):
    if config["model"]["emb_mode"] == 1:
        x_char_seq, y_batch = zip(*batch)
    elif config["model"]["emb_mode"] == 2:
        x_word, y_batch = zip(*batch)
    elif config["model"]["emb_mode"] == 3:
        x_char_seq, x_word, y_batch = zip(*batch)
    elif config["model"]["emb_mode"] == 4:
        x_char, x_word, y_batch = zip(*batch)
    elif config["model"]["emb_mode"] == 5:
        x_char, x_word, x_char_seq, y_batch = zip(*batch)

    x_batch = []
    if config["model"]["emb_mode"] in [1, 3, 5]:
        x_char_seq = pad_seq_in_word(x_char_seq, config["data"]["max_len_chars"])
        x_batch.append(x_char_seq)
    if config["model"]["emb_mode"] in [2, 3, 4, 5]:
        x_word = pad_seq_in_word(x_word, config["data"]["max_len_words"])
        x_batch.append(x_word)
    if config["model"]["emb_mode"] in [4, 5]:
        x_char, x_char_pad_idx = pad_seq(x_char, config["data"]["max_len_words"], config["data"]["max_len_subwords"], config["model"]["emb_dim"])
        x_batch.extend([x_char, x_char_pad_idx])
    return x_batch, y_batch

# TensorFlow Session Setup
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)

    with sess.as_default():
        cnn = TextCNN(
            char_ngram_vocab_size=len(ngrams_dict) + 1,
            word_ngram_vocab_size=len(words_dict) + 1,
            char_vocab_size=len(chars_dict) + 1,
            embedding_size=config["model"]["emb_dim"],
            word_seq_len=config["data"]["max_len_words"],
            char_seq_len=config["data"]["max_len_chars"],
            l2_reg_lambda=config["train"]["l2_reg_lambda"],
            mode=config["model"]["emb_mode"],
            filter_sizes=list(map(int, config["model"]["filter_sizes"].split(",")))
        )

        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(config["train"]["lr"])
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        print("Writing to {}\n".format(config["log"]["output_dir"]))
        if not os.path.exists(config["log"]["output_dir"]):
            os.makedirs(config["log"]["output_dir"])

        # Save dictionary files
        ngrams_dict_dir = config["log"]["output_dir"] + "subwords_dict.p"
        pickle.dump(ngrams_dict, open(ngrams_dict_dir, "wb"))
        words_dict_dir = config["log"]["output_dir"] + "words_dict.p"
        pickle.dump(words_dict, open(words_dict_dir, "wb"))
        chars_dict_dir = config["log"]["output_dir"] + "chars_dict.p"
        pickle.dump(chars_dict, open(chars_dict_dir, "wb"))

        # Save training and validation logs
        train_log_dir = config["log"]["output_dir"] + "train_logs.csv"
        with open(train_log_dir, "w") as f:
            f.write("step,time,loss,acc\n")
        val_log_dir = config["log"]["output_dir"] + "val_logs.csv"
        with open(val_log_dir, "w") as f:
            f.write("step,time,loss,acc\n")

        # Save model checkpoints
        checkpoint_dir = config["log"]["output_dir"] + "checkpoints/"
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_prefix = checkpoint_dir + "model"
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        sess.run(tf.global_variables_initializer())

        # Prepare batches
        train_batches, nb_batches_per_epoch, nb_batches = make_batches(x_train_char_seq, x_train_word, x_train_char, y_train,
                                                                      config["train"]["batch_size"],
                                                                      config['train']['nb_epochs'], True)

        min_dev_loss = float('Inf')
        dev_loss = float('Inf')
        dev_acc = 0.0
        print("Number of batches in total: {}".format(nb_batches))
        print("Number of batches per epoch: {}".format(nb_batches_per_epoch))

        # Training loop
        it = tqdm(range(nb_batches), desc="emb_mode {} delimit_mode {} train_size {}".format(
            config["model"]["emb_mode"], config["data"]["delimit_mode"], x_train.shape[0]), ncols=0)
        for idx in it:
            batch = next(train_batches)
            x_batch, y_batch = prep_batches(batch)
            step, loss, acc = train_dev_step(x_batch, y_batch, emb_mode=config["model"]["emb_mode"], is_train=True)
            if step % config["log"]["print_every"] == 0:
                with open(train_log_dir, "a") as f:
                    f.write("{:d},{:s},{:e},{:e}\n".format(step, datetime.datetime.now().isoformat(), loss, acc))
                it.set_postfix(
                    trn_loss='{:.3e}'.format(loss),
                    trn_acc='{:.3e}'.format(acc),
                    dev_loss='{:.3e}'.format(dev_loss),
                    dev_acc='{:.3e}'.format(dev_acc),
                    min_dev_loss='{:.3e}'.format(min_dev_loss))
            if step % config["log"]["eval_every"] == 0 or idx == (nb_batches - 1):
                total_loss = 0
                nb_corrects = 0
                nb_instances = 0
                test_batches = make_batches(x_test_char_seq, x_test_word, x_test_char, y_test,
                                            config['train']['batch_size'], 1, False)
                for test_batch in test_batches:
                    x_test_batch, y_test_batch = prep_batches(test_batch)
                    step, batch_dev_loss, batch_dev_acc = train_dev_step(x_test_batch, y_test_batch,
                                                                         emb_mode=config["model"]["emb_mode"],
                                                                         is_train=False)
                    nb_instances += x_test_batch[0].shape[0]
                    total_loss += batch_dev_loss * x_test_batch[0].shape[0]
                    nb_corrects += batch_dev_acc * x_test_batch[0].shape[0]

                dev_loss = total_loss / nb_instances
                dev_acc = nb_corrects / nb_instances
                with open(val_log_dir, "a") as f:
                    f.write("{:d},{:s},{:e},{:e}\n".format(step, datetime.datetime.now().isoformat(), dev_loss, dev_acc))
                if step % config["log"]["checkpoint_every"] == 0 or idx == (nb_batches - 1):
                    if dev_loss < min_dev_loss:
                        path = saver.save(sess, checkpoint_prefix, global_step=step)
                        min_dev_loss = dev_loss


Finished build vocabulary and mapping to x in 0.9714772701263428
Size of word vocabulary: 5792
Number of words with freq >= 1: 5792
Finished build vocabulary and mapping to x in 0.810955286026001
Size of word vocabulary: 73498
Processing #url 0
Size of ngram vocabulary: 82
Size of word vocabulary: 5588
Index of <UNKNOWN> word: 1182
Overall Mal/Ben split: 23963/39798
Train Mal/Ben split: 21567/35819
Test Mal/Ben split: 2396/3979
Train/Test split: 57386/6375
Train/Test split: 57386/6375
Writing to runs/10000/

Number of batches in total: 2245
Number of batches per epoch: 449


emb_mode 5 delimit_mode 1 train_size 57386: 100% 2245/2245 [27:30<00:00,  1.36it/s, dev_acc=9.515e-01, dev_loss=1.410e-01, min_dev_loss=1.410e-01, trn_acc=9.844e-01, trn_loss=7.134e-02]  


In [32]:
# Import necessary libraries
import pickle
import time
import numpy as np
from tqdm import tqdm
import tensorflow.compat.v1 as tf
from utils import *
import h5py

# Disable TensorFlow 2 behaviors
tf.disable_v2_behavior()

# Configuration
config = {
    'data': {
        'max_len_words': 200,
        'max_len_chars': 200,
        'max_len_subwords': 20,
        'data_dir': 'path/to/data',
        'delimit_mode': 1,
        'subword_dict_dir': 'runs/10000/subwords_dict.p',  # Directory of subword dictionary
        'word_dict_dir': 'runs/10000/words_dict.p',  # Directory of word dictionary
        'char_dict_dir': 'runs/10000/chars_dict.p',  # Directory of character dictionary
    },
    'model': {
        'emb_dim': 32,
        'emb_mode': 5
    },
    'test': {
        'batch_size': 128
    },
    'log': {
        'output_dir': '../data/',
        'checkpoint_dir': 'runs/10000/checkpoints/model-2245'
    }
}

# Load data
with h5py.File('../data/phishing_output.h5', 'r') as h5_file:
    # Pull 'urls' and 'labels' from the 'producthunt' dataset
    urls = [url.decode('utf-8') for url in h5_file['test/urls'][:]]
    labels = h5_file['test/labels'][:]
    

# Prepare data
x, word_reverse_dict = get_word_vocab(urls, config['data']['max_len_words'])
word_x = get_words(x, word_reverse_dict, config['data']['delimit_mode'], urls)

ngram_dict = pickle.load(open(config['data']['subword_dict_dir'], "rb"))
print("Size of subword vocabulary (train): {}".format(len(ngram_dict)))
word_dict = pickle.load(open(config['data']['word_dict_dir'], "rb"))
print("Size of word vocabulary (train): {}".format(len(word_dict)))
ngramed_id_x, worded_id_x = ngram_id_x_from_dict(word_x, config['data']['max_len_subwords'], ngram_dict, word_dict)
chars_dict = pickle.load(open(config['data']['char_dict_dir'], "rb"))
chared_id_x = char_id_x(urls, chars_dict, config['data']['max_len_chars'])

print("Number of testing urls: {}".format(len(labels)))

# Evaluation function
def test_step(x, emb_mode):
    p = 1.0
    if emb_mode == 1:
        feed_dict = {
            input_x_char_seq: x[0],
            dropout_keep_prob: p
        }
    elif emb_mode == 2:
        feed_dict = {
            input_x_word: x[0],
            dropout_keep_prob: p
        }
    elif emb_mode == 3:
        feed_dict = {
            input_x_char_seq: x[0],
            input_x_word: x[1],
            dropout_keep_prob: p
        }
    elif emb_mode == 4:
        feed_dict = {
            input_x_word: x[0],
            input_x_char: x[1],
            input_x_char_pad_idx: x[2],
            dropout_keep_prob: p
        }
    elif emb_mode == 5:
        feed_dict = {
            input_x_char_seq: x[0],
            input_x_word: x[1],
            input_x_char: x[2],
            input_x_char_pad_idx: x[3],
            dropout_keep_prob: p
        }
    preds, s = sess.run([predictions, scores], feed_dict)
    return preds, s

# Load model and run evaluation
checkpoint_file = config['log']['checkpoint_dir']
total_time = 0
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    session_conf.gpu_options.allow_growth = True
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        if config['model']['emb_mode'] in [1, 3, 5]:
            input_x_char_seq = graph.get_operation_by_name("input_x_char_seq").outputs[0]
        if config['model']['emb_mode'] in [2, 3, 4, 5]:
            input_x_word = graph.get_operation_by_name("input_x_word").outputs[0]
        if config['model']['emb_mode'] in [4, 5]:
            input_x_char = graph.get_operation_by_name("input_x_char").outputs[0]
            input_x_char_pad_idx = graph.get_operation_by_name("input_x_char_pad_idx").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]

        predictions = graph.get_operation_by_name("output/predictions").outputs[0]
        scores = graph.get_operation_by_name("output/scores").outputs[0]

        # Create batches
        if config['model']['emb_mode'] == 1:
            batches = batch_iter(list(chared_id_x), config['test']['batch_size'], 1, shuffle=False)
        elif config['model']['emb_mode'] == 2:
            batches = batch_iter(list(worded_id_x), config['test']['batch_size'], 1, shuffle=False)
        elif config['model']['emb_mode'] == 3:
            batches = batch_iter(list(zip(chared_id_x, worded_id_x)), config['test']['batch_size'], 1, shuffle=False)
        elif config['model']['emb_mode'] == 4:
            batches = batch_iter(list(zip(ngramed_id_x, worded_id_x)), config['test']['batch_size'], 1, shuffle=False)
        elif config['model']['emb_mode'] == 5:
            batches = batch_iter(list(zip(ngramed_id_x, worded_id_x, chared_id_x)), config['test']['batch_size'], 1, shuffle=False)

        all_predictions = []
        all_scores = []

        nb_batches = int(len(labels) / config['test']['batch_size'])
        if len(labels) % config['test']['batch_size'] != 0:
            nb_batches += 1
        print("Number of batches in total: {}".format(nb_batches))
        it = tqdm(range(nb_batches), desc="emb_mode {} delimit_mode {} test_size {}".format(config['model']['emb_mode'], config['data']['delimit_mode'], len(labels)), ncols=0)
        for idx in it:
            batch = next(batches)

            if config['model']['emb_mode'] == 1:
                x_char_seq = batch
            elif config['model']['emb_mode'] == 2:
                x_word = batch
            elif config['model']['emb_mode'] == 3:
                x_char_seq, x_word = zip(*batch)
            elif config['model']['emb_mode'] == 4:
                x_char, x_word = zip(*batch)
            elif config['model']['emb_mode'] == 5:
                x_char, x_word, x_char_seq = zip(*batch)

            x_batch = []
            if config['model']['emb_mode'] in [1, 3, 5]:
                x_char_seq = pad_seq_in_word(x_char_seq, config['data']['max_len_chars'])
                x_batch.append(x_char_seq)
            if config['model']['emb_mode'] in [2, 3, 4, 5]:
                x_word = pad_seq_in_word(x_word, config['data']['max_len_words'])
                x_batch.append(x_word)
            if config['model']['emb_mode'] in [4, 5]:
                x_char, x_char_pad_idx = pad_seq(x_char, config['data']['max_len_words'], config['data']['max_len_subwords'], config['model']['emb_dim'])
                x_batch.extend([x_char, x_char_pad_idx])

            start_time = time.time()
            batch_predictions, batch_scores = test_step(x_batch, config['model']['emb_mode'])
            total_time += time.time() - start_time
            all_predictions = np.concatenate([all_predictions, batch_predictions])
            all_scores.extend(batch_scores)

            it.set_postfix()

# Calculate accuracy
if labels is not None:
    # Assuming all_predictions and labels are NumPy arrays with binary values (0 or 1)
    TP = np.sum((all_predictions == 1) & (labels == 1))  # True Positives
    FP = np.sum((all_predictions == 1) & (labels == 0))  # False Positives
    FN = np.sum((all_predictions == 0) & (labels == 1))  # False Negatives
    TN = np.sum((all_predictions == 0) & (labels == 0))  # True Negatives

    # Accuracy
    correct_preds = TP + TN
    accuracy = correct_preds / float(len(labels))

    # Precision, Recall, and F1 Score
    precision = TP / float(TP + FP) if (TP + FP) > 0 else 0
    recall = TP / float(TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Output all the counts and metrics
    print(f"True Positives (TP): {TP}")
    print(f"False Positives (FP): {FP}")
    print(f"False Negatives (FN): {FN}")
    print(f"True Negatives (TN): {TN}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1_score}")

# Save test results
save_test_result(urls, labels, all_predictions, all_scores, config['log']['output_dir'])


Finished build vocabulary and mapping to x in 0.09513211250305176
Size of word vocabulary: 9601
Size of subword vocabulary (train): 82
Size of word vocabulary (train): 5588
Index of <UNKNOWN> word: 1182
Processing url #0
Number of testing urls: 7137
INFO:tensorflow:Restoring parameters from runs/10000/checkpoints/model-2245
Number of batches in total: 56


emb_mode 5 delimit_mode 1 test_size 7137: 100% 56/56 [00:16<00:00,  3.48it/s]


True Positives (TP): 2508
False Positives (FP): 109
False Negatives (FN): 199
True Negatives (TN): 4321
Accuracy: 0.9568446125823175
Precision: 0.9583492548719909
Recall: 0.9264868858514961
F1 Score: 0.9421487603305785


IsADirectoryError: [Errno 21] Is a directory: '../data/'

In [33]:
false_positives = [urls[i] for i in range(len(labels)) if all_predictions[i] == 1 and labels[i] == 0]
false_negatives = [urls[i] for i in range(len(labels)) if all_predictions[i] == 0 and labels[i] == 1]

print("Number of false positives: {}".format(len(false_positives)))
print("Number of false negatives: {}".format(len(false_negatives)))
print("False Positives: ", false_positives)
print("False Negatives: ", false_negatives)


Number of false positives: 109
Number of false negatives: 199
False Positives:  ['https://romeo-template.webflow.io/', 'https://ogc.irmau.com/site/content/', 'https://sekaimura.base.shop/', 'https://tigerdirect.com/td/td-sunset.html', 'https://www.plaync.com/support/500', 'https://link-in-bio.webflow.io/', 'https://kinobar.vip/', 'http://siap-online.com/', 'http://fantajia35.seesaa.net/', 'https://meumomentodevida.libertyseguros.com.br/', 'https://skwpspace.com/', 'https://mtcg.glitch.me/', 'https://allpropavingasphalt.com/', 'http://ferien.nessmersiel-nordsee.com/', 'http://www.all-nationz.com/', 'https://ascrew.shop/', 'http://mapabc.wikidot.com/', 'https://webflow-stripe-shopify.webflow.io/', 'https://sapporo-kataduke-bz.com/', 'https://benefits.proskauerpodcasts.com/', 'https://cruize-v.webflow.io/', 'https://h-o-g-w-a-r-t-s.com/', 'https://en.smarttools.xyz/', 'http://pinkfloyd.cocolog-nifty.com/', 'http://streamlinewindows.com.au/index.html', 'https://lamp-app.surge.sh/', 'https:

In [14]:
print(labels)

[0 0 0 ... 0 0 0]
