In [1]:
from adblockparser import AdblockRules
import tldextract
import re
import collections
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.sankey import Sankey
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout


partner_dictionary = {}

def adblockparser_parse(rule_list):
    return AdblockRules(rule_list)

def return_domain(rule_line):
    rule = []
    rule.append(rule_line)
    easylist_adblock_rules = adblockparser_parse(rule)
    domain = ''

    # print("rule list ----->", rule)
    # print("Value of r ---->", easylist_adblock_rules.rules)
    for r in easylist_adblock_rules.rules:
        y = r.rule_text
        x = y
        if x[:2] == '||':
            if '^' in x:
                split = x.index("^")
                x = x[2:split]
            else:
                x = x[2:]
        elif x[:1] =='|':
            x = x[1:]

        # print("from return domain -->",x)
        domain = tldextract.extract(x).domain

    return domain

def check_if_valid_domain(hostname):
    if len(hostname) > 255:
        return False
    hostname = hostname.rstrip(".")
    allowed = re.compile('^[a-z0-9]([a-z0-9\-\_]{0,61}[a-z0-9])?$',
                         re.IGNORECASE)
    labels = hostname.split(".")

    # the TLD must not be all-numeric
    if re.match(r"^[0-9]+$", labels[-1]):
        return False

    return all(allowed.match(x) for x in labels)

def extract_partner_domain():

    global partner_dictionary
    partner_count = 0

    with open('processed-dec-2019-exceptionrules-latest.txt') as fexception:
        partner = ''
        domain=''
        for line in fexception:
            # print(line)
            line = line.rstrip()

            if line[0] == '[':
                partner = line[1:-1]
                if partner not in partner_dictionary.keys():
                    partner_dictionary.update({partner:{'collaborators':[],'surrogate':[]}})
                    partner_count +=1

            if 'sitekey' in line:
                split = line.index("sitekey")
                line = line[:split]
                # print("Line cut 1--->",line)
                if line[-1] == '$' or line[-1] ==',':
                    line = line[:-1]
                # print("Line cut 2---->",line)

            if '#@#' or '###' in line:
                split=-1
                if '#@#' in line:
                    split = line.index("#@#")
                if '###' in line:
                    split = line.index("###")

                if split !=0:
                    line = line[:split]
                    # print("Line cut 3--->",line)

            if line[0] != '[':
                domain = return_domain(line)
                if check_if_valid_domain(domain) == False:
                    continue
                else:
                    partner_dictionary[partner]['collaborators'].append(domain)


            if line.find("$") >= 0 and partner != '':
                if (check_if_valid_domain(return_domain(line))) != False:
                    option_separator_index = line.find("$")
                    options_list = line[option_separator_index + 1:]
                    while options_list != "":
                        option_full = ""
                        split = options_list.index(",") if "," in options_list else -1
                        if split == -1:
                            option_full = options_list
                            options_list = ""
                        else:
                            option_full = options_list[:split]

                        list_exists = option_full.index("=") if "=" in option_full else -1

                        # case when lists are present - domain, sitekey cases
                        if list_exists != -1:
                            option_name = option_full[:list_exists]
                            # print("options_name -->"+option_name)
                            if option_name == 'domain':
                                option_full = option_full[list_exists + 1:]
                                while option_full != "":
                                    next_index = option_full.index("|") if "|" in option_full else -1

                                    # case when there are no more lists present
                                    if next_index == -1:
                                        # print(option_full)
                                        if option_full[0] == '~':
                                                option_full=option_full[1:]
                                                partner_dictionary[partner]['surrogate'].append(return_domain(option_full))
                                        else:
                                            partner_dictionary[partner]['surrogate'].append(return_domain(option_full))

                                        option_full = ""

                                    # case when there are options present in list
                                    else:

                                        curr_option = option_full[:next_index]
                                        if curr_option[0] == '~':
                                            curr_option = curr_option[1:]
                                            partner_dictionary[partner]['surrogate'].append(return_domain(curr_option))
                                        else:
                                            partner_dictionary[partner]['surrogate'].append(return_domain(curr_option))
                                        option_full = option_full[next_index + 1:]

                        options_list = options_list[split + 1:]
                        
extract_partner_domain()

    # print(partner_dictionary)

    # with open('partner-collaborator.txt','w') as f:
    #     for partner,partner_details in partner_dictionary.items():
    #         f.write('partner------>'+partner)
    #         f.write('\n')
    #         for partner_type ,partner_list in partner_details.items():
    #             f.write("partner type ----->"+partner_type)
    #             for item in partner_list:
    #                 f.write('\n')
    #                 f.write(item)
    #             f.write('\n')

In [2]:
# the following doesnt give the graph but runs successfully, debug later
layout = Layout(width="300", height="200")
def sankey(margin_top=10, **value):
    """Show SankeyWidget with default values for size and margins"""
    return SankeyWidget(layout=layout,
                        margins=dict(top=margin_top, bottom=0, left=30, right=60),
                        **value)

In [3]:
# get the data to plot it in javascript and display in the html file attached as ER-sankey-generation
def graph_get_data():
    global partner_dictionary 
    
    l=[]
    for key in partner_dictionary.keys():
        l.extend(partner_dictionary[key]['collaborators'])

    w = collections.Counter(l)

    w_dup ={}

    listofTuples = sorted(w.items(), key=lambda x: x[1], reverse=True)

    for elem in listofTuples:
        if elem[1] >10:
            w_dup.update({elem[0]:elem[1]})
            
    top_collab=[]
    i=0
    for key in w_dup.keys():
        if i == 5:
            break
        top_collab.append(key)
        i+=1
        
#     print(top_collab)

    sankey_data ={}
    for item in top_collab:
        sankey_data.update({item:[]})
        
#     print(sankey_data)

    for key in partner_dictionary.keys():
        for collab in top_collab:
            cnt = partner_dictionary[key]['collaborators'].count(collab)
            if cnt >0:
                sankey_data[collab].append({key:cnt})
    
#     print(sankey_data)

    sankey_links = []
    
    for key in sankey_data.keys():
        for item in sankey_data[key]:
            for partner in item.keys():
                link={}
                link = {'source':key, 'target':partner, 'value':item[partner]}
                sankey_links.append(link)
                
#     print(sankey_links)
    for elem in sankey_links:
        print('[\'',elem['source'],'\',\'',elem['target'],'\',',elem['value'],'],')


    
    
    layout = Layout(width="300", height="200")
    SankeyWidget(links=sankey_links, layout=layout).auto_save_png('test.png')
    
graph_get_data()
    

[' google ',' netzwelt.de ', 2 ],
[' google ',' Sedo ', 1 ],
[' google ',' 1und1 Mail und Media GmbH ', 6 ],
[' google ',' Renego ', 3 ],
[' google ',' izito.com ', 5 ],
[' google ',' Vysimo ', 8 ],
[' google ',' Leaf Group Ltd. ', 5 ],
[' google ',' chip.de ', 4 ],
[' google ',' Opendi AG ', 4 ],
[' google ',' Cylex ', 4 ],
[' google ',' winfuture.de ', 3 ],
[' google ',' ParkingCrew ', 1 ],
[' google ',' Digimedia ', 1 ],
[' google ',' media.net ', 1 ],
[' google ',' System1 Search ', 4 ],
[' google ',' System1 parking ', 1 ],
[' google ',' Ennovative ', 1 ],
[' google ',' Bodis ', 1 ],
[' google ',' Netsphere ', 2 ],
[' google ',' Google ', 1711 ],
[' google ',' Tradedoubler ', 2 ],
[' doubleclick ',' netzwelt.de ', 3 ],
[' doubleclick ',' BuySellAds ', 2 ],
[' doubleclick ',' heise.de ', 2 ],
[' doubleclick ',' StandS4 ', 1 ],
[' doubleclick ',' SimilarGroup ', 6 ],
[' doubleclick ',' Chitika ', 1 ],
[' doubleclick ',' Leaf Group Ltd. ', 30 ],
[' doubleclick ',' Recycled Mistakes '

[' doubleclick ',' grandoldteam.com ', 4 ],
[' doubleclick ',' keengamer.com ', 4 ],
[' doubleclick ',' yachts-boat.com ', 2 ],
[' doubleclick ',' ros-bot.com ', 3 ],
[' doubleclick ',' blog.guillaume-gomez.fr ', 1 ],
[' doubleclick ',' expatriate.pl ', 2 ],
[' doubleclick ',' strojar.com ', 2 ],
[' doubleclick ',' pechenegfx.org ', 2 ],
[' doubleclick ',' afterhoursprogramming.com ', 1 ],
[' doubleclick ',' linuxquestions.org ', 1 ],
[' doubleclick ',' stop-nepey.ru ', 3 ],
[' doubleclick ',' looksharp.com ', 1 ],
[' doubleclick ',' stephanearguin.com ', 1 ],
[' doubleclick ',' btrans.by ', 1 ],
[' doubleclick ',' cuve ', 2 ],
[' doubleclick ',' Wykop Sp. z o.o. ', 3 ],
[' doubleclick ',' dianomi ltd ', 4 ],
[' doubleclick ',' tampermonkey.net ', 1 ],
[' doubleclick ',' tech-alg.com ', 5 ],
[' doubleclick ',' golocal.de ', 1 ],
[' doubleclick ',' krasbaks.ru ', 1 ],
[' doubleclick ',' speechtexter.com ', 1 ],
[' doubleclick ',' kangsigit.com ', 4 ],
[' doubleclick ',' abaixoassinado.o