## Backlinking source analyis
#### Load libraries

In [1]:
import pandas as pd
from collections import Counter

#### Load data

In [3]:
# djoser.nl
df_djoser = pd.read_csv(filepath_or_buffer="./data/djoser_utf8.csv", sep=";", encoding='utf-8')

# inholland.nl
df_inholland = pd.read_csv(filepath_or_buffer="./data/inholland_utf8.csv", sep=";", encoding='utf-8')

# vattenfall.nl
df_vattenfall = pd.read_csv(filepath_or_buffer="./data/vattenfall_utf8.csv", sep=";", encoding='utf-8')

#### Top-30 most frequent backlink domains (domain, frequency)

In [23]:
def top_30(df):
    # distill backlink domain
    backlinking_domains = [url.split('/')[2].split('.')[-2] for url in df['Referring page URL']]

    # freq count
    counter = Counter(backlinking_domains)

    # find top 30
    top_30 = counter.most_common(30)
    return(top_30)

#### Djoser

In [24]:
top_30_djoser = top_30(df_djoser)
top_30_djoser

[('blogspot', 1706),
 ('vakantieboeken2022', 301),
 ('prlog', 254),
 ('reisbijbel', 221),
 ('bestereistijd', 219),
 ('djoser', 195),
 ('avontuurlijk-reizen', 175),
 ('rondreizen-zwerftochten', 146),
 ('goedkopegroepsreis', 141),
 ('rondreiskoning', 120),
 ('jouwpagina', 114),
 ('rondreizen', 109),
 ('wereldreis', 106),
 ('landenkompas', 100),
 ('singleplus', 100),
 ('groepsrondreis', 93),
 ('startpagina', 92),
 ('jouwzonvakantie', 92),
 ('keywordsbasket', 86),
 ('wazzzup', 85),
 ('verrereizenmetkinderen', 83),
 ('vakantievoortieners', 79),
 ('azie', 78),
 ('travelersmagazine', 78),
 ('startbewijs', 74),
 ('reizenlink', 73),
 ('reizen-in-azie', 71),
 ('travelcompleta', 66),
 ('vakantie-met-kinderen', 64),
 ('travelvalley', 61)]

#### Print startpagina.nl domains

In [38]:
# unique startpagina domains
startpagina_ls = list(set([url for url in df_djoser['Referring page URL'] if 'startpagina' in url]))
print(len(startpagina_ls))
startpagina_ls

71


['https://brazilie.startpagina.net/',
 'https://safari.startpagina.net/',
 'https://duurzame-vakantie.startpagina.nl/',
 'https://singles-reizen.startpagina.nl/',
 'https://reisorganisaties.startpagina.nl/',
 'https://groepsreizen.startpagina.nl/',
 'https://wereldreis.startpagina.nl/',
 'https://srilanka.startpagina.nl/',
 'https://rondreizen.startpagina.net/',
 'https://vakantie.startpaginasite.nl/rubrieken/groepsreizen/',
 'https://fietstochten.startpaginas.org/',
 'https://buitenland-reizen.jouw-startpagina.nl/',
 'https://schipholparkeren.jestartpagina.nl/',
 'https://vakanties.startpagina.net/',
 'https://madagascar.startpagina.nl/',
 'https://vanvonderenfinever37.blogspot.com/2021/08/west-amerika-kaart-amerikastartpaginanl.html',
 'https://reizen.startpagina.net/',
 'https://india.startpagina.nl/forum/topic/864572/eerste-keer-naar-india/?page=4',
 'http://vakanties-startpagina.nl/',
 'https://vakantie-reis.coole-startpagina.nl/',
 'https://reizen.startpagina.nl/',
 'https://azie

#### Inholland

In [5]:
top_30_inholland = top_30(df_inholland)
top_30_inholland

[('blogspot', 481),
 ('cedeo', 459),
 ('c99', 342),
 ('symbaloo', 220),
 ('freekeyworddifficultytool', 106),
 ('com', 94),
 ('fica', 93),
 ('linkddl', 84),
 ('co', 74),
 ('maevemusic', 74),
 ('google', 69),
 ('azztimes', 69),
 ('nl-inloggen', 68),
 ('wazzzup', 66),
 ('wijinholland', 66),
 ('nationaleonderwijsgids', 66),
 ('cordylink', 64),
 ('wikibacklink', 64),
 ('bestkeywordtools', 62),
 ('prlog', 60),
 ('analyzim', 59),
 ('yurls', 57),
 ('hmaillogin', 54),
 ('inlogportal', 53),
 ('personalpages', 52),
 ('office', 51),
 ('inlogdatabase', 48),
 ('find-study-now', 48),
 ('startpagina', 47),
 ('inloggenl', 47)]

#### Vattenfall

In [6]:
top_30_vattenfall = top_30(df_vattenfall)
top_30_vattenfall

[('fok', 2147),
 ('wazzzup', 324),
 ('blogspot', 312),
 ('nl-inloggen', 121),
 ('inlogportal', 98),
 ('inloggenl', 82),
 ('c99', 77),
 ('polderpv', 76),
 ('inlogdatabase', 72),
 ('azurewebsites', 69),
 ('maevemusic', 58),
 ('vattenfall', 52),
 ('nieuws', 45),
 ('startpagina', 38),
 ('analyzim', 38),
 ('kontactr', 36),
 ('bolo-bolo', 36),
 ('spiritguidesociety', 35),
 ('directorylib', 35),
 ('tweakers', 34),
 ('loginlist', 33),
 ('co', 33),
 ('startbewijs', 30),
 ('symbaloo', 29),
 ('jouwpagina', 29),
 ('expertpagina', 28),
 ('personalpages', 27),
 ('wikibacklink', 26),
 ('freekeyworddifficultytool', 23),
 ('uwpagina', 22)]

#### Intersection between top-30 domains

In [7]:
djoser_top_30_ls = [elm[0] for elm in top_30_djoser]
inholland_top_30_ls = [elm[0] for elm in top_30_inholland]
vattenfall_top_30_ls = [elm[0] for elm in top_30_vattenfall]

In [8]:
list(set(djoser_top_30_ls) & set(inholland_top_30_ls) & set(vattenfall_top_30_ls))

['startpagina', 'wazzzup', 'blogspot']

#### Not only frequency of backlinkg, also rating and traffic of backlinking domains counts for SEO 

In [9]:
def summary_table(top_30_ls,df):

    # initialize lists
    freq_ls = []
    rating_ls = []
    traffic_ls = []

    for domain in top_30_ls:
        # domain freq in backlinks
        df_domain = df[df['Referring page URL'].str.contains(domain)]
        freq_ls.append(df_domain.shape[0])

        # average domain rating
        avg_domain_rating = df_domain['Domain rating'].mean()
        rating_ls.append(avg_domain_rating)

        # average domain traffic
        avg_domain_traffic = df_domain['Domain traffic'].mean()
        traffic_ls.append(avg_domain_traffic)

    # display format large numbers
    pd.set_option('display.float_format', '{:.2f}'.format)

    # create new dataframe
    summary_table = pd.DataFrame([freq_ls,rating_ls,traffic_ls], columns=top_30_ls).T
    summary_table.columns = ['freq','rating','traffic']
    return(summary_table)

In [10]:
summary_table(djoser_top_30_ls,df_djoser)

Unnamed: 0,freq,rating,traffic
blogspot,1707.0,0.13,17.58
vakantieboeken2022,301.0,0.0,26.0
prlog,254.0,70.0,5854.0
reisbijbel,221.0,29.0,1657.0
bestereistijd,219.0,37.0,77365.0
djoser,825.0,37.94,1265241.85
avontuurlijk-reizen,177.0,17.49,254.9
rondreizen-zwerftochten,146.0,10.0,4.0
goedkopegroepsreis,141.0,25.0,916.0
rondreiskoning,120.0,9.0,72.0


In [11]:
summary_table(inholland_top_30_ls,df_inholland)

Unnamed: 0,freq,rating,traffic
blogspot,481.0,1.56,8.48
cedeo,459.0,32.87,230.67
c99,342.0,35.0,49297.0
symbaloo,220.0,76.0,141590.0
freekeyworddifficultytool,106.0,36.0,138.0
com,3638.0,27.24,1352006.56
fica,119.0,13.7,8389.71
linkddl,84.0,34.0,178.0
co,4464.0,28.54,1737561.36
maevemusic,74.0,30.0,229.0


In [12]:
summary_table(vattenfall_top_30_ls,df_vattenfall)

Unnamed: 0,freq,rating,traffic
fok,2148.0,63.98,126980.57
wazzzup,324.0,2.6,58948.0
blogspot,312.0,0.83,123.76
nl-inloggen,168.0,2.51,294417.39
inlogportal,98.0,0.1,75611.0
inloggenl,82.0,0.3,239811.0
c99,77.0,34.0,52506.0
polderpv,76.0,32.0,147.0
inlogdatabase,72.0,0.4,131287.0
azurewebsites,69.0,5.35,23.81
