In [461]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse
import re
import numpy as np
from wordcloud import WordCloud 
from PIL import Image

In [462]:
# Setting up variables for grabbing the pagination
pagination = 1
forbidden_pages = ["Vorige", "Volgende"] # Filtering out prev/next buttons
all_pages = [1]
all_page_content = []

# Dumpert user input
keyword = input("Search Query: ")
keyword = keyword.replace(" ","e%20")

Search Query: zwarte piet


In [463]:
# Functions for getting pagination and getting all individual pages around our keyword. 
def loadsite():
    req = requests.get(f"https://www.dumpert.nl/search/ALL/{keyword}/{pagination}")
    if req.status_code != 200:
        print(f"something is wrong - {req.status_code}")
    else:
        print(f"{req.status_code} - all good. We're scraping it...")
    html = req.text
    soup = BeautifulSoup(html, "lxml")
    return soup

def scrape():
    global pagination
    soup = loadsite()
    page_results = soup.select(".dumpthumb")
    index = 0
    
    # Getting every item on the page
    for blockitem in page_results:
        url = blockitem['href']
        short_description = soup.select(".description")[index].get_text().strip()
        item = {
            "url": url,
            "short_description": short_description,
            "search_page": pagination
            }
        all_page_content.append(item)
        index = index + 1
    
    # Update pagination    
    dumpert_pagination = soup.select(".pagination li a")
    for page in dumpert_pagination:
        page = page.get_text().strip()
        if page not in forbidden_pages:
            page = int(page)
            if page not in all_pages:
                all_pages.append(page)
    
    # Check if on last page
    pagination = pagination + 1
    print(f'Currently on page number: {pagination}')
        
    # Check if on last page, else start over again.
    if pagination in all_pages:
        scrape()
    else:
        print("")
        print("no more content found! we're done here.")

# Calling in the main function        
scrape()

# Showing all pages that have been scraped
print("")
print("Full Pagination:")
print(all_pages)

200 - all good. We're scraping it...
Currently on page number: 2
200 - all good. We're scraping it...
Currently on page number: 3
200 - all good. We're scraping it...
Currently on page number: 4
200 - all good. We're scraping it...
Currently on page number: 5
200 - all good. We're scraping it...
Currently on page number: 6
200 - all good. We're scraping it...
Currently on page number: 7
200 - all good. We're scraping it...
Currently on page number: 8
200 - all good. We're scraping it...
Currently on page number: 9
200 - all good. We're scraping it...
Currently on page number: 10
200 - all good. We're scraping it...
Currently on page number: 11
200 - all good. We're scraping it...
Currently on page number: 12
200 - all good. We're scraping it...
Currently on page number: 13
200 - all good. We're scraping it...
Currently on page number: 14
200 - all good. We're scraping it...
Currently on page number: 15
200 - all good. We're scraping it...
Currently on page number: 16
200 - all good. We

In [465]:
# Checking our results.
df = pd.DataFrame(all_page_content)
df.head(10)

Unnamed: 0,search_page,short_description,url
0,1,De Nederlandse traditie steekt de grens over.,https://www.dumpert.nl/mediabase/7554565/c2c53...
1,1,Boodschap vanaf de Bovenwindse Eilanden voor S...,https://www.dumpert.nl/mediabase/7541103/f8ad9...
2,1,En Sylvana houdt haar mond.,https://www.dumpert.nl/mediabase/7541047/cad2f...
3,1,Twan Huys geeft geen neuk om afspraken. \r\n-e...,https://www.dumpert.nl/mediabase/7540975/3f6f0...
4,1,Bij de rechtbank met het Friese volkslied,https://www.dumpert.nl/mediabase/7540567/f6916...
5,1,"Ja meneer de rechter, echt waar",https://www.dumpert.nl/mediabase/7537715/45cd2...
6,1,Blacklight doet het weer,https://www.dumpert.nl/mediabase/7512333/f5cb0...
7,1,Het is allemaal de schuld van Apple (Via AT5),https://www.dumpert.nl/mediabase/7487965/7d50a...
8,1,"Snel gehandeld, bijna geen Friet van Piet meer...",https://www.dumpert.nl/mediabase/7450889/d45f5...
9,1,Zwarte Piet is vroeg dit jaar!,https://www.dumpert.nl/mediabase/7437735/627ab...


In [466]:
# Getting data from each individual URL, mainly the tags being important.
def get_pageinfo():
    for item in all_page_content:
        req = requests.get(item["url"])
        # progress
        hundredpercent = len(all_page_content)
        currentlocation = all_page_content.index(item) + 1
        currentpercent = round((currentlocation / hundredpercent) * 100.0,1)
        print(f"Progress: {currentlocation}/{hundredpercent} ({currentpercent}%)")
        if req.status_code != 200:
            print(f"yikes, something is wrong - {req.status_code}.")
        else:
            print(f"{req.status_code} - all good. Scraping the following URL:")
            print(item['url'])
            print("")
        html = req.text
        soup = BeautifulSoup(html, "lxml")
        
        # meta info
        details = soup.select(".dump-amt")
        kudos = details[0].get_text().strip()
        kudos = kudos.replace("â€‘","-")
        today_kudos = details[1].get_text().strip()
        views = details[2].get_text().strip()
        today_views = details[3].get_text().strip()
        tags = soup.select("div.dump-desc > div > ul > li")
        tags_clean = []
        for tag in tags:
             tags_clean.append(tag.get_text().strip())
        
        # putting it back into the lists
        item["tags"] = tags_clean
        item["views"] = views
        item["kudos"] = kudos
        
        # need to parse the URL because the PAGE ID's aren't all the same length within the links unfortunately...
        url = item['url']
        parse_object = urlparse(url)
        path = parse_object.path
        path_cut = path.rsplit("/")
        page_id = (f"{path_cut[2]}/{path_cut[3]}")
        item["page_id"] = page_id
        
# Main function, getting the data of the individual pages
get_pageinfo()

print("Done. All pages scraped for metadata.")

Progress: 1/365 (0.3%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7554565/c2c53239/intocht_van_de_pietendiscussie.html

Progress: 2/365 (0.5%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7541103/f8ad9316/even_wat_zwarte_pieten_aan_de_tand_voelen.html

Progress: 3/365 (0.8%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7541047/cad2fc90/simon_is_piet.html

Progress: 4/365 (1.1%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7540975/3f6f0b2e/de_late_night_tafel_is_van_iedereen.html

Progress: 5/365 (1.4%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7540567/f691611f/blokkeerfriezen_worden_als_helden_onthaald.html

Progress: 6/365 (1.6%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7537715/45cd22be/blokkeerfries_heeft_lolbroek_aan.html

Progress: 7/365 (1.9%)
200 - all good. Scraping the following U

Progress: 55/365 (15.1%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7248125/8291a5b5/zwarte_piet_zingt_een_liedje.html

Progress: 56/365 (15.3%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7216821/3f0fab11/paulusma_doet_het_met_shakira.html

Progress: 57/365 (15.6%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7104625/3278b42b/selfie_met_piet_paulusma.html

Progress: 58/365 (15.9%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7081309/86b67561/jan_en_piet_geven_zichzelf_aan.html

Progress: 59/365 (16.2%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7068827/457fef64/dumpertreeten_(94).html

Progress: 60/365 (16.4%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/7063699/4e22fc3b/vrouw_over_zwarte_piet.html

Progress: 61/365 (16.7%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/m

Progress: 109/365 (29.9%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6923585/0597f032/kut_blokker_.html

Progress: 110/365 (30.1%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6922841/265aecb9/dumpert_korte_film.html

Progress: 111/365 (30.4%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6922027/22d2a3c5/lil_wayne_weet_niet_wat_racisme_is.html

Progress: 112/365 (30.7%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6920027/69d15578/zo_trots_als_een_aapje.html

Progress: 113/365 (31.0%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6904755/d8cb5020/kkk_vangt_zwarte_piet.html

Progress: 114/365 (31.2%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6903501/c2be7eff/15_euro,_alles_mogelijk_in_overleg.html

Progress: 115/365 (31.5%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/medi

Progress: 163/365 (44.7%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6632921/08684bb0/pesten_op_het_werk.html

Progress: 164/365 (44.9%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6630914/a427ca0f/grand_canyon_doet_gek_met_wolken.html

Progress: 165/365 (45.2%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6628774/ed2e894a/nog_meer_sint_piet_bloopers.html

Progress: 166/365 (45.5%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6628733/59069963/daniel_over_zwarte_piet.html

Progress: 167/365 (45.8%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6628745/6a9903b7/sint_piet_bloopert.html

Progress: 168/365 (46.0%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6628687/b0c70af0/zwarte_pietdocu_zwart_als_roet.html

Progress: 169/365 (46.3%)
200 - all good. Scraping the following URL:
https://www.dumpert.

Progress: 215/365 (58.9%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6620854/e5161952/re_ah_advertenties.html

Progress: 216/365 (59.2%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6620601/c0e7c6a9/en_dit_is_nederland.html

Progress: 217/365 (59.5%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6620495/30b684a9/gevonden_een_zeurpiet.html

Progress: 218/365 (59.7%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6620486/82101f20/pieten_boycotten_albert_heijn.html

Progress: 219/365 (60.0%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6620357/aea791a0/sinterklaas_en_de_islamiet_piet.html

Progress: 220/365 (60.3%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6620343/705fa32e/sjaak_heeft_de_zwarte_pieten_oplossing_.html

Progress: 221/365 (60.5%)
200 - all good. Scraping the following URL:
https://www

Progress: 269/365 (73.7%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6538477/2dc3de84/hongaarse_reaguurders.html

Progress: 270/365 (74.0%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6530612/252ae051/dumpert_heeft_talent.html

Progress: 271/365 (74.2%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6447032/2fba46d8/zwarte_pieten_voor_santa.html

Progress: 272/365 (74.5%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6444232/3ce1c986/piet_geeft_kado.html

Progress: 273/365 (74.8%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6442602/e3e51d1a/zwarte_piet_blijkt_van_onderen_wit.html

Progress: 274/365 (75.1%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6435922/ad3256c7/h_man.html

Progress: 275/365 (75.3%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/6377601/4c62267c

Progress: 327/365 (89.6%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/316321/492c53e3/piet_paulusma.html

Progress: 328/365 (89.9%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/316241/91d4dc79/propaganda_piet.html

Progress: 329/365 (90.1%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/316131/e815ed3f/they_re_back_.html

Progress: 330/365 (90.4%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/315951/67ebea02/wat_verwacht_je.html

Progress: 331/365 (90.7%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/315941/7656da60/hefner_piet.html

Progress: 332/365 (91.0%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/315921/1cce12a8/drostepiet.html

Progress: 333/365 (91.2%)
200 - all good. Scraping the following URL:
https://www.dumpert.nl/mediabase/315911/5647357f/piet_bekend_kleur.html

Progress: 334/365 (91.5%

In [467]:
# Putting it into dataframe for visibility
extended_df = pd.DataFrame(all_page_content)
extended_df
#extended_df.to_csv("extended.csv")

Unnamed: 0,kudos,page_id,search_page,short_description,tags,url,views
0,7581,7554565/c2c53239,1,De Nederlandse traditie steekt de grens over.,"[sinterklaas, ideale, wereld, zwarte, piet, pi...",https://www.dumpert.nl/mediabase/7554565/c2c53...,129313
1,12383,7541103/f8ad9316,1,Boodschap vanaf de Bovenwindse Eilanden voor S...,"[racisme, zwarte, piet, sinterklaas, feest, cu...",https://www.dumpert.nl/mediabase/7541103/f8ad9...,423030
2,9292,7541047/cad2fc90,1,En Sylvana houdt haar mond.,"[tahamata, simon, voetbal, zwarte, piet, sylvana]",https://www.dumpert.nl/mediabase/7541047/cad2f...,451084
3,‑5725,7540975/3f6f0b2e,1,Twan Huys geeft geen neuk om afspraken. \r\n-e...,"[zwarte, piet, discussie, racisme, gezeik, blo...",https://www.dumpert.nl/mediabase/7540975/3f6f0...,326759
4,8624,7540567/f691611f,1,Bij de rechtbank met het Friese volkslied,"[rechtbank, friesland, blokkeren, zingen, zwar...",https://www.dumpert.nl/mediabase/7540567/f6916...,320988
5,4059,7537715/45cd22be,1,"Ja meneer de rechter, echt waar","[wijsneus, blokkeerfries, friesland, zwarte, p...",https://www.dumpert.nl/mediabase/7537715/45cd2...,409402
6,4327,7512333/f5cb0934,1,Blacklight doet het weer,"[win, piet, paulusma, autist, boos, kees, vide...",https://www.dumpert.nl/mediabase/7512333/f5cb0...,329448
7,4898,7487965/7d50a524,1,Het is allemaal de schuld van Apple (Via AT5),"[fiets, fietser, tunnel, fail, apple, navigati...",https://www.dumpert.nl/mediabase/7487965/7d50a...,541218
8,566,7450889/d45f5f2d,1,"Snel gehandeld, bijna geen Friet van Piet meer...","[cctv, vlam, snackbar, vuur, fail, frituur]",https://www.dumpert.nl/mediabase/7450889/d45f5...,145897
9,685,7437735/627ab99d,1,Zwarte Piet is vroeg dit jaar!,"[cctv, achtervolging, auto, dak, slinger, zo, ...",https://www.dumpert.nl/mediabase/7437735/627ab...,135528


In [468]:
# Getting the comments into the dataframe
index = 0

comment_super_heaven = [] 

for item in all_page_content:
    page_id = extended_df['page_id'][index]
    req = requests.get(f"https://comments.dumpert.nl/embed/{page_id}/comments")
    html = req.text
    soup = BeautifulSoup(html, "lxml")
    all_comments = soup.select(".cmt-content")
    index = index + 1
    comment_heaven = []
    
    # Adding to dataframe, also put in loose list so we can get straight to visualizing it.
    for comment in all_comments:
        comment_text = comment.get_text().strip()
        comment_heaven.append(comment_text)
        comment_super_heaven.append(comment_text)
    
    # Progress
    amount_of_comments = len(comment_heaven)
    amount_of_pages = len(all_page_content)
    progress_percentage = round((index/amount_of_pages) * 100.0,1)
    item["comment_amount"] = amount_of_comments
    print(f"got {amount_of_comments} comments from page id: {page_id} ({progress_percentage}%)")
    item["comments"] = comment_heaven

final_df = pd.DataFrame(all_page_content)
comments_galore = final_df['comment_amount'].sum()
print(f"Done - {comments_galore} total comments.")

got 97 comments from page id: 7554565/c2c53239 (0.3%)
got 132 comments from page id: 7541103/f8ad9316 (0.5%)
got 157 comments from page id: 7541047/cad2fc90 (0.8%)
got 293 comments from page id: 7540975/3f6f0b2e (1.1%)
got 305 comments from page id: 7540567/f691611f (1.4%)
got 111 comments from page id: 7537715/45cd22be (1.6%)
got 100 comments from page id: 7512333/f5cb0934 (1.9%)
got 420 comments from page id: 7487965/7d50a524 (2.2%)
got 91 comments from page id: 7450889/d45f5f2d (2.5%)
got 111 comments from page id: 7437735/627ab99d (2.7%)
got 174 comments from page id: 7395695/4ea7c610 (3.0%)
got 45 comments from page id: 7383659/a7b3b1bf (3.3%)
got 334 comments from page id: 7372861/1c64befc (3.6%)
got 155 comments from page id: 7313789/da347255 (3.8%)
got 88 comments from page id: 7309751/1e9fc81b (4.1%)
got 162 comments from page id: 7304351/3b83b82b (4.4%)
got 60 comments from page id: 7303027/32f171d9 (4.7%)
got 57 comments from page id: 7302781/f77621c3 (4.9%)
got 205 comments

got 45 comments from page id: 6687498/fd852c1e (41.1%)
got 78 comments from page id: 6678575/0b9072cb (41.4%)
got 45 comments from page id: 6670169/b90242a6 (41.6%)
got 34 comments from page id: 6667781/01cd1021 (41.9%)
got 45 comments from page id: 6659002/d89ea461 (42.2%)
got 53 comments from page id: 6657964/c252fde0 (42.5%)
got 27 comments from page id: 6656848/b7478db7 (42.7%)
got 22 comments from page id: 6656558/eff6806f (43.0%)
got 60 comments from page id: 6653153/06ba8639 (43.3%)
got 27 comments from page id: 6640751/4a0f2063 (43.6%)
got 54 comments from page id: 6638487/5d19ab02 (43.8%)
got 66 comments from page id: 6635625/d6caf3cb (44.1%)
got 27 comments from page id: 6632762/99626c27 (44.4%)
got 30 comments from page id: 6632921/08684bb0 (44.7%)
got 32 comments from page id: 6630914/a427ca0f (44.9%)
got 31 comments from page id: 6628774/ed2e894a (45.2%)
got 50 comments from page id: 6628733/59069963 (45.5%)
got 24 comments from page id: 6628745/6a9903b7 (45.8%)
got 55 com

got 9 comments from page id: 317541/55a1dfca (81.9%)
got 20 comments from page id: 316971/e6c07364 (82.2%)
got 8 comments from page id: 317021/0607dd46 (82.5%)
got 10 comments from page id: 317201/b3d742e5 (82.7%)
got 5 comments from page id: 316641/5c302236 (83.0%)
got 5 comments from page id: 316651/faef93cf (83.3%)
got 10 comments from page id: 316631/3e7fe095 (83.6%)
got 1 comments from page id: 316621/db2f32df (83.8%)
got 7 comments from page id: 316531/6e6e5730 (84.1%)
got 2 comments from page id: 316481/299480a8 (84.4%)
got 4 comments from page id: 316151/461d56cd (84.7%)
got 2 comments from page id: 316011/c78f15f7 (84.9%)
got 3 comments from page id: 315891/87e9034e (85.2%)
got 2 comments from page id: 317071/930c32ce (85.5%)
got 7 comments from page id: 317151/b9a27c0c (85.8%)
got 10 comments from page id: 316881/d8069bd8 (86.0%)
got 20 comments from page id: 316741/7a585c15 (86.3%)
got 19 comments from page id: 316541/6f32f710 (86.6%)
got 21 comments from page id: 316591/4c1

In [469]:
final_df

Unnamed: 0,comment_amount,comments,kudos,page_id,search_page,short_description,tags,url,views
0,97,[Belgen die een Nederlandse onzinnige discussi...,7581,7554565/c2c53239,1,De Nederlandse traditie steekt de grens over.,"[sinterklaas, ideale, wereld, zwarte, piet, pi...",https://www.dumpert.nl/mediabase/7554565/c2c53...,129313
1,132,[Als Sylvana niet het verschil ziet tussen de ...,12383,7541103/f8ad9316,1,Boodschap vanaf de Bovenwindse Eilanden voor S...,"[racisme, zwarte, piet, sinterklaas, feest, cu...",https://www.dumpert.nl/mediabase/7541103/f8ad9...,423030
2,157,[Ik word altijd misselijk als ik Sylvana zie. ...,9292,7541047/cad2fc90,1,En Sylvana houdt haar mond.,"[tahamata, simon, voetbal, zwarte, piet, sylvana]",https://www.dumpert.nl/mediabase/7541047/cad2f...,451084
3,293,"[-weggejorist-, Wat is die vent een bal., www....",‑5725,7540975/3f6f0b2e,1,Twan Huys geeft geen neuk om afspraken. \r\n-e...,"[zwarte, piet, discussie, racisme, gezeik, blo...",https://www.dumpert.nl/mediabase/7540975/3f6f0...,326759
4,305,"[Woord van het jaar 2018: blokkeerfries, 5 dec...",8624,7540567/f691611f,1,Bij de rechtbank met het Friese volkslied,"[rechtbank, friesland, blokkeren, zingen, zwar...",https://www.dumpert.nl/mediabase/7540567/f6916...,320988
5,111,[Waarom moet een randstadneger in friesland ga...,4059,7537715/45cd22be,1,"Ja meneer de rechter, echt waar","[wijsneus, blokkeerfries, friesland, zwarte, p...",https://www.dumpert.nl/mediabase/7537715/45cd2...,409402
6,100,"[piet, ik zou je dochter doen, Kees, ik zou je...",4327,7512333/f5cb0934,1,Blacklight doet het weer,"[win, piet, paulusma, autist, boos, kees, vide...",https://www.dumpert.nl/mediabase/7512333/f5cb0...,329448
7,420,[Nee als een zombie blindelings je navigatie v...,4898,7487965/7d50a524,1,Het is allemaal de schuld van Apple (Via AT5),"[fiets, fietser, tunnel, fail, apple, navigati...",https://www.dumpert.nl/mediabase/7487965/7d50a...,541218
8,91,[Water op olie? Dit had anders moeten aflopen ...,566,7450889/d45f5f2d,1,"Snel gehandeld, bijna geen Friet van Piet meer...","[cctv, vlam, snackbar, vuur, fail, frituur]",https://www.dumpert.nl/mediabase/7450889/d45f5...,145897
9,111,[Grosjean die thuis komt van een weekendje F1....,685,7437735/627ab99d,1,Zwarte Piet is vroeg dit jaar!,"[cctv, achtervolging, auto, dak, slinger, zo, ...",https://www.dumpert.nl/mediabase/7437735/627ab...,135528


In [470]:
# Checking search objects against tags. Have to contain BOTH if there's more than one search word. 
if "e%20" in keyword:
    keyword = keyword.split("e%20")

edited_all = []
edited_onlycomments = []
all_keyw_length = len(keyword)
    
# REMOVING THE UNWANTED ROWS
for item in all_page_content:
    keywordcounter = 0
    for word in keyword:
        if word in item["tags"]:
            keywordcounter = keywordcounter + 1
    if keywordcounter == all_keyw_length:
        edited_all.append(item)
        edited_onlycomments.append(item["comments"])

In [471]:
# Checking how many entries there are after filtering by TAGS. 
amount_matching_entries = len(edited_all)
print(f"{amount_matching_entries} entries that match the following keyword(s): {keyword}")

225 entries that match the following keyword(s): ['zwarte', 'piet']


In [473]:
edited_df = pd.DataFrame(edited_all)
edited_df.tail(5)

Unnamed: 0,comment_amount,comments,kudos,page_id,search_page,short_description,tags,url,views
220,17,"[Kijkcijferkanon! +1, mijn nieuwe ringtone!, M...",788,33725/f8468566,24,Sinterklaasliedje!,"[sinterklaas, zwarte, piet, depla, manon]",https://www.dumpert.nl/mediabase/33725/f846856...,26541
221,19,"[whahahahahahhaa +1, wow, een zwarte die werkt...",769,33449/bf5208f8,24,Zie ginds komt de huisvuilbelader,"[zwarte, piet, vuilnis, sinterklaas]",https://www.dumpert.nl/mediabase/33449/bf5208f...,27455
222,69,"[HAHAHAHAHAHAHAHAH, niet leuk ^oS, -1 Zielig m...",2056,32245/c3d9aff0,24,"Wie zoet is krijgt lekkers, wie stout is de HAK.","[sinterklaas, zwarte, piet]",https://www.dumpert.nl/mediabase/32245/c3d9aff...,73459
223,32,[Kijkt dat wijf nou echt zo scheel of had ik d...,307,31948/79573866,24,Met ECHTE zwarte Pieten,"[sinterklaas, zwarte, piet, nps, dieuwertje]",https://www.dumpert.nl/mediabase/31948/7957386...,23595
224,30,"[ga es deaud met je óófd, sneu volk daar.........",3791,28308/f436d1c3,24,Ook in de mijnen van Vladiwostok vieren ze sin...,"[zwarte, piet, rusland, bezopen, dronken]",https://www.dumpert.nl/mediabase/28308/f436d1c...,111712


In [458]:
#edited_onlycomments[1]

In [448]:
comment_endboss = []

for item in edited_onlycomments:
    comment_string = ' '.join(item)
    comment_endboss.append(comment_string)

one_big_comment_string = ' '.join(comment_endboss).lower()

# 'for item in comment_super_heaven:' could've been used for all comments in one string without the extra filtering with the tags we did before.    

In [449]:
# Comment Cleaning, so we can get meaningful results.
endclean = re.sub(r'[^a-z ]', '', one_big_comment_string)
word_list = endclean.split()
forbidden_words = ["mensen","vind","gewoon","weer","zelf","iedereen","willen","laten","juist","that","word","staan","waren","artikel","werd","bent","geweest","zitten","weten","genoeg","ander","lang","persoon","best","gebruiken","hadden","bijvoorbeeld","tussen","hand","zitten","bent","geweest","waarom","zeggen","zoals","paar","misschien","zien","zich","altijd","naar","gezien","mijn","want","beetje","echt","niet","maar","zijn","voor","achter","over","onder","hebben", "door","meer","heeft","hier","geven","niets","niks","laat","eigenlijk","zullen","iemand","steeds","deleted","lijkt","komen","komt","geen","worden","wordt","helemaal","kunnen","alsof","even","deze","heel","alleen","hele","veel","waar","moeten","omdat","toch","worden","wordt","gaan","moet","andere","eens","doen","daar","iets","gaat","maken","nooit","allemaal","alle","manier","anders","grote","weet","dingen","denken","vraag","alles","this","enige","maakt","krijgen","doet","ziet","zegt","zouden","zonder","keer","alsof"]
word_string = ' '.join([word for word in word_list if word not in forbidden_words])
word_string = re.sub(r'\b\w{1,3}\b', '', word_string)
word_string = re.sub(' +',' ', word_string)


In [450]:
word_string

' sylvana verschil pikzwart geschminkte pieten bruine racist viel blanke mannenracist sylvana simons zwarte mannen valt probleem vrouwen zwart zwarte mannen vallen vaak lopen tegen dezelfde ontrouw onbetrouwbaarheid onvermogen mannen gevoelens praten nederlandse vriendin donkere vriend samen vakantie noem krijgt opeens telefoontje blijkt punt trouwenen donkere mannen gewelddadiger blanke antwoordt zelfs indruk weggejorist mand antonjensen wanneer gezegd benieuwd verder zwartepietendiscussie mengen tegen onzinredenaties thecone onzin sylvana verschil herkent zelfs blackface opzoekt internet leest daarna inhoudelijk reagerenen alsjeblieft burgeroorlog onbenullig onderwerp veroorzaken verder plezier ermee verschil shirt churandy martina bijna vorig jaar curacao tijdens sannikolasde pieten konden zwart schooltjes hingen versiering winkeliers nederland forse glasschade sarcasm slavernijverleden inderdaad traumatische ervaring jewelste sarcasm grappig feitje niemand praat ontzettend profijt 

In [454]:
# Visualization, creating wordcloud
mask = np.array(Image.open("wordcloudBG.png"))
wc = WordCloud(background_color="white",mask=mask,max_words=250)
wc.generate(word_string)
wc.to_file("dumpert_wordcloud.png")

<wordcloud.wordcloud.WordCloud at 0x12788c62b38>