In [1]:
import requests
import os
import glob
from bs4 import BeautifulSoup
import pandas as pd
import string 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
import matplotlib.pyplot as plt 
from sklearn.metrics.pairwise import cosine_similarity 
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram
import numpy as np

# Creating the dataframe

In [3]:
# load in the html files
path = "NPO/NPO/shows"

data = []

#Loop over all the files:

def create_npo(path):
    titles = []
    descriptions = []
    pictures = []
    for file in os.listdir(path):
        filename = path + "/" + file
        if './data/show' not in file:
            soup = BeautifulSoup(open(filename, encoding="utf8"), 'html.parser')

            # extract the title, description and image from each file
            if soup.find('meta',  attrs={'property': 'og:title'}):
                title = soup.find('meta',  attrs={'property': 'og:title'})['content']
                title2 = title.replace(" gemist? Start met kijken op NPO Start", " ")
                titles.append(title2)
            else:
                titles.append('')

            if soup.find('meta',  attrs={'property': 'og:description'}):
                descriptions.append(soup.find('meta',  attrs={'property': 'og:description'})['content'])
            else:
                descriptions.append(" ")

            if soup.find('meta',  attrs={'property': 'og:image'}):
                pictures.append(soup.find('meta',  attrs={'property': 'og:image'})['content'])
            else:
                pictures.append('')

    list_of_tuples = list(zip(titles,descriptions,pictures))
    df = pd.DataFrame(list_of_tuples,
                      columns = ['titles','descriptions','pictures'])

    return(df)

In [4]:
df = create_npo(path)

In [149]:
df.head(3)

Unnamed: 0,titles,descriptions,pictures,ID,spacy,spacy2,total,nature,history,travel,romantic,diversity,educational,final_text,k_means,crime,action,politics,teen,reality
0,1 Euro per gesprek,Bert Kuizenga gaat open in gesprek met voorbij...,https://images.npo.nl/header/2560x1440/1652869...,0,"[bert, kuizenga, gaan, open, gesprek, voorbijg...",bert kuizenga gaan open gesprek voorbijganger ...,1 euro per gesprek bert kuizenga gaan open ge...,0,0,0,0,1,0,euro per gesprek bert kuizenga gaan open ge...,1,0,0,0,0,0
1,1 Voor de Verkiezingen,"In de maand september, middenin het hart van d...",https://images.npo.nl/header/2560x1440/139875.jpg,1,"[maand, september, middenin, hart, nederlands,...",maand september middenin hart nederlands verki...,1 voor de verkiezingen maand september midden...,0,0,0,0,0,0,voor de verkiezingen maand september midden...,11,0,0,0,0,0
2,10 jaar TV Draait Door,De leukste rubriek van Nederland bestaat 10 ja...,https://images.npo.nl/header/2560x1440/579842.jpg,2,"[leuk, rubriek, nederland, bestaan, jaar, vier...",leuk rubriek nederland bestaan jaar vieren ter...,10 jaar draait door leuk rubriek nederland b...,0,0,0,0,0,0,jaar draait door leuk rubriek nederland b...,11,0,0,0,0,0


# Lots of data cleaning

In [8]:
df = df[df.titles != '']
df = df[df.descriptions != '']
df = df[df.descriptions != ' ']
df = df[df.pictures != '']

In [9]:
len(df)

2698

In [10]:
#Add index ID column
df['ID'] = range(0, len(df)-1 + 1)

In [11]:
#Create new df to do analysis:
npo_df = df


In [12]:
import spacy

nlp = spacy.load("nl_core_news_sm")
flatten = lambda t: [item for sublist in t for item in sublist]

def lemmatize(texts):
    
    # nouns and verbs
    type_of_words = ['NOUN', 'ADJ','VERB','PROPN']
    
    processed_texts = [text for text in nlp.pipe(texts, 
                                              disable=["ner",
                                                       "parser"])]
    
    # the lemma of a word if the word is not a stop word and the word is a noun or a verb
    tokenized_text = [[word.lemma_.lower() for word in processed_text if word.pos_ in type_of_words
                                and not word.is_stop] for processed_text in processed_texts]
    #tokenized_text = [word.lemma_.lower() for word in processed_text if word.pos_ in type_of_words and not word.is_stop]
    
    
    
    # return the tokenized, lemmatized text
    return tokenized_text

In [13]:
#Define the function preprocess to that strips out any punctionation from the input.
def preprocess(text):
    return text.translate(str.maketrans('', '', string.punctuation))


In [14]:
npo_df['spacy'] = lemmatize(npo_df['descriptions']) 

In [65]:
npo_df['spacy2'] = [' '.join([str(c) for c in lst]) for lst in npo_df['spacy']]
npo_df['total'] = npo_df['titles'].astype(str) + " " + npo_df['spacy2'].astype(str)
npo_df['total'] = npo_df['total'].str.lower()
npo_df.head(2)

Unnamed: 0,titles,descriptions,pictures,ID,spacy,spacy2,total,nature,history,travel,romantic,diversity,educational,final_text,k_means
0,1 Euro per gesprek,Bert Kuizenga gaat open in gesprek met voorbij...,https://images.npo.nl/header/2560x1440/1652869...,0,"[bert, kuizenga, gaan, open, gesprek, voorbijg...",bert kuizenga gaan open gesprek voorbijganger ...,1 euro per gesprek bert kuizenga gaan open ge...,0,0,0,0,1,0,euro per gesprek bert kuizga gaan op gespre...,2
1,1 Voor de Verkiezingen,"In de maand september, middenin het hart van d...",https://images.npo.nl/header/2560x1440/139875.jpg,1,"[maand, september, middenin, hart, nederlands,...",maand september middenin hart nederlands verki...,1 voor de verkiezingen maand september midden...,0,0,0,0,0,0,voor verkiezing maand september midnin har...,1


In [66]:
l = ['tv', 'programma','televisie', 'serie']
npo_df['total'] = npo_df.total.str.replace('|'.join(l), '', regex=True).str.strip()

In [67]:
npo_df['final_text'] = npo_df['total'].str.replace('[^a-zA-Z]', ' ')
npo_df.head(2)

  npo_df['final_text'] = npo_df['total'].str.replace('[^a-zA-Z]', ' ')


Unnamed: 0,titles,descriptions,pictures,ID,spacy,spacy2,total,nature,history,travel,romantic,diversity,educational,final_text,k_means
0,1 Euro per gesprek,Bert Kuizenga gaat open in gesprek met voorbij...,https://images.npo.nl/header/2560x1440/1652869...,0,"[bert, kuizenga, gaan, open, gesprek, voorbijg...",bert kuizenga gaan open gesprek voorbijganger ...,1 euro per gesprek bert kuizenga gaan open ge...,0,0,0,0,1,0,euro per gesprek bert kuizenga gaan open ge...,2
1,1 Voor de Verkiezingen,"In de maand september, middenin het hart van d...",https://images.npo.nl/header/2560x1440/139875.jpg,1,"[maand, september, middenin, hart, nederlands,...",maand september middenin hart nederlands verki...,1 voor de verkiezingen maand september midden...,0,0,0,0,0,0,voor de verkiezingen maand september midden...,1


# The genre labeling

In [126]:
nature = ['natuur', 'de jungle', 'de oceaan', 'dieren', 'milieu', 'klimaat', 'onder water', 'wildernis', 'onderwaterleven', 'waddeneiland']
history = ['geschiedenis','jaar geleden','val van de muur','berlijnse muur','oorlog','joodse leven','joodse familie','gouden eeuw','koloniale','kruistocht','1935','1936','1937','1938','1939','1940','1941','1942','1943','1944','1945','1946','1947','1948','1949','1950','1951','1952','1953','1954','1955','1956','1957','1958','1959','1960','1961','1962','1963','1964','1965','1966','1967','1968','1969','1970','1971','1972','1973','1974','1975','1976','1977','1978','1979','1980','1981','1982','1983','1984','1985','1986','1987','1988','1989','1990','1991','1992','1993','1994','1995','1996','1997','1998']
travel = ['reis', 'reizen']
romantic = [ 'romantiek','romantisch', 'liefde','verliefd', 'trouwen','huwelijk']
diversity = ['subculturen', 'culturen','gemeenschappen', 'samenleving','religie', 'religies','suriname', 'islam','islamitische', 'hindoe','christelijk','indiase', 'seksualiteit']
crime = ['politie', 'recherche', 'flikken', 'geheime diensten', 'agent', 'recherche', 'smeris', 'spoorloos']
action = ['killing eve', 'thriller', 'misdaad', 'moord', 'maffia']
politiek = ['politiek', 'minister', 'president', 'politici', 'rutte']
teen = ['npo zapp', 'tieners','vlogger', 'verliefd op','van god', 'teenage', 'populaire popgroep', 'vlogs', 'mattie', 'matties', 'kinderen voor', "new musical", "ali b"]
reality = ['boer zoekt', 'the great british', 'vier handen', 'is de mol', 'rijdende rechter', 'baby te huur', 'dream school', 'reality', 'first dates', 'streetlab', 'singles', 'reallife']
educational = ['aardrijkskunde','scheikunde','aarde','techniek','educatief','het nieuws','biologische','natuurkundig','natuurkunde','wetenschap']

In [127]:
nat = []

for i in npo_df.total:
    if any(p in i for p in nature):
        nat.append(1)
    else:
        nat.append(0)
        
npo_df['nature'] = nat
npo_df.nature.value_counts()

0    2580
1     118
Name: nature, dtype: int64

In [128]:
his = []

for i in npo_df.total:
    if any(p in i for p in history):
        his.append(1)
    else:
        his.append(0)
        
npo_df['history'] = his
npo_df.history.value_counts()

0    2549
1     149
Name: history, dtype: int64

In [129]:
tra = []

for i in npo_df.total:
    if any(p in i for p in travel):
        tra.append(1)
    else:
        tra.append(0)
        
npo_df['travel'] = tra
npo_df.travel.value_counts()

0    2537
1     161
Name: travel, dtype: int64

In [130]:
rom = []

for i in npo_df.total:
    if any(p in i for p in romantic):
        rom.append(1)
    else:
        rom.append(0)
        
npo_df['romantic'] = rom
npo_df.romantic.value_counts()

0    2586
1     112
Name: romantic, dtype: int64

In [131]:
div = []

for i in npo_df.total:
    if any(p in i for p in diversity):
        div.append(1)
    else:
        div.append(0)
        
npo_df['diversity'] = div
npo_df.diversity.value_counts()

0    2567
1     131
Name: diversity, dtype: int64

In [132]:
edu = []

for i in npo_df.total:
    if any(p in i for p in educational):
        edu.append(1)
    else:
        edu.append(0)
        
npo_df['educational'] = edu
npo_df.educational.value_counts()

0    2582
1     116
Name: educational, dtype: int64

In [133]:
cri = []

for i in npo_df.total:
    if any(p in i for p in crime):
        cri.append(1)
    else:
        cri.append(0)
        
npo_df['crime'] = cri
npo_df.crime.value_counts()

0    2570
1     128
Name: crime, dtype: int64

In [134]:
act = []

for i in npo_df.total:
    if any(p in i for p in action):
        act.append(1)
    else:
        act.append(0)
        
npo_df['action'] = act
npo_df.action.value_counts()

0    2631
1      67
Name: action, dtype: int64

In [135]:
pol = []

for i in npo_df.total:
    if any(p in i for p in politiek):
        pol.append(1)
    else:
        pol.append(0)
        
npo_df['politics'] = pol
npo_df.politics.value_counts()

0    2617
1      81
Name: politics, dtype: int64

In [136]:
tee = []

for i in npo_df.total:
    if any(p in i for p in teen):
        tee.append(1)
    else:
        tee.append(0)
        
npo_df['teen'] = tee
npo_df.teen.value_counts()

0    2675
1      23
Name: teen, dtype: int64

In [137]:
rea = []

for i in npo_df.total:
    if any(p in i for p in reality):
        rea.append(1)
    else:
        rea.append(0)
        
npo_df['reality'] = rea
npo_df.reality.value_counts()

0    2664
1      34
Name: reality, dtype: int64

# The clustering

In [138]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn import cluster

vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, norm='l2')
X = vectorizer.fit_transform(npo_df.final_text.values.astype('str'))
tf_idf = pd.DataFrame(data = X.toarray(), columns=vectorizer.get_feature_names())
tf_idf



Unnamed: 0,aan,aanbieden,aanbod,aandacht,aandeel,aandoening,aangaan,aangenaam,aanhanger,aankaarten,...,zwangerschap,zwart,zwarte,zweden,zweeds,zweet,zwembad,zwemmen,zwitsers,zwolle
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [139]:
vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, norm='l2')
X = vectorizer.fit_transform(npo_df.final_text.values.astype('str'))
tf_idf = pd.DataFrame(data = X.toarray(), columns=vectorizer.get_feature_names())
tf_idf

clusters = 15
kmeanModel = KMeans(n_clusters=clusters, init='k-means++', max_iter=3000, random_state=0)
mod = kmeanModel.fit_transform(tf_idf)
npo_df['k_means'] = kmeanModel.predict(tf_idf)

npo_df.k_means.value_counts()



1     950
10    223
7     158
0     149
11    145
5     143
13    143
12    123
2     118
14    116
8     101
9      98
4      80
3      78
6      73
Name: k_means, dtype: int64

In [140]:
order_centroids = kmeanModel.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
dict = []
for i in range(15):
  print('%d' % i, sep='', end=', '),
  for ind in order_centroids[i, :20]:
    print(terms[ind], sep='', end=', ')
  print('')

0, leven, mens, dood, komen, documentaire, van, de, werk, staan, dagelijks, vrouw, gaan, het, volgen, nieuw, vraag, gesprek, hindoe, jaar, belangrijk, 
1, jaar, maken, komen, the, mens, gaan, bekend, krijgen, max, en, nieuw, het, staan, jan, documentaire, volgen, goed, van, zien, dag, 
2, jong, jaar, volgen, gaan, leven, mens, komen, probleem, leeftijd, talent, vrouw, krijgen, oud, de, documentaire, strijd, prinses, experiment, laten, confronteren, 
3, avontuur, animatie, beleven, vriend, spannend, muis, klein, wonen, jongen, vrolijk, meisje, goed, zappelin, knofje, de, nieuw, pip, beer, joe, kind, 
4, nos, verslag, koningin, voetbal, koning, laat, beatrix, nieuws, journaal, alexander, sport, jaar, ek, samenvatting, willem, live, studio, xima, nationaal, belangrijk, 
5, wereld, de, mens, duiken, van, nemen, bekend, zien, jaar, reizen, goed, laten, kijker, uniek, wetenschap, maken, verhaal, bijzonder, jong, wonder, 
6, klas, filemon, de, gaan, in, lichaam, onderwerp, menselijk, behandel



In [150]:
# sampled dfs because we had too much content

dfs=[]

for i in range(15):
    df_i = npo_df[npo_df['k_means']==i]
    df_s = df_i.sample(50)
    dfs.append(df_s)
    

In [142]:
samp_df = pd.concat(dfs,axis=0)
samp_df.head(2)

Unnamed: 0,titles,descriptions,pictures,ID,spacy,spacy2,total,nature,history,travel,romantic,diversity,educational,final_text,k_means,crime,action,politics,teen,reality
100,Ali B geeft antwoord,Theatervoorstelling. Het publiek mag Ali B het...,https://images.npo.nl/header/2560x1440/217219.jpg,91,"[theatervoorstelling, publiek, ali, b, hemd, l...",theatervoorstelling publiek ali b hemd lijf vr...,ali b geeft antwoord theatervoorstelling publ...,0,0,0,1,0,1,ali b geeft antwoord theatervoorstelling publ...,0,0,0,0,1,0
1537,Leven in de Brouwerij,Bierliefhebber Patrick Stoof neemt vriend Kasp...,https://images.npo.nl/header/2560x1440/leven_i...,1462,"[bierliefhebber, patrick, stoof, nemen, vriend...",bierliefhebber patrick stoof nemen vriend kasp...,leven in de brouwerij bierliefhebber patrick ...,0,0,0,0,0,0,leven in de brouwerij bierliefhebber patrick ...,0,0,0,0,0,0


In [146]:
samp_df.reality.value_counts()

0    741
1      9
Name: reality, dtype: int64

In [148]:
samp_df[['ID','titles','descriptions','pictures','k_means', 'nature','history', 'travel','romantic','diversity','educational','crime', 'action', 'politics', 'teen', 'reality','k_means']].to_csv('sample_npo.csv')