In [3]:
from summa import keywords,summarizer
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

file_name = 'aligned_epg_transcriptions_npo1_npo2.csv'
data = pd.read_csv(file_name)
texts = data['text']



In [6]:

from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('nl_core_news_sm')

class TextRank4Keyword():
    """Extract keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm

    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break
        
        
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Parse text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):           
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if sum(abs(previous_pr - pr))  < self.min_diff:
                break
            else:
                previous_pr = pr

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight


In [8]:
tr4w = TextRank4Keyword()
for idx,t in enumerate(texts[0:10]):
    
    tr4w.analyze(t, candidate_pos = ['NOUN', 'PROPN'], window_size=4, lower=False)
    print(f"document id:{idx}")
    tr4w.get_keywords(10)
    print("\n\n\n")


document id:0
mensen - 2.036226432882566
GroenLinks - 1.8395260089859353
bedrijven - 1.7209105644826668
middenklasse - 1.6209761868850232
baas - 1.5726853901780742
samenleving - 1.5585820120615836
klimaat - 1.4807793676022936
inkomens - 1.4128768160593093
partijen - 1.3766265013674306
idee - 1.255540175650657
Nederland - 1.255540175650657
keer - 1.2409463414947313




document id:1
mensen - 3.050191290357387
vrede - 2.4279536024742896
betekenis - 1.773446162451204
tijd - 1.6354075961692502
ouders - 1.447041264329045
gevoel - 1.3375352590335101
gemeenschap - 1.2981882732854677
taal - 1.2790424572838086
doosje - 1.2632770096957922
status - 1.2632770096957917
Munsterkerk - 1.2501362963519203
meter - 1.1270074668149355




document id:2
euro - 1.0
klimaatwet - 1.0




document id:3
mensen - 4.559352417677301
agenten - 3.334064489495194
hotel - 3.0261007784144547
terroristen - 2.980256936729628
Turken - 2.7879815105371315
bodycams - 2.628487183191716
Sri - 2.3183859555114106
Noord - 2.25005

In [13]:
texts[8]

"'Maar als je dan verplaatst in het', 'gevoel van Veda, begrijp je haar.' 'Dat maakt het niet makkelijker.', 'Maar ik begrijp haar wel.' 'Veda geeft zelf ook te kennen', 'dat ze ermee worstelt...' 'met het proces van doodgaan.' 'Ik ben gewoon heel bang', 'dat als ik doodga...' '...dat ik me dan misschien', 'ergens nog bewust ben van...' 'wat er gebeurt of... dat ik het', verdriet van m'n familie meekrijg. 'Of dat ik eigenlijk', 'sowieso nog iets voel of iets...' 'Ehm... ja...' 'Dan zou het hele doel van niet meer', 'leven zou dan eigenlijk weg zijn.' 'En dan zou ik voor niks', 'iedereen pijn hebben gedaan.' 'Moeilijk.' 'Ja, maar ook kraakhelder, he!' 'Hier valt niets', 'niet aan te begrijpen!' 'Zij voert het woord', 'namens een grote doelgroep...' 'en dat vind ik heel erg knap.' 'Hoe groot is deze doelgroep?', 'De cijfers ken ik niet.' 'Maar ik kom veel in klinieken', 'waar wij filmen...' 'en we zien veel jongeren', 'die worstelen met het leven.' 'Met de druk die ze ervaren, welke', 's