In [54]:
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.nl.stop_words import STOP_WORDS
import pandas as pd
import zipfile 

Loading the spacy model#
import gensim
model_data_path = "../39.zip"

with zipfile.ZipFile(model_data_path, "r") as archive:
    stream = archive.open("model.txt")

# 1. How to load a model from the Nordic Language Processing Laboratory
model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=False, unicode_errors='replace')

nlp = spacy.blank('nl')
keys = []
for idx in range(len(model.vocab)):
    keys.append(model.index2word[idx])
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)
####
doc_test = nlp("De hond is blij")
doc_test2 = nlp("De hond is boos")
print(doc_test.similarity(doc_test2))


0.9267710722495327


In [None]:
file_name = 'aligned_epg_transcriptions_npo1_npo2.csv'
data = pd.read_csv(file_name)
texts = data['text']
nlp2 = spacy.load('nl_core_news_sm')
nlp2.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)

class GraphWord2Vec:
    def __init__(self,d=0.85):
        self.d = d # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight
        self.nlp = nlp2
  
    def set_stopwords(self, stopwords):  
        """Set stop words"""
        for word in STOP_WORDS.union(set(stopwords)):
            lexeme = self.nlp.vocab[word]
            lexeme.is_stop = True  
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(self.nlp(token.text.lower()))
                    else:
                        selected_words.append(token)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        check = set()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word.text not in check:
                    vocab[word] = i
                    i += 1
                    check.add(word.text)
        return vocab
    
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1 in vocab:
            for word2 in vocab:
                i, j = vocab[word1], vocab[word2]
                g[i][j] = word1.similarity(word2)
                #print(f"{word1} {word2} -- {g[i][j]}")
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        
        return g_norm
    
    def get_keywords(self, number=10):
        """Print top number keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        
        for i, (key, value) in enumerate(node_weight.items()):
            print(str(key) + ' - ' + str(value))
            if i > number:
                break
    
    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stop words
        self.set_stopwords(stopwords)
        
        # Parse text by spaCy
        doc = self.nlp(text)
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get normalized matrix
        g = self.get_matrix(vocab)
        
        # Initionlization for weight(pagerank value)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for step in range(self.steps):
            
            pr = (1-self.d) + self.d * np.dot(g, pr)
            diff = sum(abs(previous_pr - pr))

            print(f"step: {step}, diff:{diff}")
            if diff  < self.min_diff:
                break
            else:
                previous_pr = pr

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        self.node_weight = node_weight


In [99]:
gw2v = GraphWord2Vec()
for idx,t in enumerate(texts[0:10]):
    
    gw2v.analyze(t, candidate_pos = ['NOUN', 'PROPN'], lower=False)
    print(f"document id:{idx}")
    gw2v.get_keywords(10)
    print("\n\n\n")

step: 0, diff:71.0
step: 1, diff:0.6209977490269728
step: 2, diff:0.08020954223430021
step: 3, diff:0.011247136082861653
step: 4, diff:0.0016139238664212296
step: 5, diff:0.0002334524550259287
step: 6, diff:3.3859468946739746e-05
step: 7, diff:4.918535672193691e-06
document id:0
samenleving - 1.228980939557856
politiek - 1.22139055727897
democratie - 1.219258265590387
klimaatprobleem - 1.2000731758882277
visie - 1.1705185899195145
vrijheid - 1.1672637688368095
discussie - 1.153302186972872
economie - 1.1516164784093892
verkiezingsprogramma - 1.1487423319478878
debat - 1.1453504116238857
vvd - 1.1393156135673719
groenlinks - 1.1182405806734093




step: 0, diff:66.00000000000001
step: 1, diff:0.7377189109090271
step: 2, diff:0.0893723064386015
step: 3, diff:0.011124584493476197
step: 4, diff:0.0014056200657055484
step: 5, diff:0.00017976503660266374
step: 6, diff:2.3073905870418798e-05
step: 7, diff:2.9658829462375635e-06
document id:1
liefdesgebod - 1.4189526779668489
geforceerdheid - 

In [None]:
gw2v = GraphWord2Vec()
for idx,t in enumerate(texts[0:10]):
    
    gw2v.analyze(t, candidate_pos = ['NOUN', 'PROPN'], lower=False)
    print(f"document id:{idx}")
    gw2v.get_keywords(10)
    print("\n\n\n")

step: 0, diff:71.00000000000001
step: 1, diff:0.4240849229685585
step: 2, diff:0.05084254446470027
step: 3, diff:0.006606511980149943
step: 4, diff:0.0008933470362887386
step: 5, diff:0.0001222998087266891
step: 6, diff:1.676781713078057e-05
step: 7, diff:2.2964990059604062e-06
document id:0
samenleving - 1.2232116195387013
politiek - 1.1951026605986352
visie - 1.1815777429414163
vrijheid - 1.1764776792366156
klimaatprobleem - 1.1710597241262026
economie - 1.1428751599956604
discussie - 1.1384841906190888
idee - 1.1207609095684972
mensen - 1.1078249801339464
hoop - 1.1031467002280264
debat - 1.101061301185581
middelen - 1.091382628525278




step: 0, diff:65.99999999999999
step: 1, diff:0.5533956826180509
step: 2, diff:0.061557920975567226
step: 3, diff:0.007271417789087575
step: 4, diff:0.0008848403239863911
step: 5, diff:0.00010879161721444319
step: 6, diff:1.3406739447496818e-05
step: 7, diff:1.6545773434684108e-06
document id:1
liefdesgebod - 1.4091448277854353
geforceerdheid - 1.2