In [17]:
import os
import sys
import glob
import re
import string
from operator import itemgetter

import numpy as np

import pandas as pd
from nltk.corpus import wordnet as wn

from tqdm import tqdm
import bs4

import tika
from tika import parser

from mlutil.parallel import mapp
from mlutil.topic_modeling import top_topic_words, topic_coherence
import mlutil.parallel as parallel

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.summarization import keywords as textrank_keywords, summarize as textrank_summarize

import langdetect

## Data path

Change this to point to your documents folder

In [2]:
data_path = '/home/kuba/Downloads/książki_nauka'

In [3]:
tika.initVM()

In [4]:
def get_article_name(path):
    return os.path.relpath(path, data_path)


def clear_markup(xml_content):
    return bs4.BeautifulSoup(xml_content, 'lxml').get_text()


def clean_text(text):
    return re.sub('\W+', ' ', re.sub('\d+', ' ', text).lower())


def truncate_pages(xml_content, select_pages=5):
    pages = xml_content.split('<div class="page">', max(select_pages))
    selected_pages = [pages[i] for i in select_pages]    
    return ' '.join(selected_pages)


def extract_pdf_text(pdf_path, long_document_threshold=50, short_document_pages=5, long_document_pages=range(5, 10)):
    try:
        parsed = tika.parser.from_file(pdf_path, xmlContent=True)
        num_pages = int(parsed['metadata']['xmpTPg:NPages'])
        if num_pages > long_document_threshold:
            seleted_content = truncate_pages(parsed['content'], long_document_pages)
        else:
            seleted_content = truncate_pages(parsed['content'], range(short_document_pages))
        return clean_text(clear_markup(seleted_content))
    except:
        return None

    
def textrank_summarize_with_fallback(text):
    try:
        summary = textrank_summarize(text, ratio=0.5)
    except:
        print('WARNING: article with invalid text')
        summary = ''
    return summary


def is_in_wn(word):
    try:
        synsets = list(wn.synsets(word))
    except:
        return False
    return len(synsets) > 0


def legal_words(text):
    for w in text.split():
        if is_in_wn(w):
            yield w


def illegal_vectorizer_words(texts):
    # use these as stopwords for actual vectorizer
    base_vectorizer = TfidfVectorizer()
    base_vectorizer.fit(texts)
    return frozenset(w for w in base_vectorizer.vocabulary_.keys() if not is_in_wn(w))


def get_topic_coherences(keywords_per_topic, n_used_top_keywords=10):
    n_topics = keywords_per_topic.shape[0]
    return pd.Series([
        topic_coherence(keywords.values, n_top_keywords=n_used_top_keywords)
        for (__, keywords) in tqdm(keywords_per_topic.iterrows(), total=n_topics)
    ])


def get_most_representative_article_names(topic_index, article_names, topic_features):
    topic_scores = topic_features[:,topic_index]
    sorted_topic_scores = sorted(enumerate(topic_scores), key=itemgetter(1), reverse=True)
    most_representative_articles_indices = list(map(itemgetter(0), sorted_topic_scores))
    return [(article_names[i], topic_score) for (i, topic_score) in sorted_topic_scores]

In [5]:
pdf_paths = glob.glob(os.path.join(data_path, '**', '*.pdf'), recursive=True)
len(pdf_paths)

705

In [6]:
%%time
texts = list(mapp(extract_pdf_text, pdf_paths))

2019-05-02 12:09:25,908 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:26,014 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:26,001 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:26,281 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:30,955 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:31,046 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:31,066 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:31,326 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:35,978 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:36,072 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
2019-05-02 12:09:36,086 [MainThread  ] [WARNI]  Fa

CPU times: user 844 ms, sys: 388 ms, total: 1.23 s
Wall time: 9min 58s


In [7]:
legal_texts_with_filenames = [
    (get_article_name(path), text)
    for (path, text) in tqdm(zip(pdf_paths, texts), total=len(texts))
    if text is not None and len(text) > 100 and langdetect.detect(text[:100]) == 'en'
]

100%|██████████| 705/705 [00:07<00:00, 112.07it/s]


In [8]:
legal_texts = [text for (__, text) in legal_texts_with_filenames]
legal_filenames = [filename for (filename, __) in legal_texts_with_filenames]

In [9]:
len(legal_texts)

492

## Vectorize text

We'll use TF-IDF features (they work best for NMF).
Tokens that aren't in WordNet are deleted so that:
- we throw out some nonsensical tokens
- we keep only words that can be used to evaluate topic coherence using WordNet based similarity

In [23]:
illegal_words = illegal_vectorizer_words(legal_texts)
stop_words = illegal_words.union(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=stop_words)
count_vectorizer = CountVectorizer(max_features=5000, stop_words=stop_words)
text_tfidf_vectors = tfidf_vectorizer.fit_transform(legal_texts)
text_count_vectors = count_vectorizer.fit_transform(legal_texts)

feature_names = tfidf_vectorizer.get_feature_names()

n_samples, n_features = text_tfidf_vectors.shape
n_topics = 10

In [11]:
print("n_samples=%d and n_features=%d..." % (n_samples, n_features))

n_samples=492 and n_features=5000...


### Topic modeling

### NMF model (Frobenius norm) with tf-idf features 

In [12]:
%%time
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)

nmf_features = nmf.fit_transform(text_tfidf_vectors)

CPU times: user 2.66 s, sys: 1.54 s, total: 4.19 s
Wall time: 2.18 s


In [13]:
nmf_keywords_per_topic = top_topic_words(nmf, feature_names, 100)
display(nmf_keywords_per_topic.iloc[:,:10])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
topic_0,data,model,probability,distribution,algorithm,log,figure,set,time,learning
topic_1,let,theorem,proof,set,space,lemma,spaces,finite,definition,exercise
topic_2,type,list,java,string,code,class,function,object,chapter,method
topic_3,word,words,language,sentence,document,al,sentences,corpus,text,semantic
topic_4,learning,neural,networks,training,layer,network,input,hidden,deep,gradient
topic_5,michael,department,science,computer,image,dictionary,patch,sparse,patches,pursuit
topic_6,spark,data,apache,cluster,streaming,mahout,python,user,model,https
topic_7,category,categories,set,object,objects,isomorphism,limits,diagram,arrows,arrow
topic_8,quantum,classical,state,learning,mechanics,doi,unitary,states,machine,gates
topic_9,matrix,vector,matrices,factorization,gradient,convex,vectors,norm,function,rn


### LDA model

In [24]:
%%time
lda = LatentDirichletAllocation(n_components=n_topics, random_state=1, n_jobs=-1, max_iter=10)

lda_features = lda.fit_transform(text_count_vectors)

CPU times: user 1.24 s, sys: 291 ms, total: 1.53 s
Wall time: 1min 59s


In [25]:
lda_keywords_per_topic = top_topic_words(lda, feature_names, 100)
display(lda_keywords_per_topic.iloc[:,:10])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
topic_0,ideas,things,time,idea,way,make,like,does,mind,terms
topic_1,function,list,type,value,let,functions,programming,use,using,example
topic_2,set,type,category,function,theory,object,definition,proof,logic,objects
topic_3,image,selection,al,images,theory,computer,vision,shape,species,evolutionary
topic_4,data,learning,word,al,model,based,words,information,language,used
topic_5,probability,problem,quantum,algorithm,random,log,function,graph,information,figure
topic_6,data,model,matrix,function,learning,set,using,training,linear,algorithm
topic_7,let,set,theorem,space,proof,lemma,sets,finite,map,spaces
topic_8,class,code,method,string,java,new,use,example,data,object
topic_9,time,figure,function,tree,signal,point,space,section,problem,case


## Topic coherence

NMF

In [14]:
%%time
nmf_topic_coherences = get_topic_coherences(nmf_keywords_per_topic)

print('MEAN COHERENCE')
print(nmf_topic_coherences.mean())

100%|██████████| 10/10 [03:10<00:00, 19.16s/it]

MEAN COHERENCE
1.1596969944474094
CPU times: user 3min 6s, sys: 3.62 s, total: 3min 10s
Wall time: 3min 10s





In [15]:
nmf_topic_coherences.describe()

count    10.000000
mean      1.159697
std       0.532871
min       0.597904
25%       0.781418
50%       0.992176
75%       1.386998
max       2.304092
dtype: float64

LDA

In [26]:
%%time
lda_topic_coherences = get_topic_coherences(lda_keywords_per_topic)

print('MEAN COHERENCE')
print(lda_topic_coherences.mean())

100%|██████████| 10/10 [03:32<00:00, 21.22s/it]

MEAN COHERENCE
1.1696035203464452
CPU times: user 3min 27s, sys: 4.56 s, total: 3min 31s
Wall time: 3min 32s





In [27]:
lda_topic_coherences.describe()

count    10.000000
mean      1.169604
std       0.264538
min       0.732235
25%       0.998660
50%       1.150736
75%       1.351127
max       1.577204
dtype: float64

One topic seems to contain machine learning keywords.
Let's check what documents are on ML according to topic model:

In [28]:
ml_topic_index = 4
get_most_representative_article_names(ml_topic_index, legal_filenames, lda_features)[:25]

[('ml/nlp text mining/Recent Trends in Deep Learning Based.pdf',
  0.9999021155265143),
 ('ml/podsumowania/Query Focused Abstractive Summarization.pdf',
  0.9996339005057867),
 ('ml/nlp text mining/Neural Information Retrieval A Literature Review.pdf',
  0.9888375239467582),
 ('ml/podsumowania/Exploring Content Models for Multi-Document Summarization (KL-Sum).pdf',
  0.9760659406296509),
 ('ml/nlp text mining/dl4nlp/cs224n-2017-notes6.pdf', 0.9760568556297047),
 ('ml/podsumowania/The Impact of Frequency on Summarization (SumBasic).pdf',
  0.9752691553055433),
 ('ml/nlp text mining/Automatic Evaluation of Topic Coherence.pdf',
  0.9564416384636428),
 ('ml/audio/One Deep Music Representation to Rule Them All.pdf',
  0.9398165724906762),
 ('ml/podsumowania/Centroid-based Text Summarization through Compositionality of Word Embeddings.pdf',
  0.9308322064644541),
 ('ml/nlp text mining/artykuły/A survey of named entity recognition and classification .pdf',
  0.9189643287685103),
 ('ml/nlp te

In [29]:
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()

In [32]:
%%time
pyLDAvis.sklearn.prepare(lda, text_count_vectors, count_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
