In [1]:
import os
import sys
import glob
import re
import string
from operator import itemgetter

import numpy as np

import pandas as pd
from nltk.corpus import wordnet as wn

from tqdm import tqdm
import bs4

import tika
from tika import parser

from mlutil.parallel import mapp
from mlutil.topic_modeling import top_topic_words, topic_coherence
import mlutil.parallel as parallel

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from gensim.summarization import keywords as textrank_keywords, summarize as textrank_summarize

import langdetect

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


## Data path

Change this to point to your documents folder

In [2]:
data_path = '/home/kuba/Downloads/książki_nauka'

In [3]:
tika.initVM()

In [5]:
def get_article_name(path):
    return os.path.relpath(path, data_path)



def clear_markup(xml_content):
    return bs4.BeautifulSoup(xml_content, 'lxml').get_text()


def clean_text(text):
    return re.sub('\W+', ' ', re.sub('\d+', ' ', text).lower())


def truncate_pages(xml_content, select_pages=5):
    pages = xml_content.split('<div class="page">', max(select_pages))
    selected_pages = [pages[i] for i in select_pages]    
    return ' '.join(selected_pages)


def extract_pdf_text(pdf_path, long_document_threshold=50, short_document_pages=5, long_document_pages=range(5, 10)):
    try:
        parsed = tika.parser.from_file(pdf_path, xmlContent=True)
        num_pages = int(parsed['metadata']['xmpTPg:NPages'])
        if num_pages > long_document_threshold:
            seleted_content = truncate_pages(parsed['content'], long_document_pages)
        else:
            seleted_content = truncate_pages(parsed['content'], range(short_document_pages))
        return clean_text(clear_markup(seleted_content))
    except:
        return None

    
def textrank_summarize_with_fallback(text):
    try:
        summary = textrank_summarize(text, ratio=0.5)
    except:
        print('WARNING: article with invalid text')
        summary = ''
    return summary


def is_in_wn(word):
    try:
        synsets = list(wn.synsets(word))
    except:
        return False
    return len(synsets) > 0


def legal_words(text):
    for w in text.split():
        if is_in_wn(w):
            yield w


def illegal_vectorizer_words(texts):
    # use these as stopwords for actual vectorizer
    base_vectorizer = TfidfVectorizer()
    base_vectorizer.fit(texts)
    return frozenset(w for w in base_vectorizer.vocabulary_.keys() if not is_in_wn(w))


def get_most_representative_article_names(topic_index, article_names, topic_features):
    topic_scores = topic_features[:,topic_index]
    sorted_topic_scores = sorted(enumerate(topic_scores), key=itemgetter(1), reverse=True)
    most_representative_articles_indices = list(map(itemgetter(0), sorted_topic_scores))
    return [(article_names[i], topic_score) for (i, topic_score) in sorted_topic_scores]

In [6]:
pdf_paths = glob.glob(os.path.join(data_path, '**', '*.pdf'), recursive=True)
len(pdf_paths)

705

In [7]:
%%time
texts = list(mapp(extract_pdf_text, pdf_paths))

2019-05-01 14:14:13,890 [MainThread  ] [WARNI]  Tika server returned status: 422
2019-05-01 14:22:02,390 [MainThread  ] [WARNI]  Tika server returned status: 422
2019-05-01 14:22:17,961 [MainThread  ] [WARNI]  Tika server returned status: 422


CPU times: user 797 ms, sys: 468 ms, total: 1.27 s
Wall time: 9min 20s


In [8]:

legal_texts_with_filenames = [
    (get_article_name(path), text)
    for (path, text) in tqdm(zip(pdf_paths, texts), total=len(texts))
    if text is not None and len(text) > 100 and langdetect.detect(text[:100]) == 'en'
]

100%|██████████| 705/705 [00:08<00:00, 86.55it/s] 


In [9]:
legal_texts = [text for (__, text) in legal_texts_with_filenames]
legal_filenames = [filename for (filename, __) in legal_texts_with_filenames]

## Vectorize text

We'll use TF-IDF features (they work best for NMF).
Tokens that aren't in WordNet are deleted so that:
- we throw out some nonsensical tokens
- we keep only words that can be used to evaluate topic coherence using WordNet based similarity

In [10]:
illegal_words = illegal_vectorizer_words(legal_texts)
stop_words = illegal_words.union(sklearn.feature_extraction.text.ENGLISH_STOP_WORDS)
vectorizer = TfidfVectorizer(max_features=2500, stop_words=stop_words)
text_vectors = vectorizer.fit_transform(legal_texts)

n_samples, n_features = text_vectors.shape
n_topics = 10

### Topic modeling

We'll use NMF model (Frobenius norm) with tf-idf features 

In [11]:
%%time
print("n_samples=%d and n_features=%d..." % (n_samples, n_features))
nmf = NMF(n_components=n_topics, random_state=1,cd 
          alpha=.1, l1_ratio=.5)

nmf_features = nmf.fit_transform(text_vectors)

feature_names = vectorizer.get_feature_names()

n_samples=498 and n_features=2500...
CPU times: user 1.93 s, sys: 1.1 s, total: 3.03 s
Wall time: 1.6 s


In [12]:
nmf_keywords_per_topic = top_topic_words(nmf, feature_names, 100)
display(nmf_keywords_per_topic.iloc[:,:10])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
topic_0,data,matrix,model,algorithm,probability,function,distribution,log,figure,using
topic_1,let,theorem,proof,set,space,lemma,spaces,finite,definition,proposition
topic_2,type,list,java,string,code,class,function,chapter,object,program
topic_3,word,words,language,sentence,document,al,corpus,sentences,text,topic
topic_4,learning,training,neural,networks,layer,network,deep,input,hidden,model
topic_5,michael,department,science,computer,image,dictionary,sparse,patch,patches,min
topic_6,spark,data,apache,cluster,python,model,user,https,import,docs
topic_7,things,world,god,mind,philosophy,life,human,ideas,man,time
topic_8,category,set,categories,object,objects,isomorphism,limits,arrows,arrow,diagram
topic_9,quantum,classical,state,mechanics,learning,doi,unitary,states,machine,gate


## Topic coherence

In [13]:
%%time
topic_coherences = [topic_coherence(keywords.values, n_top_keywords=10) for (__, keywords) in tqdm(nmf_keywords_per_topic.iterrows(), total=n_topics)]

print('MEAN COHERENCE')
print(sum(topic_coherences) / n_topics)

100%|██████████| 10/10 [03:23<00:00, 20.94s/it]

MEAN COHERENCE
1.1402957764788306
CPU times: user 3min 18s, sys: 3.94 s, total: 3min 22s
Wall time: 3min 23s





In [14]:
pd.Series(topic_coherences).describe()

count    10.000000
mean      1.140296
std       0.506678
min       0.665035
25%       0.835518
50%       1.023436
75%       1.152266
max       2.304092
dtype: float64

One topic seems to contain machine learning keywords.
Let's check what documents are on ML according to topic model:

In [15]:
ml_topic_index = 4
get_most_representative_article_names(ml_topic_index, legal_filenames, nmf_features)[:25]

[('ml/Machine learning cheat sheet.pdf', 0.3609000427699512),
 ('ml/teoria/slt/Elements of Statistical Learning.pdf', 0.3607585551283273),
 ('ml/teoria/Bayesian Reasoning and Machine Learning.pdf', 0.3466737472842171),
 ('ml/artykuły/Determinantal point processes for machine learning.pdf',
  0.30551959916850985),
 ('dsp itd/Natural Image Statistics.pdf', 0.30156890619324384),
 ('ml/teoria/Generalized Principal Component Analysis.pdf',
  0.28624916359073466),
 ('ml/teoria/slt/An Introduction to Statistical learning theory.pdf',
  0.2858934308293298),
 ('ml/teoria/slt/Statistical Learning With Sparsity.pdf', 0.282509741028948),
 ('ml/teoria/algo/Randomized methods for computing low-rank approximations of matrices.pdf',
  0.2817193620861067),
 ('ml/szeregi czasowe/Time Series Analysis and Its Applications With R Examples.pdf',
  0.2803886556281992),
 ('dsp itd/cv/Computer vision_ A modern approach.pdf', 0.2794899468763625),
 ('ml/teoria/glrm.pdf', 0.27845025628927944),
 ('ml/sparse/Sparse