In [16]:
# pdfTextMiner.py
# Python 2.7.6
# For Python 3.x use pdfminer3k module
# This link has useful information on components of the program
# https://euske.github.io/pdfminer/programming.html
# http://denis.papathanasiou.org/posts/2010.08.04.post.html

# need to pip install pdfminer and gensim


''' Important classes to remember
PDFParser - fetches data from pdf file
PDFDocument - stores data parsed by PDFParser
PDFPageInterpreter - processes page contents from PDFDocument
PDFDevice - translates processed information from PDFPageInterpreter to whatever you need
PDFResourceManager - Stores shared resources such as fonts or images used by both PDFPageInterpreter and PDFDevice
LAParams - A layout analyzer returns a LTPage object for each page in the PDF document
PDFPageAggregator - Extract the decive to page aggregator to get LT object elements
'''

' Important classes to remember\nPDFParser - fetches data from pdf file\nPDFDocument - stores data parsed by PDFParser\nPDFPageInterpreter - processes page contents from PDFDocument\nPDFDevice - translates processed information from PDFPageInterpreter to whatever you need\nPDFResourceManager - Stores shared resources such as fonts or images used by both PDFPageInterpreter and PDFDevice\nLAParams - A layout analyzer returns a LTPage object for each page in the PDF document\nPDFPageAggregator - Extract the decive to page aggregator to get LT object elements\n'

In [1]:
import os
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
# From PDFInterpreter import both PDFResourceManager and PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Import this to raise exception whenever text extraction from PDF is not allowed
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from pdfminer.converter import PDFPageAggregator

import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords



In [6]:
''' This is what we are trying to do:
1) Transfer information from PDF file to PDF document object. This is done using parser
2) Get a list of PDF files (papers) from the Papers directory
3) For each PDF file, open and parse the file using PDFParser object
4) Assign the parsed content to PDFDocument object
5) Now the information in this PDFDocumet object has to be processed. For this we need
   PDFPageInterpreter, PDFDevice and PDFResourceManager
6) Finally process the file page by page 
'''

base_path = "C:\Users\Linh B Ngo\Google Drive\CloudLab2\Papers"
 
my_files = [f for f in os.listdir(base_path)]

# depending on operating system, you might have to modify this code to remove system files
my_files.remove('desktop.ini')
print (my_files)

['Active Learning in Performance Analysis.pdf', 'An information infrastructure framework for smart grids leveraging SDN and cloud.pdf', 'ARM Virtualization Performance and Architectural Implications.pdf', 'Brados Declarative Programmable Object Storage.pdf', 'CQSTR - Securing Cross\xacTenant Applications with Cloud Containers.pdf', 'High-Performance ACID via Modular Concurrency Control.pdf', 'JetStream ClusterScale Parallelization of Information Flow Queries.pdf', 'One Bit Flips One Cloud Flops Cross VM Row Hammer Attacks and Privilege Escalation.pdf', 'Paving the Way for NFV Simplifying Middlebox Modifications using StateAlyzr.pdf', 'Reproducible Scientific Computing Environment with Overlay Cloud Architecture - IEEE Conference Publication.pdf', 'SEINA - A Stealthy and Effective Internal Attack in Hadoop Systems.pdf', 'Self-configuring Software-defined Overlay Bypass for Seamless Inter and Intra-cloud Virtual Networking.pdf', 'Split-level IO scheduling.pdf', 'Subways - A Case for Redu

In [7]:
papers = []
papers_title = []

for f in my_files:
    password = ""
    extracted_text = ""    
    papers_title.append(f.split('.pdf', 1)[0].replace(' ','_'))
    
    print(f)
    fp = open(os.path.join(base_path, f), "rb")    
    parser = PDFParser(fp)    
    document = PDFDocument(parser, password)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed    
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Ok now that we have everything to process a pdf document, lets process it page by page
    for page in PDFPage.create_pages(document):
        # As the interpreter processes the page stored in PDFDocument object
        interpreter.process_page(page)
        # The device renders the layout from interpreter
        layout = device.get_result()
        # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                extracted_text += lt_obj.get_text()
    fp.close()
    
    # Some preliminary data cleaning
    extracted_text = extracted_text.lower()
    headless_text = extracted_text.split("abstract", 1)[1].split('introduction', 1)[1]
    final_text = headless_text.rsplit('references', 1)[0].replace('-\n','').replace('\n', ' ').replace(ur'\u201c', ' ').replace(ur'\u201d', ' ').replace(ur'\ufb01', 'fi').replace(ur'\u2022', ' ').replace(ur'\u2013', ' ')
    
    papers.append(final_text)

Active Learning in Performance Analysis.pdf
An information infrastructure framework for smart grids leveraging SDN and cloud.pdf
ARM Virtualization Performance and Architectural Implications.pdf
Brados Declarative Programmable Object Storage.pdf
CQSTR - Securing Cross�Tenant Applications with Cloud Containers.pdf
High-Performance ACID via Modular Concurrency Control.pdf
JetStream ClusterScale Parallelization of Information Flow Queries.pdf
One Bit Flips One Cloud Flops Cross VM Row Hammer Attacks and Privilege Escalation.pdf
Paving the Way for NFV Simplifying Middlebox Modifications using StateAlyzr.pdf
Reproducible Scientific Computing Environment with Overlay Cloud Architecture - IEEE Conference Publication.pdf
SEINA - A Stealthy and Effective Internal Attack in Hadoop Systems.pdf
Self-configuring Software-defined Overlay Bypass for Seamless Inter and Intra-cloud Virtual Networking.pdf
Split-level IO scheduling.pdf
Subways - A Case for Redundant, Inexpensive Data Center Edge Links.pd

In [8]:
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')
COMMONWORDS = ['cid:31','figure','two','cid:27', \
               'cid:29','also','cid:28','cid:30', \
               'cid:25','cid:26','performance','data', \
               'cid:24','cid:21','cid:22','cid:23', 'cid:20']

def clean_text(text):    
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and t not in COMMONWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in papers:
    tokenized_data.append(clean_text(text)) 

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, iterations = 2000)

print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # print("Topic #%s:" % idx, lda_model.print_topic(idx, 25))
    # print without probability
    tmp_topics = ''
    for term in lda_model.get_topic_terms(idx, 25):
        tmp_topics += dictionary.get(term[0]) + '; '
    print('Topic #%s:' % idx, tmp_topics)    

LDA Model:
('Topic #0:', u'cloud; time; state; memory; computing; used; system; using; containers; execution; application; network; epoch; container; set; virtual; code; first; task; use; design; work; dift; applications; analysis; ')
('Topic #1:', u'transactions; transaction; callas; write; file; control; system; order; using; grid; concurrency; runtime; operations; mechanism; schedulers; scheduling; example; information; block; level; different; however; throughput; first; nexus; ')
('Topic #2:', u'memory; row; bit; hammer; attacks; network; one; physical; address; server; bits; page; first; infrastructure; local; set; records; used; packet; power; number; section; addresses; time; type; ')
('Topic #3:', u'task; attack; node; tasks; time; execution; arm; job; hadoop; type; system; xen; hypervisor; running; cluster; defense; map; set; hardware; server; speculative; one; delay; using; cost; ')
('Topic #4:', u'state; packet; network; cloud; container; service; variables; access; overlay

In [9]:
title_idx = 0
for text in papers:
    print('Topics for paper %s: ' % papers_title[title_idx], lda_model.get_document_topics(dictionary.doc2bow(clean_text(text))))
    title_idx += 1

('Topics for paper Active_Learning_in_Performance_Analysis: ', [(0, 0.23146869), (2, 0.659017), (3, 0.109302126)])
('Topics for paper An_information_infrastructure_framework_for_smart_grids_leveraging_SDN_and_cloud: ', [(2, 0.9996127)])
('Topics for paper ARM_Virtualization_Performance_and_Architectural_Implications: ', [(7, 0.99984515)])
('Topics for paper Brados_Declarative_Programmable_Object_Storage: ', [(0, 0.052128315), (8, 0.027605034), (9, 0.920102)])
('Topics for paper CQSTR_-_Securing_Cross\xacTenant_Applications_with_Cloud_Containers: ', [(4, 0.7341224), (9, 0.26572886)])
('Topics for paper High-Performance_ACID_via_Modular_Concurrency_Control: ', [(1, 0.015021807), (6, 0.88405126), (7, 0.10081105)])
('Topics for paper JetStream_ClusterScale_Parallelization_of_Information_Flow_Queries: ', [(0, 0.29185686), (9, 0.7080001)])
('Topics for paper One_Bit_Flips_One_Cloud_Flops_Cross_VM_Row_Hammer_Attacks_and_Privilege_Escalation: ', [(2, 0.9998557)])
('Topics for paper Paving_the_

### Insights:

- This allows us to quickly examine papers without expending human resources
- Additional analysis are needed to determine an ideal number of topics
    - Can we use aggregated keyword counts to inform this number?
- Exploratory analysis with only 10 topics and 16 papers seems to be encouraging. For example, 
Topic 8 seems to focus on security subjects, and is relevant to only two papers (Hammer Attacks and SEINA) that specifically study security.

### Notes:

- Some PDFs are printout from HTML screen rather than direct download. This might impact the quality of pdfminer
- More papers are needed


### Follow-up:
- Explore gensim to develop bigram and trigram vocabulary instead of single-word tokens. For example, "volatile memory" is more meaningful than "volatile" and "memory". 
- Explore parallelism in parsing PDFs to texts. 
- Explore PDFMiner and see if PDFs schematic can be extracted for localized topic models on sections such as introduction only or conclusion only. 


### Notes April 10, 2018

- Topics being pursued by CloudLab users
- Visualize topics
- Identify pocket of researchers working on the same topic

- Expanding number of topics
- Venues of publication

- Mine Google Groups: https://gist.github.com/punchagan/7947337

- For a given project that uses CloudLab, which papers belong to that project. Can it be automated?
