In [1]:
import os
import sys
import re
import logging

from pprint import pprint
from time import time

import nltk
import numpy as np
import pandas as pd
import PyPDF2 as pyPdf

from bs4 import BeautifulSoup
from gensim import models
from nltk.corpus import stopwords
# Needed only if you want to remove stop words.
# nltk.download()

# Sklearn imports
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Configurable parameters
CATEGORY_FILE = 'categories'
EXPERIMENT_FILE = 'e1'
DATA_DIR = './data'

# Logging level
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging.getLogger()

In [3]:
def read_document(filename):
    """ Read input pdf file and parse text from it 
        TODO : Some words are not space separated 
    """
    if not filename.endswith(".pdf"):
        raise Exception('Input file is not in pdf format !!')
    text = []
    pdf = pyPdf.PdfFileReader(open(filename, "rb"))
    for page in pdf.pages:
        raw_text = page.extractText()
        lines = re.split('\n', raw_text)
    return lines

In [4]:
class CategoryMatcher(object):
    """ Category matcher built from existing categories file.
        Can be used to match categories for terms.
        TODO : Handle variations like phrases, typos etc.
    """
    
    def __init__(self, category_file, data_dir):
        self.category_file = category_file
        self.data_dir = data_dir
        # Dict to store categories for a given term.
        # TODO: Conisder using Prefix Tree instead of Dictionary.
        self.term_categories = {}
        # Dict to store terms for a given category.
        self.category_terms = {}
        self.categories = set([])
        self.is_built = False

    def get_categories(self):
        return self.categories
    
    @staticmethod
    def process_text(text):
        if text is None:
            return text
        # Strip leading and ending whitespace
        text = text.strip()
        # Convert to lower case
        text = text.lower()
        # Remove non-alphabetic characters
        text = re.sub("[^a-zA-Z]", " ", text)
        return text
    
    def add_term_category(self, term, category):
        self.category_terms[category].add(term)
        if term not in self.term_categories:
            self.term_categories[term] = set([category])
        else:
            self.term_categories[term].add(category)
    
    def build(self):
        """ Parses the category file and builds a set of categories for each term.
        
            Assumes following format for category file:
            Category1,Word1,Word2 ...
            Category2,Word1,Word2 ...
            .
            .
            .
            
            NOTE: One term can have multiple categories
        """
        with open(self.category_file, 'rb') as f:
            for line in f:
                line = line.strip()
                # Skip blank lines
                if len(line) == 0:
                    continue
                # Skip comments
                if line.startswith('#'):
                    continue
                    
                terms = line.split(',')
                category = self.process_text(terms[0])
                if category not in self.categories:
                    self.categories.add(category)
                    self.category_terms[category] = set([])
                for term in terms[1:]:
                    term = self.process_text(term)
                    self.add_term_category(term, category)
        self.is_built = True
        
    def get_term(self, words, index):
        """ Returns longest n-gram starting at given index, 
            which is present in currently seen terms
        """
        num_words = len(words)
        # Computer gram, bigram and trigram
        gram, bigram, trigram = self.process_text(words[index]), None, None
        if (index + 1) < num_words:
            bigram = ' '.join([gram, self.process_text(words[index + 1])])
        if (index + 2) < num_words:
            trigram = ' '.join([bigram, self.process_text(words[index + 2])])
        
        # Return largest matching n-gram
        if trigram in self.term_categories:
            return trigram
        elif bigram in self.term_categories:
            return bigram
        else:
            return gram
        
    def get_terms(self, line):
        terms = []
        words = line.split()
        i = 0
        while i < len(words):
#             print('Calling get term for index: %d out of %d' %(i, len(words)))
            term = self.get_term(words, i)
            i += len(term.split())
            terms.append(term)
        return terms
    
    def get_representative_category(self, terms):
        category_count = {}
        for category in self.categories:
            category_count[category] = 0
        for term in terms:
            if term not in self.term_categories:
                continue
            for category in self.term_categories[term]:
                category_count[category] += get_tf_idf(category, term)
        return max(category_count, key=category_count.get)
    
    def process_paragraph(self, lines):
        # TODO: Figure out appropriate context window length
        # for finding representative category. Currently we 
        # are using sentences.
        print('\n\n\n\nProcessing paragraph: ', lines)        
        for line in lines:
            print('\nProcessing Line: ', line)
            terms = self.get_terms(line)
            print(len(terms), terms)
            print('Calling get representative category for: ', terms)
            category = self.get_representative_category(terms)
            print('Representative category is: ', category)
            for term in terms:
                print('Adding term category: ', term, category)
                self.add_term_category(term, category)
                
    def update(self):
        for category_dir in os.listdir(self.data_dir):
            if category_dir != 'TriaxialLabData':
                continue
            logger.info("Processing Category: {0}".format(category_dir))
            documents = os.listdir(os.path.join(self.data_dir, category_dir))
            for document in documents:
                if not document.endswith(".pdf"):
                    print(document, ' is not a pdf. Skipping')
                    continue
                logger.info("Processing Document: {0}".format(document))
                lines = read_document(os.path.join(self.data_dir, category_dir, document))
                paragraph_length = 4
                num_paragraphs = len(lines)/paragraph_length
                for i in range(num_paragraphs):
                    start = i * paragraph_length
                    end = min((i+1) * paragraph_length, len(lines) + 1)
                    self.process_paragraph(lines[start:end])
                
    def get_term_categories(self, term):
        """ Returns set of categories for given term
        Attributes
            term -- Word for which we want to retrieve categories.
        Returns
            Set of categories given term belongs to.
        """
        if not self.is_built:
            raise Exception('Category Matcher not built !!')
        term = self.process_text(term)
        if term in self.term_categories:
            return self.term_categories[term]
        # Return empty set if term categories are not present.
        logger.debug('"{0}" not present. Returning empty set'.format(term))
        return set([])
    
    def get_tf(self, category, word):
        tf = 0
        total = 0
        for term in self.category_terms[category]:
            for term_word in term.split(' '):
                total += 1
                if word == term_word:
                    tf += 1
        return tf/total
    
    def get_idf(self, term):
        num_term_categories = 0
        # Term contains a single word
        if len(term.split(' ')) == 1: 
            for category in self.categories:
                is_present = False
                for term in self.category_terms[category]:
                    for term_word in term.split(' '):
                        if word == term_word:
                            num_term_categories += 1
                            is_present = True
                            break
                    if (is_present):
                        break
        else: # Term contains multiple words
            num_term_categories = self.get_term_categories(term)
            
        if num_term_categories == 0:
            raise Exception('IDF called for a term not present in Category Matcher')
        num_categories = len(self.categories)
        return np.log(num_categories / num_term_categories)
    
    def get_tf_idf(self, category, word):
        return self.get_tf(category, word) * self.get_idf() 
        
    def get_category_vector(self, term):
        """ Returns a category vector representation for give term.
        Attributes
            term -- Word for which we want to category vector
        Returns
            Vector correponding to categories of given term.
        """
        term_categories = self.get_term_categories(term)
        category_vector = [category in term_categories for category in sorted(self.categories)]
        return np.array(category_vector, dtype=int)

In [5]:
class CategoryPredictor(object):
    """ Categorry Predictor for documents. Learnt from training data """
    # TODO: Retain high level structure
    def __init__(self, category_file, data_dir):
        self.category_matcher_ = CategoryMatcher(category_file, data_dir)
        self.category_matcher_.build()
        self.raw_labels_ = list(self.category_matcher_.get_categories())
        self.label_encoder_ = preprocessing.LabelEncoder()
        self.data_dir_ = data_dir
        self.X_train_ = []
        self.y_train_ = []
        
    @staticmethod
    def read_document(filename):
        """ Read input pdf file and parse text from it 
            TODO : Some words are not space separated 
        """
        if not filename.endswith(".pdf"):
            raise Exception('Input file is not in pdf format !!')
        text = []
        pdf = pyPdf.PdfFileReader(open(filename, "rb"))
        for page in pdf.pages:
            raw_text = page.extractText()
            raw_text = re.split('\W+', raw_text)
            clean_text = [word.strip() for word in raw_text]
            text = text + clean_text
        # text = list(text)
        return ' '.join(text)

    @staticmethod
    def preprocess(text, remove_stopwords = True):
        """ Function to convert raw text to a sequence of words,
            optionally removing stop words. Returns a list of words.
        """
        text = text.strip()
        if len(text) == 0:
            return text
        
        # 1. Remove HTML
        text = BeautifulSoup(text).get_text()
          
        # 2. Remove \n and non-digit characters.
        # TODO: Revisit this to see what characters should be kept.
        # Probably numbers, -, _ etc
        text = re.sub(r"\\n"," ", text)
        text = re.sub("[^a-zA-Z]", " ", text)
        
        # 3. Convert words to lower case and split them
        words = text.lower().split()
        
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words('english'))
            words = [w for w in words if not w in stops]
        
        # 5. Join the words back into one string separated by space, 
        # and return the result.
        return (" ".join(words))
    
    def prepare_data(self):
        self.label_encoder_.fit(self.raw_labels_)
        logger.debug('Training Category Predictor for following categories:\n{0}'
                     .format('\t'.join(self.raw_labels_)))
        
        for category_dir in os.listdir(self.data_dir_):
            logger.info("Processing Category: {0}".format(category_dir))
            documents = os.listdir(os.path.join(self.data_dir_, category_dir))
            for document in documents:
                raw_text = self.read_document(os.path.join(self.data_dir_, category_dir, document))
                # This might not be required if CountVectorizer provides 
                # all required processing capabilities
                clean_text = self.preprocess(raw_text)
                self.X_train_.append(clean_text)
                self.y_train_.append(self.label_encoder_.transform([category_dir])[0])
                
    def get_document_vector(self, document_dir, document):
        raw_text = self.read_document(os.path.join(self.data_dir_, category_dir, document))
        # Consider using count vectorizer for this.
        clean_text = self.preprocess(raw_text)
                
    def train_model(self):
        pipeline = Pipeline([
            ('vect', TfidfVectorizer(analyzer='word', stop_words='english',
                                     strip_accents='unicode', ngram_range=(1,2))),
            ('clf', SGDClassifier(penalty='l2'))
        ])

        # Uncommenting more parameters will give better exploring power but will
        # increase processing time in a combinatorial way
        parameters = {
            'vect__max_df': (0.5, 0.75, 1.0),
            #'vect__max_features': (None, 5000, 10000, 50000),
            #'tfidf__use_idf': (True, False),
            #'tfidf__norm': ('l1', 'l2'),
            'clf__alpha': (0.00001, 0.000001),
#             'clf__penalty': ('l2', 'elasticnet'),
            #'clf__n_iter': (10, 50, 80),
        }
                
        # Find the best parameters for both the feature extraction and the
        # classifier
        grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0)

        print("Performing grid search...")
        print("pipeline:", [name for name, _ in pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(self.X_train_, self.y_train_)
        print("done in %0.3fs" % (time() - t0))
        print()

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [6]:
if __name__ == '__main__':
    category_matcher = CategoryMatcher(CATEGORY_FILE, DATA_DIR)
    category_matcher.build()
#     with open(EXPERIMENT_FILE, 'rb') as f:
#         for line in f:
#             for term in line.split(' '):
#                 categories = category_matcher.get_term_categories(term)
#                 logger.debug(term, len(categories))
#     category_predictor = CategoryPredictor(CATEGORY_FILE, DATA_DIR)
#     category_predictor.prepare_data()

In [7]:
print(category_matcher.term_categories)

{'centrifuge': set(['centrifuge']), 'force': set(['centrifuge', 'earthquake']), 'pressure': set(['centrifuge']), 'magnitude': set(['earthquake']), 'rotator': set(['centrifuge']), 'centrifuge pump': set(['centrifuge']), 'richter': set(['earthquake']), 'earthquake': set(['earthquake'])}


In [None]:
category_matcher.update()

INFO:root:Processing Category: TriaxialLabData
('Device Configuration File- Version 1.txt', ' is not a pdf. Skipping')
('Experimental Methods.docx', ' is not a pdf. Skipping')
INFO:root:Processing Document: Experimental Methods.pdf
('\n\n\n\nProcessing paragraph: ', [u'11  The back pressure is controlled by ', u'the computer and kept constant at the initial value after ', u'consolidation, which is usually 100 kPa. Note that shear is started under drained loading ', u''])
('\nProcessing Line: ', u'11  The back pressure is controlled by ')


In [None]:
category_matcher.update()

In [None]:
category_matcher.category_terms

In [None]:
category_matcher.term_categories