In [89]:
import os
import sys
import re
import logging

from pprint import pprint
from time import time

import nltk
import numpy as np
import pandas as pd
import PyPDF2 as pyPdf

from bs4 import BeautifulSoup
from gensim import models
from nltk.corpus import stopwords
# Needed only if you want to remove stop words.
# nltk.download()

# Sklearn imports
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [90]:
# Root logger: emit everything from DEBUG upwards and write it to stdout.
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logger = logging.getLogger()

# Tunable inputs used by the cells below.
DATA_DIR = './data'          # handed to CategoryPredictor as its data directory
CATEGORY_FILE = 'categories'  # category file parsed by CategoryMatcher
EXPERIMENT_FILE = 'e1'       # file whose terms main() looks up

In [91]:
class CategoryMatcher(object):
    """ Category matcher built from existing categories file.
        Can be used to match categories for terms.
        TODO : Handle variations like phrases, upper/lower case, typos etc.
    """

    def __init__(self, category_file):
        """ Remembers the category file; nothing is parsed until build() runs.
        Attributes
            category_file -- Path of the category file to parse.
        """
        self.category_file = category_file
        # Maps each term to the set of categories it was listed under.
        self.term_categories = {}
        # Every category name found in the file.
        self.categories = set()
        self.is_built = False

    def get_categories(self):
        """ Returns the set of category names collected so far
            (empty until build() has been called).
        """
        return self.categories

    def build(self):
        """ Parses the category file and builds a set of categories for each term.

            Assumes following format for category file:
            Category1 Word1 Word2 ...
            Category2 Word1 Word2 ...
            .
            .
            .

            Tokens may be separated by any run of whitespace; blank lines
            are ignored.

            NOTE: One term can have multiple categories
        """
        # Text mode so that lines are str on both Python 2 and Python 3.
        with open(self.category_file, 'r') as f:
            for line in f:
                # split() without an argument drops the trailing newline and
                # never yields empty tokens.
                tokens = line.split()
                if not tokens:
                    continue
                category = tokens[0]
                self.categories.add(category)
                # Everything after the category name is a term.
                for term in tokens[1:]:
                    if term not in self.term_categories:
                        logger.debug('Term not present in dict: {0}'.format(term))
                        self.term_categories[term] = set([category])
                    else:
                        logger.debug('Adding new category for term: {0}'.format(term))
                        self.term_categories[term].add(category)
        self.is_built = True

    def get_term_categories(self, term):
        """ Returns set of categories for given term
        Attributes
            term -- Word for which we want to retrieve categories.
        Returns
            Set of categories given term belongs to (empty set if unknown).
        Raises
            Exception if build() has not been called yet.
        """
        if not self.is_built:
            raise Exception('Category Matcher not built !!')
        if term in self.term_categories:
            return self.term_categories[term]
        # Return empty set if term categories are not present.
        logger.debug('"{0}" not present. Returning empty set'.format(term))
        return set()

In [92]:
class CategoryPredictor(object):
    """ Category predictor for documents, learnt from training data.

        The data directory is expected to hold one sub-directory per category,
        named exactly like a category of the category file, each containing the
        PDF documents that belong to that category.
    """

    def __init__(self, category_file, data_dir):
        """ Builds the category matcher and prepares empty training buffers.
        Attributes
            category_file -- Path of the category file (see CategoryMatcher).
            data_dir -- Directory holding one sub-directory of PDFs per category.
        """
        self.category_matcher_ = CategoryMatcher(category_file)
        self.category_matcher_.build()
        # Sorted only to make the logged label list deterministic.
        self.raw_labels_ = sorted(self.category_matcher_.get_categories())
        self.label_encoder_ = preprocessing.LabelEncoder()
        self.data_dir_ = data_dir
        self.X_train_ = []
        self.y_train_ = []
        # Populated by train_model().
        self.grid_search_ = None
        self.model_ = None

    @staticmethod
    def read_document(filename):
        """ Read input pdf file and parse text from it
            TODO : Some words are not space separated
        Attributes
            filename -- Path of the pdf file to read.
        Returns
            All word tokens of the document joined by single spaces.
        Raises
            Exception if the file name does not end with ".pdf".
        """
        if not filename.endswith(".pdf"):
            raise Exception('Input file is not in pdf format !!')
        words = []
        # Keep the file open while pages are extracted, close it afterwards.
        with open(filename, "rb") as pdf_file:
            pdf = pyPdf.PdfFileReader(pdf_file)
            for page in pdf.pages:
                tokens = re.split(r'\W+', page.extractText())
                # re.split yields empty strings at the text boundaries; drop them.
                words.extend(token for token in tokens if token)
        return ' '.join(words)

    @staticmethod
    def preprocess(text, remove_stopwords = True):
        """ Function to clean raw text: strips HTML markup, keeps only ASCII
            letters, lower-cases, and optionally removes English stop words.
        Attributes
            text -- Raw text to clean.
            remove_stopwords -- Drop NLTK English stop words when True (default).
        Returns
            The remaining words joined by single spaces (a string).
        """
        text = text.strip()
        if len(text) == 0:
            return text

        # 1. Remove HTML (explicit parser: avoids the bs4 warning and keeps the
        #    result independent of which optional parsers are installed)
        text = BeautifulSoup(text, 'html.parser').get_text()

        # 2. Remove literal "\n" sequences and every non-letter character
        text = re.sub(r"\\n", " ", text)
        text = re.sub("[^a-zA-Z]", " ", text)

        # 3. Convert words to lower case and split them
        words = text.lower().split()

        # 4. Optionally remove stop words (enabled by default)
        if remove_stopwords:
            stops = set(stopwords.words('english'))
            words = [w for w in words if w not in stops]

        # 5. Join the words back into one string separated by space,
        # and return the result.
        return " ".join(words)

    def prepare_data(self):
        """ Reads every pdf below the data directory into the training buffers.

            Fills X_train_ with the cleaned document texts and y_train_ with
            the encoded label of the sub-directory each document came from.
            The buffers are reset first, so calling this twice does not
            duplicate samples. Entries that are not directories and files
            that are not pdfs are skipped.
        """
        self.X_train_ = []
        self.y_train_ = []
        self.label_encoder_.fit(self.raw_labels_)
        logger.debug('Training Category Predictor for following categories:\n{0}'
                     .format('\t'.join(self.raw_labels_)))

        # Sorted listings make the sample order independent of the file system.
        for category_dir in sorted(os.listdir(self.data_dir_)):
            category_path = os.path.join(self.data_dir_, category_dir)
            if not os.path.isdir(category_path):
                logger.debug('Skipping non-directory entry: {0}'.format(category_dir))
                continue
            logger.info("Processing Category: {0}".format(category_dir))

            documents = []
            for document in sorted(os.listdir(category_path)):
                if document.endswith('.pdf'):
                    documents.append(document)
                else:
                    logger.debug('Skipping non-pdf file: {0}'.format(document))
            if not documents:
                continue

            # The label is the same for every document of the directory.
            label = self.label_encoder_.transform([category_dir])[0]
            for document in documents:
                raw_text = self.read_document(os.path.join(category_path, document))
                # This might not be required if the vectorizer provides
                # all required processing capabilities
                clean_text = self.preprocess(raw_text)
                self.X_train_.append(clean_text)
                self.y_train_.append(label)

    def train_model(self, random_state=None):
        """ Grid-searches a TF-IDF + SGD pipeline on the prepared data.
        Attributes
            random_state -- Seed forwarded to SGDClassifier; None (default)
                            keeps the classifier's unseeded behaviour.
        Returns
            The best pipeline found; it is also stored in model_, and the
            fitted grid search in grid_search_.
        Raises
            ValueError if prepare_data() has not produced any sample.
        """
        if not self.X_train_:
            raise ValueError('No training data available; call prepare_data() first !!')

        pipeline = Pipeline([
            ('vect', TfidfVectorizer(analyzer='word', stop_words='english',
                                     strip_accents='unicode', ngram_range=(1,2))),
            ('clf', SGDClassifier(penalty='l2', random_state=random_state))
        ])

        # Uncommenting more parameters will give better exploring power but will
        # increase processing time in a combinatorial way
        parameters = {
            'vect__max_df': (0.5, 0.75, 1.0),
            #'vect__max_features': (None, 5000, 10000, 50000),
            #'vect__use_idf': (True, False),
            #'vect__norm': ('l1', 'l2'),
            'clf__alpha': (0.00001, 0.000001),
            #'clf__penalty': ('l2', 'elasticnet'),
            #'clf__n_iter': (10, 50, 80),
        }

        # Find the best parameters for both the feature extraction and the
        # classifier
        grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=0)

        # Single-argument prints render the same on Python 2 and Python 3.
        print("Performing grid search...")
        print("pipeline: %s" % [name for name, _ in pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(self.X_train_, self.y_train_)
        print("done in %0.3fs" % (time() - t0))
        print("")

        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))

        # Keep the fitted objects so the trained model can actually be used.
        self.grid_search_ = grid_search
        self.model_ = grid_search.best_estimator_
        return self.model_

In [93]:
def main():
    """ Looks up the categories of every term of the experiment file.

        Builds a CategoryMatcher from CATEGORY_FILE, then logs (at DEBUG
        level) each whitespace-separated term of EXPERIMENT_FILE together
        with the set of categories it belongs to.
    """
    category_matcher = CategoryMatcher(CATEGORY_FILE)
    category_matcher.build()
    # Text mode so that lines are str on both Python 2 and Python 3.
    with open(EXPERIMENT_FILE, 'r') as f:
        for line in f:
            # split() drops the trailing newline and never yields empty terms.
            for term in line.split():
                categories = category_matcher.get_term_categories(term)
                # The first argument of logger.debug is a %-format string, so
                # the term must be passed as an argument, not as the message.
                logger.debug('%s: %s', term, categories)
                
if __name__ == '__main__':
    # Build the predictor and load the training documents found under DATA_DIR.
    # `category_predictor` stays a module-level name because the next cell
    # calls train_model() on it. Note that main() is not invoked here.
    category_predictor = CategoryPredictor(CATEGORY_FILE, DATA_DIR)
    category_predictor.prepare_data()

DEBUG:root:Training Category Predictor for following categories:
centrifuge	earthquake
INFO:root:Processing Category: centrifuge
INFO:root:Processing Category: earthquake


In [94]:
# Grid-search the pipeline on the documents collected by prepare_data() in the
# previous cell; the search progress and best parameters are printed below.
category_predictor.train_model()

Performing grid search...
('pipeline:', ['vect', 'clf'])
parameters:
{'clf__alpha': (1e-05, 1e-06), 'vect__max_df': (0.5, 0.75, 1.0)}




done in 0.477s
()
Best score: 1.000
Best parameters set:
	clf__alpha: 1e-05
	vect__max_df: 0.5
