In [23]:
from dfiner.classifier.lexicon import Lexicon
from dfiner.utils.utils import dump_pickle, load_pickle
import os
import types
from collections import defaultdict
import numpy as np
from scipy import sparse

class FeatureStroage(object):
    """File-based cache for feature matrices.

    A cached matrix lives under a *base path*; the storage format is encoded
    by a suffix appended to that base path:

    * ``<base>.dense_mat`` -- a dense array written with ``np.save``
    * ``<base>.coo_mat``   -- a sparse matrix written in COO form with ``np.savez``

    ``load`` derives the base path from ``cache_dir`` and the
    ``(feature_name, corpora_name)`` pair.
    """

    cache_dir = "/tmp/cache"
    COO = "COO"
    DENSE = "DENSE"

    # Suffixes appended to a cache base path, one per storage format.
    _COO_SUFFIX = ".coo_mat"
    _DENSE_SUFFIX = ".dense_mat"

    @staticmethod
    def save_sparse_csr(filename, array):
        """Write ``array`` in COO form (data / row / col / shape) via ``np.savez``.

        Despite the historical name, the on-disk layout is COO, not CSR.

        :param filename: path or open binary file object.  Note that
            ``np.savez`` appends ``.npz`` to a *string* path lacking it; pass
            an open file object to control the exact file name.
        :param array: any matrix accepted by ``scipy.sparse.coo_matrix``.
        """
        # Convert first so inputs that are not already COO (which have no
        # ``.row`` / ``.col`` attributes) are handled as well.
        coo = sparse.coo_matrix(array)
        np.savez(filename,
                 data=coo.data,
                 row=coo.row,
                 col=coo.col,
                 shape=coo.shape)

    @staticmethod
    def load_sparse_csr(mat_file):
        """Load the COO matrix stored at ``<mat_file>.coo_mat``.

        :param mat_file: cache base path (without the format suffix).
        :returns: a ``scipy.sparse.coo_matrix``.
        """
        path = "%s%s" % (mat_file, FeatureStroage._COO_SUFFIX)
        loader = np.load(path)
        try:
            data = loader['data']
            row = loader['row']
            col = loader['col']
            # The shape was stored as an array; coo_matrix wants a plain tuple.
            shape = tuple(int(dim) for dim in loader['shape'])
        finally:
            # The archive is read lazily, so close it only after all reads.
            loader.close()
        return sparse.coo_matrix((data, (row, col)), shape=shape)

    @staticmethod
    def load_dense_mat(mat_file):
        """Load the dense array stored at ``<mat_file>.dense_mat``.

        :param mat_file: cache base path (without the format suffix).
        """
        return np.load("%s%s" % (mat_file, FeatureStroage._DENSE_SUFFIX))

    @staticmethod
    def dump_mat(mat, filename):
        """Persist ``mat`` under the cache base path ``filename``.

        Sparse matrices are written to ``<filename>.coo_mat``; anything else
        is treated as a dense array and written to ``<filename>.dense_mat``.
        These are the names ``cache_exist`` and the loaders look for.

        :param mat: dense numpy array or scipy sparse matrix.
        :param filename: cache base path (without the format suffix).
        """
        directory = os.path.dirname(filename)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

        coo_path = "%s%s" % (filename, FeatureStroage._COO_SUFFIX)
        dense_path = "%s%s" % (filename, FeatureStroage._DENSE_SUFFIX)

        if sparse.issparse(mat):
            target, stale = coo_path, dense_path
            with open(target, "wb") as out:
                FeatureStroage.save_sparse_csr(out, mat)
        else:
            target, stale = dense_path, coo_path
            with open(target, "wb") as out:
                np.save(out, mat)

        # Drop a leftover file in the other format so a later lookup cannot
        # return an outdated matrix for the same base path.
        if os.path.exists(stale):
            os.remove(stale)

    @staticmethod
    def cache_exist(mat_file):
        """Report which cached format exists for the base path ``mat_file``.

        :returns: ``FeatureStroage.DENSE``, ``FeatureStroage.COO`` or ``None``
            when nothing is cached.  The dense file is checked first.
        """
        if os.path.exists("%s%s" % (mat_file, FeatureStroage._DENSE_SUFFIX)):
            return FeatureStroage.DENSE
        if os.path.exists("%s%s" % (mat_file, FeatureStroage._COO_SUFFIX)):
            return FeatureStroage.COO
        return None

    @classmethod
    def load(cls, feature_name, corpora_name):
        """Return the cached matrix for ``(feature_name, corpora_name)``.

        :returns: the cached dense array or COO matrix, or ``None`` when no
            cache file exists.
        """
        mat_path = os.path.join(cls.cache_dir,
                                "%s__%s.cache_mat" % (feature_name,
                                                      corpora_name))
        existed = cls.cache_exist(mat_path)
        if existed == cls.COO:
            return cls.load_sparse_csr(mat_path)
        if existed == cls.DENSE:
            return cls.load_dense_mat(mat_path)
        return None

class FeatureExtractor(object):
    """Runs a list of feature functions over a corpus and collects the results."""

    def __init__(self, feature_functions):
        """
        :param feature_functions: iterable of feature-function objects; each
            must provide ``matrix_of(corpora)`` and may provide ``name``,
            ``lex`` and ``build_lexicon``.
        """
        self.ffs = feature_functions

    def extract(self, corpora_name, corpora, force_update=True , save_new_to_cache = True):
        """Compute (or fetch from cache) one feature matrix per feature function.

        :param corpora_name: identifier of the corpus, used as part of the cache key.
        :param corpora: the objects to extract features from.
        :param force_update: when True (default) the cache is never consulted.
        :param save_new_to_cache: accepted for interface compatibility; this
            method does not currently write anything to the cache.
        :returns: ``(matrices, lookup_lexicons)`` -- one matrix per feature
            function, and the concatenation of all entries obtained by
            iterating each function's lexicon.
        """
        matrices = []
        lookup_lexicons = []
        for feature_func in self.ffs:
            mat = None
            if not force_update:
                # The cache is keyed by the feature's name; fall back to the
                # object itself for feature functions that carry no name.
                cache_key = getattr(feature_func, "name", feature_func)
                mat = FeatureStroage.load(cache_key, corpora_name)

            # Compare against None explicitly: truth-testing a numpy array or
            # sparse matrix is ambiguous and raises.
            if mat is None:
                # Cache not allowed or cache miss: compute from scratch.
                mat = feature_func.matrix_of(corpora)
            matrices.append(mat)

            # Dense feature functions have no lexicon (lex is None).
            lex = getattr(feature_func, "lex", None)
            if lex is not None:
                for v in lex:
                    lookup_lexicons.append(v)
        return (matrices, lookup_lexicons)

    def build_lexicon(self, corpora_name, corpora, min_support=5, force_update=False):
        """Ask every feature function that supports it to build its lexicon.

        Feature functions without a ``build_lexicon`` method (the dense ones)
        are skipped.  ``corpora_name`` is not used here.
        """
        for feature_func in self.ffs:
            builder = getattr(feature_func, "build_lexicon", None)
            if builder is None:
                continue
            builder(corpora, min_support, force_update)
            
            
class FeatureFunction_(object):
    """Wraps a plain function that yields features for a single object.

    The wrapped function is expected to produce, for one input object, an
    iterable whose items are either a bare feature key or a ``(key, value)``
    pair.  ``matrix_of`` normalises both forms to ``(key, value)`` pairs.
    """

    cache_dir = "/tmp/cache"

    def __init__(self, feature_func, name, reuse_lex_from_cache = True):
        """
        :param feature_func: callable taking one object and returning an
            iterable of features.
        :param name: feature name; used in ``repr`` and in the lexicon cache
            file name.
        :param reuse_lex_from_cache: when True, load a previously pickled
            lexicon from ``<cache_dir>/<name>.cache_lex`` if that file exists.
        """
        self.lex = None
        self.func = feature_func
        self.name = name
        if reuse_lex_from_cache:
            lex_path = os.path.join(FeatureFunction_.cache_dir,
                            "%s.cache_lex" % (name))
            if os.path.exists(lex_path):
                # NOTE(review): this unpickles a file from a shared temp
                # directory; unpickling untrusted data can execute arbitrary
                # code.  Confirm the cache location is trusted.
                self.lex = load_pickle(lex_path)

    def __call__(self, *args, **kwargs):
        """Invoke the wrapped feature function directly."""
        return self.func(*args, **kwargs)

    def __repr__(self):
        return "%s.FeatureFunction()" % self.name

    def build_lexicon(self, corpora, min_support=5, force_update=False):
        """Ensure ``self.lex`` holds a lexicon.

        An existing lexicon is kept unless ``force_update`` is True, in which
        case it is replaced by a fresh, empty ``Lexicon``.  ``corpora`` and
        ``min_support`` are accepted but not used by this implementation.
        """
        if not force_update and self.lex is not None:
            return
        self.lex = Lexicon()

    def freeze_lexicon(self, lex=None):
        """Stop the lexicon from accepting new lexemes.

        :param lex: unused; kept so existing callers that pass it still work.
        """
        self.lex.allow_new_lexemes = False

    def prune(self, min_support):
        """Delegate to ``self.lex.prune(min_support)``."""
        self.lex.prune(min_support)

    @staticmethod
    def _as_pair(item):
        """Normalise one yielded feature to a ``(key, value)`` pair."""
        # Only a real 2-element tuple/list is treated as (key, value).  A
        # plain unpacking attempt would split a two-character string key such
        # as "12" into ("1", "2"), and would raise on non-iterable keys.
        if isinstance(item, (tuple, list)) and len(item) == 2:
            key, value = item
            return (key, value)
        return (item, 1.0)

    def matrix_of(self, objs):
        """Run the feature function over ``objs``.

        :returns: a list with one entry per object; each entry is a list of
            ``(key, value)`` pairs, where bare keys get the value ``1.0``.
        """
        if self.lex is None:
            self.lex = Lexicon()
        features = []
        for obj in objs:
            features.append([self._as_pair(item) for item in self.func(obj)])
        return features
 
            
class DenseFeatureFunction_(object):
    """Wraps a function that returns one dense feature row per object.

    Dense features carry no lexicon: ``lex`` is initialised to None and
    nothing in this class assigns it afterwards.
    """

    def __init__(self, feature_func, name):
        """
        :param feature_func: callable taking one object and returning a row
            (something ``np.vstack`` can stack).
        :param name: feature name, used in ``repr``.
        """
        self.lex = None
        self.func = feature_func
        self.name = name

    def __call__(self, *args, **kwargs):
        """Invoke the wrapped feature function directly."""
        return self.func(*args, **kwargs)

    def __repr__(self):
        return "%s.FeatureFunction()" % self.name

    def freeze_lexicon(self, lex=None):
        """Freeze the lexicon if one has been attached; otherwise do nothing.

        :param lex: unused; kept so existing callers that pass it still work.
        """
        # Without this guard the call raises AttributeError on None.
        if self.lex is not None:
            self.lex.allow_new_lexemes = False

    def prune(self, min_support):
        """Prune the lexicon if one has been attached; otherwise do nothing."""
        if self.lex is not None:
            self.lex.prune(min_support)

    def matrix_of(self, objs):
        """Stack the rows produced for each object into one array.

        :returns: the result of ``np.vstack`` over the per-object rows.
        """
        rows = [self.func(obj) for obj in objs]
        return np.vstack(rows)
    
class FeatureFunction(object):
    """Decorator factory that turns a plain function into a ``FeatureFunction_``."""

    def __init__(self, name = None, reuse_lex = True):
        """
        :param name: explicit feature name; when None, the decorated
            function's ``__name__`` is used.
        :param reuse_lex: forwarded to ``FeatureFunction_`` as its third argument.
        """
        self.name = name
        self.reuse_lex = reuse_lex

    def __call__(self, original_func):
        """Wrap ``original_func`` and return the resulting ``FeatureFunction_``."""
        # Resolve the name locally instead of storing it on the decorator, so
        # one decorator instance applied to several functions names each
        # after itself rather than after the first one it saw.
        name = self.name
        if name is None:
            name = original_func.__name__

        return FeatureFunction_(original_func, name, self.reuse_lex)
    
class DenseFeatureFunction(object):
    """Decorator factory that turns a plain function into a ``DenseFeatureFunction_``."""

    def __init__(self, name = None):
        """
        :param name: explicit feature name; when None, the decorated
            function's ``__name__`` is used.
        """
        self.name = name

    def __call__(self, original_func):
        """Wrap ``original_func`` and return the resulting ``DenseFeatureFunction_``."""
        # Resolve the name locally instead of storing it on the decorator, so
        # one decorator instance applied to several functions names each
        # after itself rather than after the first one it saw.
        name = self.name
        if name is None:
            name = original_func.__name__

        return DenseFeatureFunction_(original_func, name)
    


In [34]:
# Debug switch read by the feature functions in this cell: when True, each
# one prints a trace line every time it is invoked.
print_when_run = True
@FeatureFunction()
def length(s):
    """Yield a single feature key: the length of ``s`` as a decimal string."""
    if print_when_run:
        print("length running...")
    size = len(s)
    yield str(size)
    
@FeatureFunction()
def letter_in_word(s):
    """Yield every element of ``s`` in order, one feature key per element.

    Repeated elements are yielded once per occurrence.
    """
    if print_when_run:
        print("letter_in_word running...")
    for letter in iter(s):
        yield letter

@FeatureFunction()
def letter_freq_in_word(s):
    """Yield ``(letter, count)`` pairs, one per distinct element of ``s``.

    Counts are floats.  Pairs are yielded in the iteration order of the
    underlying dict.
    """
    if print_when_run:
        # Trace message names this function (it previously printed the name
        # of letter_in_word, which made the two indistinguishable in output).
        print("letter_freq_in_word running...")
    # Float accumulator so the default value matches the 1.0 increments.
    counts = defaultdict(float)
    for c in s:
        counts[c] += 1.0
    for c in counts:
        yield c, counts[c]

        
@DenseFeatureFunction()
def number_of_a_and_b(s):
    """Return a 2-element array: occurrences of "a" and of "b" in ``s``."""
    if print_when_run:
        print("number_of_a_and_b running...")
    a_count, b_count = 0, 0
    for token in s:
        if token == "a":
            a_count += 1
        elif token == "b":
            b_count += 1
    return np.asarray([a_count, b_count])


In [35]:
# All feature functions defined in the previous cell, gathered in one list.
ffs = [length, letter_in_word, letter_freq_in_word, number_of_a_and_b]
# Small list of short strings passed to matrix_of in the following cells.
dataset = ["a", "b", "xx", "casa", "dbba"]


In [36]:
# Dense feature: one [count of "a", count of "b"] row per string, stacked with np.vstack.
number_of_a_and_b.matrix_of(dataset)

number_of_a_and_b running...
number_of_a_and_b running...
number_of_a_and_b running...
number_of_a_and_b running...
number_of_a_and_b running...


array([[1, 0],
       [0, 1],
       [0, 0],
       [2, 0],
       [1, 2]])

In [37]:
# Per string, a list of (letter, count) pairs with one pair per distinct letter.
letter_freq_in_word.matrix_of(dataset)

letter_in_word running...
letter_in_word running...
letter_in_word running...
letter_in_word running...
letter_in_word running...


[[('a', 1.0)],
 [('b', 1.0)],
 [('x', 2.0)],
 [('a', 2.0), ('c', 1.0), ('s', 1.0)],
 [('a', 1.0), ('b', 2.0), ('d', 1.0)]]

In [38]:
# Per string, one (letter, 1.0) pair per character; repeated letters are not merged.
letter_in_word.matrix_of(dataset)

letter_in_word running...
letter_in_word running...
letter_in_word running...
letter_in_word running...
letter_in_word running...


[[('a', 1.0)],
 [('b', 1.0)],
 [('x', 1.0), ('x', 1.0)],
 [('c', 1.0), ('a', 1.0), ('s', 1.0), ('a', 1.0)],
 [('d', 1.0), ('b', 1.0), ('b', 1.0), ('a', 1.0)]]