In [1]:
import libmxnet
import numpy as np
import subprocess
from sklearn.base import BaseEstimator
import os
import shutil
import time
from itertools import izip, imap
from sklearn.metrics import log_loss, mean_squared_error
from scipy.special import expit 

In [2]:
class SklearnMxnet(BaseEstimator):
    """scikit-learn style estimator backed by the ``matrixnet`` command line tool.

    ``fit`` dumps the training data to a tab-separated file inside a scratch
    directory, runs the ``matrixnet`` executable on it and loads the produced
    ``matrixnet.info`` model through ``libmxnet.TMXNetInfo``.  Predictions are
    computed in-process by calling ``Calculate`` on the loaded model.

    Parameters
    ----------
    iters_count : int or None
        Forwarded to ``matrixnet`` as ``-i`` when not None.
    depth : int or None
        Forwarded to ``matrixnet`` as ``-n`` when not None.
    mxnet_mode : {'regressor', 'bin_classifier'}
        ``'bin_classifier'`` adds ``-c --c-fast`` to the command line and makes
        the prediction methods pass the raw model output through the logistic
        sigmoid.
    dicrete_levels : optional
        Forwarded to ``matrixnet`` as ``-x`` when not None.
    factors_fraction : optional
        Forwarded to ``matrixnet`` as ``-Z`` when not None.
    f_taken_fraction : optional
        Forwarded to ``matrixnet`` as ``-S`` when not None.
    model_complexity : optional
        Forwarded to ``matrixnet`` as ``-g`` when not None.
    verbose : bool
        When true, the captured standard output of ``matrixnet`` is printed.

    Attributes
    ----------
    mxnet_ : libmxnet.TMXNetInfo
        The trained model; only present after a successful ``fit``.
    """

    # Constructor arguments: the single source of truth for get_params/set_params.
    _PARAM_NAMES = (
        'mxnet_mode', 'iters_count', 'depth', 'dicrete_levels',
        'factors_fraction', 'f_taken_fraction', 'model_complexity', 'verbose',
    )

    # (command line flag, attribute) pairs appended, in this order, whenever
    # the attribute is not None.
    _OPTIONAL_FLAGS = (
        ('-x', 'dicrete_levels'),
        ('-Z', 'factors_fraction'),
        ('-S', 'f_taken_fraction'),
        ('-g', 'model_complexity'),
        ('-i', 'iters_count'),
        ('-n', 'depth'),
    )

    def __init__(
        self, iters_count=100, depth=None,
        mxnet_mode='regressor', dicrete_levels=None,
        factors_fraction=None, f_taken_fraction=None,
        model_complexity=None, verbose=False
    ):
        assert mxnet_mode in ['regressor', 'bin_classifier']
        self.mxnet_mode = mxnet_mode
        self.iters_count = iters_count
        self.depth = depth
        self.dicrete_levels = dicrete_levels
        self.factors_fraction = factors_fraction
        self.f_taken_fraction = f_taken_fraction
        self.model_complexity = model_complexity
        self.verbose = verbose

    @staticmethod
    def _write_train_file(path, X, y):
        """Write one tab-separated line per (row of X, element of y) to ``path``."""
        with open(path, 'w') as f:
            for i, (features, target) in enumerate(izip(X, y)):
                f.write('{query_id}\t{relevance}\t{label}\t{weight}\t{features}\n'.format(
                    query_id=0,
                    relevance=target,
                    weight=1,
                    label=i,
                    features='\t'.join(str(value) for value in features),
                ))

    def _build_command(self, train_path, model_path):
        """Assemble the ``matrixnet`` argument list from the current parameters."""
        cmd = ['matrixnet', '-f', train_path, '-o', model_path]
        if self.mxnet_mode == 'bin_classifier':
            cmd.extend(['-c', '--c-fast'])
        for flag, attr in self._OPTIONAL_FLAGS:
            value = getattr(self, attr)
            if value is not None:
                cmd.extend([flag, str(value)])
        return cmd

    def fit(self, X, y):
        """Train a model on ``X`` / ``y`` by running the ``matrixnet`` executable.

        Parameters
        ----------
        X : iterable of feature rows
        y : iterable of targets, paired with the rows of ``X`` in order

        Returns
        -------
        self

        Raises
        ------
        RuntimeError
            If ``matrixnet`` exits with a non-zero status.
        """
        import tempfile  # function-scope: the file's import cell does not provide it

        # mkdtemp guarantees a unique directory; a name derived from
        # time.time() can collide when several fits start at the same moment
        # (e.g. a parallel grid search).  The directory stays in the current
        # working directory, as before.
        folder_name = tempfile.mkdtemp(prefix='__tmp_mxnet_learn_', dir=os.curdir)
        try:
            train_path = os.path.join(folder_name, 'train')
            model_path = os.path.join(folder_name, 'matrixnet')

            self._write_train_file(train_path, X, y)

            process = subprocess.Popen(
                self._build_command(train_path, model_path),
                stdout=subprocess.PIPE,
            )
            resp = process.communicate()[0]

            if self.verbose:
                print(resp)

            # Fail with the tool's own output instead of an obscure error
            # while loading a model file that was never written.
            if process.returncode != 0:
                raise RuntimeError(
                    'matrixnet exited with code {}:\n{}'.format(process.returncode, resp)
                )

            self.mxnet_ = libmxnet.TMXNetInfo(model_path + '.info')
        finally:
            # Always remove the scratch directory, including on failure;
            # ignore_errors keeps a cleanup problem from masking the real error.
            shutil.rmtree(folder_name, ignore_errors=True)
        return self

    def _raw_predictions(self, X):
        """Return the model's raw output for every row of ``X`` as a numpy array.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called successfully yet.
        """
        if not hasattr(self, 'mxnet_'):
            raise RuntimeError("You must train mxnet before predicting data")
        return np.array(self.mxnet_.Calculate([list(x) for x in X]))

    def predict(self, X):
        """Predict targets for ``X``.

        In ``'bin_classifier'`` mode the result is an integer array of 0/1
        labels (1 where the sigmoid of the raw output exceeds 0.5); otherwise
        the raw model output is returned.
        """
        prediction = self._raw_predictions(X)
        if self.mxnet_mode == 'bin_classifier':
            prediction = (expit(prediction) > 0.5).astype(int)
        return prediction

    def predict_proba(self, X):
        """Return the sigmoid of the raw output in ``'bin_classifier'`` mode.

        The result has one value per row of ``X`` (not one column per class).
        In any other mode the raw model output is returned unchanged.
        """
        prediction = self._raw_predictions(X)
        if self.mxnet_mode == 'bin_classifier':
            prediction = expit(prediction)
        return prediction

    def get_params(self, deep=True):
        """Return the constructor parameters as a ``{name: value}`` dict.

        ``deep`` is accepted for scikit-learn compatibility and is unused.
        """
        return {name: getattr(self, name) for name in self._PARAM_NAMES}

    def set_params(self, **params):
        """Update constructor parameters and return ``self``.

        Raises
        ------
        ValueError
            If a name is not a constructor parameter.  Unknown names used to be
            ignored silently, which hid typos in parameter grids.
        """
        unknown = sorted(set(params) - set(self._PARAM_NAMES))
        if unknown:
            raise ValueError(
                'Invalid parameter(s) {} for estimator SklearnMxnet'.format(unknown)
            )
        for name, value in params.items():
            setattr(self, name, value)
        return self

    def score(self, X, y):
        """Return a score for ``X`` / ``y`` where a larger value is better.

        Negated log loss in ``'bin_classifier'`` mode, negated mean squared
        error in ``'regressor'`` mode.  Both are negated so that maximising
        the score selects the best model in either mode.

        Raises
        ------
        ValueError
            If ``mxnet_mode`` is neither of the two supported modes.
        """
        y_pred = self.predict_proba(X)
        if self.mxnet_mode == 'bin_classifier':
            return -log_loss(y, y_pred)
        if self.mxnet_mode == 'regressor':
            return -mean_squared_error(y, y_pred)
        raise ValueError('Unsupported mxnet_mode: {!r}'.format(self.mxnet_mode))

In [3]:
from sklearn.datasets import make_regression
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LinearRegression

In [4]:
# Synthetic regression problem: 5000 samples, 10 features, noise=1000.
# NOTE(review): no random_state is passed, so unless numpy's global RNG is
# seeded elsewhere the data (and the scores below) differ between runs.
X_data, y_data = make_regression(
    n_samples=5000,
    n_features=10,
    noise=1000,
)

In [5]:
# Grid-search the SklearnMxnet regressor on the synthetic regression data.
tuned_params = {
    'depth': [2, 4, 6, 8],
    'iters_count': [100, 200, 300],
    'f_taken_fraction': [None, 0.7],
}
gs = GridSearchCV(
    SklearnMxnet(),
    tuned_params,
    scoring='mean_squared_error',
)
gs.fit(X_data, y_data)

# Best parameter combination and its cross-validated score.
print(gs.best_params_)
print(gs.best_score_)

{'depth': 2, 'iters_count': 300, 'f_taken_fraction': None}
-996290.978241


In [6]:
# Baseline on the same data: linear regression with and without an intercept.
tuned_params = {
    'fit_intercept': [True, False],
}
gs = GridSearchCV(
    LinearRegression(),
    tuned_params,
    scoring='mean_squared_error',
)
gs.fit(X_data, y_data)

# Best parameter combination and its cross-validated score.
print(gs.best_params_)
print(gs.best_score_)

{'fit_intercept': False}
-992477.518841


In [7]:
from sklearn.datasets import make_classification
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [8]:
# Synthetic classification problem: 5000 samples, 10 features.
# This rebinds X_data / y_data, replacing the regression data from above.
# NOTE(review): no random_state is passed, so unless numpy's global RNG is
# seeded elsewhere the data (and the scores below) differ between runs.
X_data, y_data = make_classification(
    n_samples=5000,
    n_features=10,
)

In [9]:
# Grid-search `depth` for the SklearnMxnet binary classifier at a fixed
# iteration count, scored with the 'log_loss' scorer.
tuned_params = {
    'depth': [2, 4, 6, 8],
    'iters_count': [200],
}
gs = GridSearchCV(
    SklearnMxnet(mxnet_mode='bin_classifier'),
    tuned_params,
    scoring='log_loss',
)
gs.fit(X_data, y_data)

# Best parameter combination and its cross-validated score.
print(gs.best_params_)
print(gs.best_score_)

{'depth': 4, 'iters_count': 200}
-0.218350088109


In [10]:
# Baseline on the same data: logistic regression with l1 vs. l2 penalty.
tuned_params = {
    'penalty': ['l1', 'l2'],
}
gs = GridSearchCV(
    LogisticRegression(),
    tuned_params,
    scoring='log_loss',
)
gs.fit(X_data, y_data)

# Best parameter combination and its cross-validated score.
print(gs.best_params_)
print(gs.best_score_)

{'penalty': 'l1'}
-0.289103510667


In [11]:
from sklearn.datasets import load_boston
# Load scikit-learn's bundled Boston dataset; the cells below use
# `boston.data` as the feature matrix and `boston.target` as the target.
boston = load_boston()

In [12]:
# Baseline on the Boston data: linear regression with and without an intercept.
tuned_params = {
    'fit_intercept': [True, False],
}
gs = GridSearchCV(
    LinearRegression(),
    tuned_params,
    scoring='mean_squared_error',
)
gs.fit(boston.data, boston.target)

# Best parameter combination and its cross-validated score.
print(gs.best_params_)
print(gs.best_score_)

{'fit_intercept': True}
-168.089177602


In [13]:
# Grid-search the SklearnMxnet regressor on the Boston data, using the same
# parameter grid as for the synthetic regression problem.
tuned_params = {
    'depth': [2, 4, 6, 8],
    'iters_count': [100, 200, 300],
    'f_taken_fraction': [None, 0.7],
}
gs = GridSearchCV(
    SklearnMxnet(),
    tuned_params,
    scoring='mean_squared_error',
)
gs.fit(boston.data, boston.target)

# Best parameter combination and its cross-validated score.
print(gs.best_params_)
print(gs.best_score_)

{'depth': 2, 'iters_count': 300, 'f_taken_fraction': 0.7}
-28.2219447143
