Merge pull request #2 from ikegami-yukino/sklearn-interface

Sklearn interface
ikegami-yukino · Nov 25, 2016 · e7a30e1 · e7a30e1
2 parents a8858d9 + 00291b3
commit e7a30e1
Show file tree

Hide file tree

Showing 4 changed files with 104 additions and 18 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,19 +1,31 @@
 language: python
 python:
-  - "2.6"
   - "2.7"
-  - "3.3"
   - "3.4"
+  - "3.5"
+
+# Set up Miniconda
+before_install:
+  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
+      wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
+    else
+      wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
+    fi
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b -p /home/travis/miniconda
+  - export PATH=/home/travis/miniconda/bin:$PATH
+  - conda update --yes conda
+
+# Install packages
 install:
-  - "python setup.py install"
-  - "pip install coveralls"
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION numpy scipy nose
+  - pip install coveralls nose-cov
+  - python setup.py install
+
+# Run test
 script:
-  - "nosetests --with-coverage --cover-package=oll"
+  - nosetests --with-coverage --cover-package=oll
+
+# Report coverage to Coveralls
 after_success:
   - coveralls
-notifications:
-  email:
-    recipients:
-      - yukino0131@me.com
-    on_success: always
-    on_failure: always
diff --git a/oll/oll.py b/oll/oll.py
@@ -512,6 +512,52 @@ def add(self, example, y):
             fv.push_back(IntFloatPair(_id, value))
         self.train_method(fv, y)
 
+    def _array_to_feature_vector(self, x):
+        fv = FeatureVector()
+        if hasattr(x, 'indices'):  # for sparse matrix
+            indices = map(int, x.indices)
+            values = map(float, x.data)
+        else:
+            nonzero = x.nonzero()
+            indices = map(int, nonzero[0])
+            values = map(float, x[nonzero])
+        for (_id, value) in zip(indices, values):
+            fv.push_back(IntFloatPair(_id, value))
+        return fv
+
+    def fit(self, X, y):
+        """
+        Train on examples given as a numpy/scipy array.
+
+        Args
+        X : numpy.ndarray or scipy.sparse matrix,
+            shape = (n_samples, self.n_features)
+        y : iterable of labels, one per row of X; must contain both 1 and -1 only
+        """
+        assert set(y) == set([1, -1])
+        for (i, y_i) in enumerate(map(int, y)):
+            fv = self._array_to_feature_vector(X[i])
+            self.train_method(fv, y_i)
+
+    def predict(self, X):
+        """
+        Predict labels for examples given as a numpy/scipy array.
+
+        Args
+        X : numpy.ndarray or scipy.sparse matrix,
+            shape = (n_samples, self.n_features)
+        Return
+        labels : list of int, one per row of X (each label is 1 or -1)
+        """
+        X = X.astype('float32')
+        labels = []
+        for i in range(X.shape[0]):
+            fv = self._array_to_feature_vector(X[i])
+            score = _oll.oll_classify(self, fv)
+            labels.append(1 if score > 0 else -1)
+        return labels
+
+
 oll_swigregister = _oll.oll_swigregister
 oll_swigregister(oll)
 

diff --git a/setup.py b/setup.py
@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
+from codecs import open
 import os
 import re
 from setuptools import setup, Extension
 
 
-with open(os.path.join('oll', '__init__.py'), 'r') as f:
+with open(os.path.join('oll', '__init__.py'), 'r', encoding='utf8') as f:
     version = re.compile(
         r'.*__version__ = "(.*?)"', re.S).match(f.read()).group(1)
 
@@ -20,10 +21,11 @@
     name='oll',
     version=version,
     author="Yukino Ikegami",
-    author_email='yukino0131@me.com',
+    author_email='yknikgm@gmail.com',
     url='https://github.com/ikegami-yukino/oll-python',
-    description="Online machine learning algorithms library (wrapper for OLL C++ library)",
-    long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(),
+    description="Online binary classification algorithms library (wrapper for OLL C++ library)",
+    long_description='%s\n\n%s' % (open('README.rst', encoding='utf8').read(),
+                                   open('CHANGES.rst', encoding='utf8').read()),
     ext_modules=[oll_module],
     py_modules=["oll"],
     headers=['lib/oll.hpp'],
@@ -33,14 +35,13 @@
         'Intended Audience :: Developers',
         'Intended Audience :: Science/Research',
         'License :: OSI Approved :: BSD License',
-        'Programming Language :: Python :: 2.6',
         'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3.3',
         'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Scientific/Engineering :: Information Analysis',
         'Topic :: Text Processing :: Linguistic'
-        ],
+    ],
     keywords=['machine learning', 'online learning', 'perceptron',
               'Passive Agressive', 'PA', 'ALMA',
               'Confidence Weighted Linear-Classification'],

diff --git a/test_oll.py b/test_oll.py
@@ -2,6 +2,8 @@
 import os
 import tempfile
 from nose.tools import ok_, eq_, assert_raises, assert_almost_equals
+import numpy as np
+from scipy.sparse import csr_matrix
 import oll
 
 
@@ -75,3 +77,28 @@ def test_setC(self):
 
     def test_setBias(self):
         self.oll.setBias(0.14)
+
+    def test_fit(self):
+        np_array = np.array([[1.0, 2.0, -1.0], [-0.5, 1.0, -0.5]])
+        y = [1, -1]
+        self.oll.fit(np_array, y)
+        assert_almost_equals(self.oll.classify({0: 1.0, 1: 1.0}), 0.171429, 6)
+
+        self.oll = oll.oll('PA1')
+        sparse_matrix = csr_matrix([[1.0, 2.0, -1.0], [-0.5, 1.0, -0.5]])
+        self.oll.fit(sparse_matrix, y)
+        assert_almost_equals(self.oll.classify({0: 1.0, 1: 1.0}), 0.171429, 6)
+
+        self.oll = oll.oll('PA1')
+        self.oll.fit(sparse_matrix, np.array([1, -1]))
+
+        assert_raises(AssertionError, self.oll.fit, np_array, [1, 2])
+
+    def test_predict(self):
+        self.oll.add({0: 1.0, 1: 2.0, 2: -1.0}, 1)
+        self.oll.add({0: -0.5, 1: 1.0, 2: -0.5}, -1)
+        np_array = np.array([[1.0, 1.0]])
+        eq_(self.oll.predict(np_array), [1])
+
+        sparse_matrix = csr_matrix([[1.0, 1.0]])
+        eq_(self.oll.predict(sparse_matrix), [1])