Merge pull request #9 from jlgarridol/development

Update to version 1.0.3
jlgarridol · Mar 29, 2023 · 21b3d2c · 21b3d2c
2 parents 51a132e + f2563cb
commit 21b3d2c
Show file tree

Hide file tree

Showing 11 changed files with 228 additions and 80 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.3] - 2023-03-29
+
+### Added
+- Methods now support datasets with no unlabeled data. In this case, the method behaves the same as the base estimator.
+
+### Changed
+- In OneHotEncoder, the `sparse` parameter is now `sparse_output` to avoid a FutureWarning.
+
+### Fixed
+
+- CoForest is now more similar to the original paper.
+- TriTraining can use at most 3 n_jobs. Fixed the bug that allowed using as many n_jobs as CPUs in the machine.
+
 ## [1.0.2] - 2023-02-17
 
 ### Fixed

diff --git a/README.md b/README.md
@@ -45,12 +45,12 @@ Citing
                   César García-Osorio and
                   Juan J. Rodríguez and
                   Jesus Maudes},
-  title        = {jlgarridol/sslearn: Zenodo Indexed},
-  month        = jan,
+  title        = {jlgarridol/sslearn: V1.0.2},
+  month        = feb,
   year         = 2023,
   publisher    = {Zenodo},
-  version      = {1.0.1},
-  doi          = {10.5281/zenodo.7565222},
-  url          = {https://doi.org/10.5281/zenodo.7565222}
+  version      = {1.0.2},
+  doi          = {10.5281/zenodo.7650049},
+  url          = {https://doi.org/10.5281/zenodo.7650049}
 }
 ```
diff --git a/pytest.ini b/pytest.ini
@@ -5,3 +5,5 @@ filterwarnings =
     ignore:divide by zero
     ignore:X does not have valid feature names
     ignore:invalid value encountered in divide 
+    ignore:Poolsize
+    ignore:y contains no unlabeled samples
diff --git a/requirements.txt b/requirements.txt
@@ -2,5 +2,5 @@ joblib==1.2.0
 numpy==1.23.3
 pandas==1.4.3
 scikit_learn==1.2.0
-scipy==1.9.3
+scipy==1.10.1
 statsmodels==0.13.2
diff --git a/sslearn/base.py b/sslearn/base.py
@@ -20,17 +20,13 @@
 from sklearn.ensemble._base import _set_random_states
 from sklearn.utils import check_random_state
 
-
-
-
 def get_dataset(X, y):
 
     is_df = False
     if isinstance(X, pd.DataFrame):
         is_df = True
         columns = X.columns
 
-
     X = check_array(X)
     y = check_array(y, ensure_2d=False, dtype=y.dtype.type)
 

diff --git a/sslearn/model_selection/_split.py b/sslearn/model_selection/_split.py
@@ -76,8 +76,8 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
     y_unlabel: ndarray
         The true label for each y in the same order.
     """
-    assert (label_rate > 0) and (label_rate < 100),\
-        "Label rate must be in (0, 100)."
+    assert (label_rate > 0) and (label_rate < 1),\
+        "Label rate must be in (0, 1)."
     assert "test_size" not in kwards and "train_size" not in kwards,\
         "Test size and train size are illegal parameters in this method."
     X_label, X_unlabel, y_label, true_label = \

diff --git a/sslearn/utils.py b/sslearn/utils.py
@@ -3,30 +3,25 @@
 import math
 
 import pandas as pd
-import json
 
 from statsmodels.stats.proportion import proportion_confint
-import scipy.stats as st
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.base import ClassifierMixin
 
-import sslearn
-
 
 def safe_division(dividend, divisor, epsilon):
     if divisor == 0:
         return dividend / epsilon
     return dividend / divisor
 
 
-def confidence_interval(X, hyp, y, alpha=.95 ):
+def confidence_interval(X, hyp, y, alpha=.95):
     data = hyp.predict(X)
 
     successes = np.count_nonzero(data == y)
     trials = X.shape[0]
     li, hi = proportion_confint(successes, trials, alpha=1 - alpha, method="wilson")
     return li, hi
-
 
 
 def choice_with_proportion(predictions, class_predicted, proportion, extra=0):
@@ -69,6 +64,27 @@ def is_int(x):
     return isinstance(x, (int, np.integer)) and not isinstance(x, bool)
 
 
+def mode(y):
+    """Calculate the mode of a list of values
+
+    Parameters
+    ----------
+    y : array-like of shape (n_estimators, n_samples)
+        array of values; the mode is computed column-wise
+
+    Returns
+    -------
+    mode: array-like of shape (n_samples,)
+        array of mode of each label
+    count: array-like of shape (n_samples,)
+        array of count of the mode of each label
+    """
+    array = pd.DataFrame(np.array(y))
+    mode = array.mode(axis=0).loc[0, :]
+    count = array.apply(lambda x: x.value_counts().max())
+    return mode.values, count.values
+
+
 def check_n_jobs(n_jobs):
     """Check `n_jobs` parameter according to the scikit-learn convention.
     From sktime: BSD 3-Clause
@@ -101,9 +117,10 @@ def calc_number_per_class(y_label):
     number_per_class = dict()
     for c in classes:
         number_per_class[c] = math.ceil(proportion[c] * factor)
-    
+
     return number_per_class
 
+
 def check_classifier(base_classifier, can_be_list=True, collection_size=None):
 
     if base_classifier is None:
@@ -114,7 +131,7 @@ def check_classifier(base_classifier, can_be_list=True, collection_size=None):
                 raise AttributeError(f"base_classifier is a list of classifiers, but its length ({len(base_classifier)}) is different from expected ({collection_size})")
         for i, bc in enumerate(base_classifier):
             base_classifier[i] = check_classifier(bc, False)
-        return list(base_classifier) # Transform to list
+        return list(base_classifier)  # Transform to list
     else:
         if not isinstance(base_classifier, ClassifierMixin):
             raise AttributeError(f"base_classifier must be a ClassifierMixin, but found {type(base_classifier)}")