Skip to content

Commit

Permalink
Merge pull request #9 from jlgarridol/development
Browse files Browse the repository at this point in the history
Update to version 1.0.3
  • Loading branch information
jlgarridol committed Mar 29, 2023
2 parents 51a132e + f2563cb commit 21b3d2c
Show file tree
Hide file tree
Showing 11 changed files with 228 additions and 80 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.0.3] - 2023-03-29

### Added
- Methods now support datasets with no unlabeled data. In this case, the method behaves the same as the base estimator.

### Changed
- In OneHotEncoder, the `sparse` parameter is now `sparse_output` to avoid a FutureWarning.

### Fixed

- CoForest is now more similar to the original paper.
- TriTraining now uses at most 3 n_jobs. Fixed the bug that allowed using as many n_jobs as CPUs in the machine.

## [1.0.2] - 2023-02-17

### Fixed
Expand Down
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@ Citing
César García-Osorio and
Juan J. Rodríguez and
Jesus Maudes},
title = {jlgarridol/sslearn: Zenodo Indexed},
month = jan,
title = {jlgarridol/sslearn: V1.0.2},
month = feb,
year = 2023,
publisher = {Zenodo},
version = {1.0.1},
doi = {10.5281/zenodo.7565222},
url = {https://doi.org/10.5281/zenodo.7565222}
version = {1.0.2},
doi = {10.5281/zenodo.7650049},
url = {https://doi.org/10.5281/zenodo.7650049}
}
```
2 changes: 2 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ filterwarnings =
ignore:divide by zero
ignore:X does not have valid feature names
ignore:invalid value encountered in divide
ignore:Poolsize
ignore:y contains no unlabeled samples
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ joblib==1.2.0
numpy==1.23.3
pandas==1.4.3
scikit_learn==1.2.0
scipy==1.9.3
scipy==1.10.1
statsmodels==0.13.2
4 changes: 0 additions & 4 deletions sslearn/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,17 +20,13 @@
from sklearn.ensemble._base import _set_random_states
from sklearn.utils import check_random_state




def get_dataset(X, y):

is_df = False
if isinstance(X, pd.DataFrame):
is_df = True
columns = X.columns


X = check_array(X)
y = check_array(y, ensure_2d=False, dtype=y.dtype.type)

Expand Down
4 changes: 2 additions & 2 deletions sslearn/model_selection/_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ def artificial_ssl_dataset(X, y, label_rate=0.1, random_state=None, **kwards):
y_unlabel: ndarray
The true label for each y in the same order.
"""
assert (label_rate > 0) and (label_rate < 100),\
"Label rate must be in (0, 100)."
assert (label_rate > 0) and (label_rate < 1),\
"Label rate must be in (0, 1)."
assert "test_size" not in kwards and "train_size" not in kwards,\
"Test size and train size are illegal parameters in this method."
X_label, X_unlabel, y_label, true_label = \
Expand Down
33 changes: 25 additions & 8 deletions sslearn/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,30 +3,25 @@
import math

import pandas as pd
import json

from statsmodels.stats.proportion import proportion_confint
import scipy.stats as st
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import ClassifierMixin

import sslearn


def safe_division(dividend, divisor, epsilon):
    """Divide ``dividend`` by ``divisor``, falling back to ``epsilon`` on zero.

    Parameters
    ----------
    dividend : number
        Numerator of the division.
    divisor : number
        Denominator of the division.
    epsilon : number
        Replacement denominator, used only when ``divisor == 0``.

    Returns
    -------
    number
        ``dividend / epsilon`` when ``divisor`` equals zero, otherwise
        ``dividend / divisor``.
    """
    if divisor == 0:
        # Avoid a ZeroDivisionError by dividing by the caller-supplied epsilon.
        return dividend / epsilon
    return dividend / divisor


def confidence_interval(X, hyp, y, alpha=.95):
    """Compute a Wilson confidence interval for the hit rate of ``hyp`` on ``X``.

    Parameters
    ----------
    X : array-like exposing ``shape``
        Samples passed to ``hyp.predict``; ``X.shape[0]`` is used as the
        number of trials.
    hyp : object with a ``predict`` method
        Hypothesis (classifier) whose predictions are evaluated.
    y : array-like
        Expected labels, compared element-wise against the predictions.
    alpha : float, default=.95
        Confidence level. ``1 - alpha`` is forwarded to
        ``proportion_confint`` as its ``alpha`` argument.

    Returns
    -------
    li, hi : tuple
        Lower and upper bounds returned by ``proportion_confint`` with
        ``method="wilson"``.
    """
    data = hyp.predict(X)

    # A "success" is a prediction that matches the expected label.
    successes = np.count_nonzero(data == y)
    trials = X.shape[0]
    li, hi = proportion_confint(successes, trials, alpha=1 - alpha, method="wilson")
    return li, hi



def choice_with_proportion(predictions, class_predicted, proportion, extra=0):
Expand Down Expand Up @@ -69,6 +64,27 @@ def is_int(x):
return isinstance(x, (int, np.integer)) and not isinstance(x, bool)


def mode(y):
    """Calculate the column-wise mode of a collection of values.

    Parameters
    ----------
    y : array-like of shape (n_estimators, n_samples)
        Array of values. The mode is computed down each column (axis 0), so
        every column produces exactly one entry in the outputs.

    Returns
    -------
    mode : ndarray of shape (n_samples,)
        Mode of each column. When several values tie, the one placed in the
        first row of ``DataFrame.mode`` is kept.
    count : ndarray of shape (n_samples,)
        Number of occurrences of the most frequent value of each column.
    """
    array = pd.DataFrame(np.array(y))
    # ``DataFrame.mode`` can return several rows on ties; keep only row 0.
    modes = array.mode(axis=0).loc[0, :]
    counts = array.apply(lambda column: column.value_counts().max())
    return modes.values, counts.values


def check_n_jobs(n_jobs):
"""Check `n_jobs` parameter according to the scikit-learn convention.
From sktime: BSD 3-Clause
Expand Down Expand Up @@ -101,9 +117,10 @@ def calc_number_per_class(y_label):
number_per_class = dict()
for c in classes:
number_per_class[c] = math.ceil(proportion[c] * factor)

return number_per_class


def check_classifier(base_classifier, can_be_list=True, collection_size=None):

if base_classifier is None:
Expand All @@ -114,7 +131,7 @@ def check_classifier(base_classifier, can_be_list=True, collection_size=None):
raise AttributeError(f"base_classifier is a list of classifiers, but its length ({len(base_classifier)}) is different from expected ({collection_size})")
for i, bc in enumerate(base_classifier):
base_classifier[i] = check_classifier(bc, False)
return list(base_classifier) # Transform to list
return list(base_classifier) # Transform to list
else:
if not isinstance(base_classifier, ClassifierMixin):
raise AttributeError(f"base_classifier must be a ClassifierMixin, but found {type(base_classifier)}")
Expand Down
Loading

0 comments on commit 21b3d2c

Please sign in to comment.