In [1]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


def prepare_rice_data():
    path = kagglehub.dataset_download("seymasa/rice-dataset-gonenjasmine")
    df = pd.read_csv(path + "/Rice-Gonen andJasmine.csv")
    df = df.drop(["id"], axis=1)
    df["Class"], _ = pd.factorize(df["Class"])

    indices = np.arange(len(df["Class"].to_numpy()))
    np.random.shuffle(indices)

    X = df.drop(["Class"], axis=1).to_numpy()[indices[:1000]]
    y = df["Class"].to_numpy()[indices[:1000]]

    return X, y


X, y = prepare_rice_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
path = kagglehub.dataset_download("seymasa/rice-dataset-gonenjasmine")
df = pd.read_csv(path + "/Rice-Gonen andJasmine.csv")
df




Unnamed: 0,id,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
0,1,4537,92.229316,64.012769,0.719916,4677,76.004525,0.657536,273.085,0.764510,1.440796,jasmine
1,2,2872,74.691881,51.400454,0.725553,3015,60.471018,0.713009,208.317,0.831658,1.453137,jasmine
2,3,3048,76.293164,52.043491,0.731211,3132,62.296341,0.759153,210.012,0.868434,1.465950,jasmine
3,4,3073,77.033628,51.928487,0.738639,3157,62.551300,0.783529,210.657,0.870203,1.483456,jasmine
4,5,3693,85.124785,56.374021,0.749282,3802,68.571668,0.769375,230.332,0.874743,1.510000,jasmine
...,...,...,...,...,...,...,...,...,...,...,...,...
18180,18181,5853,148.624571,51.029281,0.939210,6008,86.326537,0.498594,332.960,0.663444,2.912535,Gonen
18181,18182,7585,169.593996,58.141659,0.939398,7806,98.272692,0.647461,385.506,0.641362,2.916910,Gonen
18182,18183,6365,154.777085,52.908085,0.939760,6531,90.023162,0.561287,342.253,0.682832,2.925396,Gonen
18183,18184,5960,151.397924,51.474600,0.940427,6189,87.112041,0.492399,343.371,0.635227,2.941216,Gonen


In [2]:
from sklearn.tree import DecisionTreeClassifier


class RandomSubspaceMethod:
    def __init__(
        self,
        BaseEstimator=DecisionTreeClassifier,
        n_estimators=10,
        subspace_size=0.5,
        random_state=None,
    ):
        self.BaseEstimator = BaseEstimator
        self.n_estimators = n_estimators
        self.subspace_size = subspace_size
        self.random_state = random_state
        self.estimators = []
        self.feature_indices = []

    def fit(self, X: np.ndarray | pd.DataFrame, y: np.ndarray):
        _n_samples, n_features = X.shape
        n_subspace_features = max(1, int(n_features * self.subspace_size))

        rng = np.random.RandomState(self.random_state)

        for i in range(self.n_estimators):
            feature_indices = rng.choice(n_features, n_subspace_features, replace=False)
            self.feature_indices.append(feature_indices)

            estimator = self.BaseEstimator()
            estimator.fit(X[:, feature_indices], y)
            self.estimators.append(estimator)

        return self

    def predict(self, X):
        predictions = np.zeros((X.shape[0], len(self.estimators)))

        for estimator_idx, (estimator, features) in enumerate(
            zip(self.estimators, self.feature_indices)
        ):
            predictions[:, estimator_idx] = estimator.predict(X[:, features])

        return np.array([np.bincount(row.astype(int)).argmax() for row in predictions])


rsm = RandomSubspaceMethod()

rsm.fit(X_train, y_train)


<__main__.RandomSubspaceMethod at 0x7f7065333710>

In [3]:
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import sklearn


def perform_cross_validation(
    X: np.ndarray,
    y: np.ndarray,
    model: sklearn.base.ClassifierMixin,
    n_splits: int = 5,
    random_state: int = 0,
) -> dict:
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    cv_scores = []

    for train_index, test_index in kf.split(X):
        X_cv_train, X_cv_test = X[train_index], X[test_index]
        y_cv_train, y_cv_test = y[train_index], y[test_index]

        model.fit(X_cv_train, y_cv_train)

        y_cv_pred = model.predict(X_cv_test)
        f1 = f1_score(y_cv_test, y_cv_pred)
        cv_scores.append(f1)

    results = {"scores": cv_scores, "mean_score": np.mean(cv_scores)}

    return results


perform_cross_validation(
    X,
    y,
    rsm,
)


{'scores': [0.9834254143646409, 1.0, 1.0, 1.0, 1.0],
 'mean_score': 0.9966850828729281}

In [4]:
# Эталонная реализация Random Subspace Method с использованием scikit-learn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import time


start_time = time.time()

n_estimators = 10
max_features = 0.5
rsm_sklearn = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=n_estimators,
    max_features=max_features,
    bootstrap=False,
    bootstrap_features=True,
    random_state=0,
)
rsm_sklearn.fit(X_train, y_train)
perform_cross_validation(
    X,
    y,
    rsm_sklearn,
)


{'scores': [0.994413407821229,
  1.0,
  0.9704142011834319,
  1.0,
  0.9885057471264368],
 'mean_score': 0.9906666712262195}

In [5]:
%timeit rsm_sklearn.fit(X_train, y_train)


16.2 ms ± 71 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [6]:
%timeit rsm.fit(X_train, y_train)


10.8 ms ± 95.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
