# Goal

* Fix the GenRA classes after upgrading to sklearn 0.24
* Make sure the imblearn package runs correctly

In [1]:
import sys,os,re
from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

os.environ['PYTHONPATH']=os.environ.get('SRC')
if not os.environ.get('SRC') in sys.path: 
    sys.path.insert(0,os.environ.get('SRC'))

%load_ext autoreload
%autoreload 2
%pylab inline
%matplotlib inline


import pandas as pd
import numpy as np
import pylab as pl
import scipy as sp
import seaborn as sns
from scipy import stats
from box import Box
import warnings

from IPython.core.display import display, HTML

warnings.simplefilter('ignore')

D1 = os.environ.get('DAT')+'/shah-2016/'
FIG_DIR = os.environ.get('FIG')

Populating the interactive namespace from numpy and matplotlib


# Create dummy data set

In [2]:
# Authors: Christos Aridas
#          Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

# %%
print(__doc__)

# %% [markdown]
# Let's first create an imbalanced dataset and split in to two sets.

# %%
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_classes=2,
    class_sep=1.25,
    weights=[0.3, 0.7],
    n_informative=3,
    n_redundant=1,
    flip_y=0,
    n_features=5,
    n_clusters_per_class=1,
    n_samples=5000,
    random_state=10,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

Automatically created module for IPython interactive environment


In [3]:
from genra.rax.skl.cls import *


# Test GenRAPredClass 

In [5]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer,f1_score,precision_score,recall_score,roc_auc_score
from genra.rax.skl.cls import *

Scorers = dict(f1_score=make_scorer(f1_score),
              precision_score= make_scorer(precision_score),
              recall_score= make_scorer(recall_score),
              roc_auc=make_scorer(roc_auc_score))

GP = GenRAPredClass(n_neighbors=10,metric='jaccard',n_jobs=-1)


P = pd.DataFrame(cross_validate(GP,X,y,cv=10,scoring=Scorers))
R1 = pd.DataFrame(dict(mn=P.mean(),sd=P.std()))
R1

Unnamed: 0,mn,sd
fit_time,0.002753,0.000315
score_time,0.149711,0.326355
test_f1_score,0.0,0.0
test_precision_score,0.0,0.0
test_recall_score,0.0,0.0
test_roc_auc,0.5,0.0


# Test GenRAPredValue

In [8]:
from genra.rax.skl.reg import GenRAPredValue

In [9]:
from genra.rax.skl.reg import GenRAPredValue
from sklearn.metrics import make_scorer,explained_variance_score,roc_auc_score,r2_score,f1_score,accuracy_score,precision_score,recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

#KNeighborsClassifier(n_neighbors=5, weights=’uniform’, algorithm=’auto’, leaf_size=30, p=2, 
#                     metric=’minkowski’, metric_params=None, n_jobs=1)[source]

GP1 = GenRAPredValue(algorithm='brute')

params={'n_neighbors': range(1,15),
        'metric':['jaccard','euclidean']
       }


Grid3= GridSearchCV(estimator=GP1,param_grid=params,
                    n_jobs=-1,cv=5,
                    verbose=1,
                    scoring=make_scorer(r2_score))


Best3=Grid3.fit(X,y)

AttributeError: 'GenRAPredValue' object has no attribute 'sim_params'

Usage of pipeline embedding samplers
====================================

An example of the :class:~imblearn.pipeline.Pipeline


In [6]:
# %% [markdown]
# Now, we will create each individual steps that we would like later to combine

# %%
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

pca = PCA(n_components=2)
enn = EditedNearestNeighbours()
smote = SMOTE(random_state=0)
knn = KNeighborsClassifier(n_neighbors=1)

# %% [markdown]
# Now, we can finally create a pipeline to specify in which order the different
# transformers and samplers should be executed before to provide the data to
# the final classifier.

# %%
from imblearn.pipeline import make_pipeline

model = make_pipeline(pca, enn, smote, knn)

# %% [markdown]
# We can now use the pipeline created as a normal classifier where resampling
# will happen when calling `fit` and disabled when calling `decision_function`,
# `predict_proba`, or `predict`.

# %%
from sklearn.metrics import classification_report

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       375
           1       1.00      1.00      1.00       875

    accuracy                           0.99      1250
   macro avg       0.99      0.99      0.99      1250
weighted avg       0.99      0.99      0.99      1250

