In [1]:
# Read in data with an optional subset to shorten testing time
# If reshape=True, return 2d array values
def load_dataset(filename, cramer_coef, target, var_count, row_count, reshape=False):
    df = pd.read_csv(filename, index_col=0, header=0, nrows=row_count)
    
    # split into input (X) and output (y) variables
    X = df.drop(target, axis=1, inplace=False)
    
    # order columns by Cramer coeffs
    cramer_df = pd.read_csv(cramer_coef, sep='\t', header=None)
    cols = cramer_df.iloc[:, 0].tolist()
    X = X[cols]
    y = df[target]
    
    # Reduce # of variables
    if var_count < X.shape[1]:
        X = X.iloc[:, 0:var_count]
        
    # reshape target to be a 2d array, if flag set
    if reshape:
        y = df[target].values
        y = y.reshape((len(y), 1))
        X = X.values

    return X, y 

In [2]:
%%time
# evaluate weighted svm with calibrated probabilities for imbalanced classification
import pandas as pd
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
# load dataset
X, y = load_dataset('/MFMDatasets/MFM_bopf/data/csl/CSL_tl_PI_binned.csv', 
                    '/MFMDatasets/MFM_bopf/data/csl/CramerTheil/Cramer_PI_Tl_coeff_ALL.csv', 
                    'trans_loss', 50, 200000, reshape=False)
print(f'X.shape = {X.shape}; y.shape = {y.shape}')

# define model
model = SVC(gamma='scale', class_weight='balanced', C=0.1, kernel='linear')
#model = RandomForestClassifier(n_estimators=128, max_depth=40, min_samples_leaf=1, min_samples_split=2, class_weight='balanced', C=0.1, kernel='linear')
# wrap the model
calibrated = CalibratedClassifierCV(model, method='isotonic', cv=2)
#calibrated = CalibratedClassifierCV(model, method='sigmoid', cv=3)
# define evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)
#cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)
# evaluate model
scores = cross_val_score(calibrated, X, y, scoring='roc_auc', cv=cv, n_jobs=6)
# summarize performance
print('Mean ROC AUC: %.3f' % mean(scores))

X.shape = (185413, 50); y.shape = (185413,)
Mean ROC AUC: 0.700
CPU times: user 4.17 s, sys: 1.12 s, total: 5.29 s
Wall time: 38min 53s


In [3]:
scores

array([0.68897299, 0.67377507, 0.74288295, 0.72350062, 0.67451299,
       0.6937483 ])

In [4]:
proba = calibrated.predict_proba(X)
prob1 = probs[:,1]

NotFittedError: This CalibratedClassifierCV instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve
%matplotlib inline

plt.rcParams["figure.figsize"] = (9, 7)

brier_score = brier_score_loss(y, prob1)
plot_name = 'Brier_score = {:.2f}'.format(brier_score)

fop, mpv = calibration_curve(y, prob1, n_bins=20)
plt.plot(mpv, fop, label=plot_name)
plt.xlabel('Mean Predicted Value')
plt.ylabel('Fraction of Positives')
plt.grid(True)