In [1]:
# importing key packages
import glob
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from src.raman.constants import Label
from src.raman.sample import Sample
from src.raman.sample_builder import SampleBuilder

# Seed Python's RNG so the shuffled train/val/test split is repeatable.
random.seed(42)

In [None]:
def split_train_val_test(sample_list, train_frac, val_frac):
    """Randomly partition ``sample_list`` into train, validation and test lists.

    The samples are shuffled with the module-level ``random`` generator (so a
    prior ``random.seed`` call makes the split repeatable) and then cut into
    three consecutive, non-overlapping pieces.

    Args:
        sample_list: Sequence of samples to split. It is not modified.
        train_frac: Fraction of the samples assigned to the training split.
        val_frac: Fraction of the samples assigned to the validation split.
            Whatever is left after train and val goes to the test split.

    Returns:
        A ``(train, val, test)`` tuple of lists.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
        AssertionError: If a non-zero fraction rounds down to zero samples.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by float error.
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1 + 1e-9:
        raise ValueError('train_frac and val_frac must be >= 0 and sum to <= 1')

    total = len(sample_list)
    train_num = int(total * train_frac)
    val_num = int(total * val_frac)
    assert train_num != 0 or train_frac == 0, 'train fraction too small'
    assert val_num != 0 or val_frac == 0, 'val fraction too small'

    # random.shuffle works in place and returns None, so shuffle a copy
    # instead of using its return value; this also leaves the input untouched.
    shuffled = list(sample_list)
    random.shuffle(shuffled)

    val_end = train_num + val_num
    train_vals = shuffled[:train_num]
    val_vals = shuffled[train_num:val_end]
    # Open-ended slice so the last sample is kept in the test split.
    test_vals = shuffled[val_end:]

    return train_vals, val_vals, test_vals

def make_df(sample_list):
    """Combine the pandas form of every sample into a single DataFrame.

    Args:
        sample_list: Iterable of objects exposing a ``to_pandas()`` method.

    Returns:
        One DataFrame holding all samples' rows under a fresh 0..n-1 index,
        or an empty DataFrame when ``sample_list`` is empty.
    """
    frames = []
    for sample in sample_list:
        frame = sample.to_pandas()
        # NOTE(review): to_pandas() is presumably a DataFrame; a Series is
        # turned into a single row, as DataFrame.append used to do. Confirm.
        if isinstance(frame, pd.Series):
            frame = frame.to_frame().T
        frames.append(frame)

    # pd.concat raises on an empty list, so handle "no samples" explicitly.
    if not frames:
        return pd.DataFrame()

    # Concatenate once: DataFrame.append never modified the frame in place
    # (its result was being discarded) and no longer exists in pandas >= 2.0.
    return pd.concat(frames, ignore_index=True)

def get_X_Y(df, remove_maybe_uncat=True):
    """Extract the feature array and numeric label array from a sample frame.

    Args:
        df: DataFrame with a ``'spectrum'`` column and a ``'labels'`` column
            whose entries expose a ``.value`` attribute.
        remove_maybe_uncat: When True, keep only rows labelled ``Label.GOOD``
            or ``Label.BAD`` and drop every other label.

    Returns:
        ``(X, Y)`` where ``X`` is ``df['spectrum']`` as a NumPy array and ``Y``
        is an array of each label's ``.value``.
    """
    if remove_maybe_uncat:
        # A row is kept when it is GOOD *or* BAD; combining the two tests with
        # "and" can never be true and would discard every row.
        is_good = df['labels'] == Label.GOOD
        is_bad = df['labels'] == Label.BAD
        df = df[is_good | is_bad]

    # NOTE(review): if each 'spectrum' cell holds a whole array, X is a 1-D
    # object array and may need stacking before model fitting. Confirm.
    X = df['spectrum'].to_numpy()
    Y_obj = df['labels'].to_numpy()
    Y = np.array([y.value for y in Y_obj])
    return X, Y

In [None]:
# Set the directory of labeled Raman data, load every NetCDF file in it as a
# Sample, and split the samples into train, val and test sets.
labeled_data_dir = '/data/'

# glob returns paths in filesystem-dependent order; sort them so the seeded
# shuffle inside split_train_val_test yields the same split on every machine.
net_cdf_files = sorted(glob.glob(os.path.join(labeled_data_dir, '*.nc')))
sample_list = [Sample.build_from_netcdf(path) for path in net_cdf_files]

# 60% train, 20% val, and the remaining samples for test.
train_samples, val_samples, test_samples = split_train_val_test(sample_list, 0.6, 0.2)

In [None]:
# Build one dataframe per split (train, val, test), then pull the feature
# and label arrays out of each.
train_df, val_df, test_df = [
    make_df(split_samples)
    for split_samples in (train_samples, val_samples, test_samples)
]

(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    get_X_Y(split_df) for split_df in (train_df, val_df, test_df)
]

In [None]:
# Import the classifiers and calibration utilities used to build the models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
import xgboost as xgb


In [None]:
# Create classifiers
lr = LogisticRegression()
gnb = GaussianNB()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier()


# #############################################################################
# Plot calibration plots
#
# Top panel: reliability curve of each classifier on the test split.
# Bottom panel: histogram of each classifier's predicted scores.

fig = plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
for clf, name in [(lr, 'Logistic'),
                  (gnb, 'Naive Bayes'),
                  (svc, 'Support Vector Classification'),
                  (rfc, 'Random Forest')]:
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        # Column 1 is taken as the positive-class probability.
        prob_pos = clf.predict_proba(X_test)[:, 1]
    else:  # use decision function
        # No probabilities available: min-max scale the raw scores into [0, 1].
        scores = clf.decision_function(X_test)
        score_span = scores.max() - scores.min()
        if score_span > 0:
            prob_pos = (scores - scores.min()) / score_span
        else:
            # All scores are equal; avoid 0/0 and use a neutral 0.5 instead.
            prob_pos = np.full_like(scores, 0.5, dtype=float)
    fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label="%s" % (name, ))

    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

fig.tight_layout()
plt.show()