In [None]:
#Installing the modAL library
!pip install -qq modAL

In [None]:
#Importing necessary libraries 

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline

import modAL 
from modAL.models import ActiveLearner, CommitteeRegressor
from modAL.uncertainty import uncertainty_sampling
from modAL.disagreement import max_std_sampling,KL_max_disagreement,max_disagreement_sampling

# import catboost as cb
from sklearn.model_selection import KFold, RandomizedSearchCV,GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_classification
from sklearn.metrics import mean_absolute_percentage_error

import seaborn as sns
from scipy.stats import randint, uniform 

# Seed NumPy's global RNG; np.random.choice in the random-sampling baseline
# further down draws from it, so this fixes the order in which points are picked.
np.random.seed(42)
import random
# Seed Python's built-in RNG as well.
random.seed(0)

In [None]:
import warnings
from sklearn.exceptions import DataConversionWarning
# Silence scikit-learn's DataConversionWarning for the rest of the notebook
# (presumably triggered by labels being passed as column vectors — confirm).
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
from probml_utils import savefig,latexify

In [None]:
# Environment variables for figure export; presumably read by probml_utils'
# latexify / savefig helpers (LATEXIFY toggles LaTeX styling, FIG_DIR is the
# output directory) — confirm against the probml_utils version in use.
%env LATEXIFY=1
%env FIG_DIR = figures

In [None]:
# Apply probml_utils' figure styling with a 3 x 2 figure size
# (units are presumably inches — confirm).
latexify(fig_width=3,fig_height=2)

In [None]:
# Load the feature matrix and the matching labels from the working directory.
# NOTE(review): both arrays are indexed by row below, so they are assumed to be
# row-aligned (one sample per row) — confirm against how the files were produced.
X= np.load("FVC_FEATURES_60.npy")
Y= np.load("FVC_LABELS_60.npy")

In [None]:
# Rows removed from both arrays before any splitting. The list is defined once
# so features and labels can never be filtered with different indices.
EXCLUDED_ROWS = [23,55,4,9,52,44,45,33,43,20,1,50]

# NOTE: X and Y are overwritten in place, so re-running this cell without
# re-running the loading cell removes a further 12 rows.
X = np.delete(X, EXCLUDED_ROWS, axis=0)
Y = np.delete(Y, EXCLUDED_ROWS, axis=0)

# The index-based train / test / pool splits rely on row alignment.
assert X.shape[0] == Y.shape[0], "features and labels are no longer row-aligned"

In [None]:
# Fixed row indices of the initial training set and of the held-out test set.
train_idx = [30, 47, 15, 29, 35, 9, 24, 12]
test_idx = [45, 39, 8, 4, 20, 32, 25, 46, 42, 41]
# Every other index in 0..47 (ascending) forms the pool that can be queried.
query_idx = sorted(set(range(48)) - set(train_idx) - set(test_idx))

In [None]:
def mape(model, feat, train_label):
  """Mean absolute percentage error (in percent) of ``model`` on ``feat``.

  Parameters
  ----------
  model : object exposing ``predict(feat)``.
  feat : features passed unchanged to ``model.predict``.
  train_label : array-like of true targets; flattened before use.
      Targets must be non-zero, otherwise the relative error is inf/nan.

  Returns
  -------
  float
      ``100 * mean(|y - y_hat| / |y|)``.
  """
  actual = np.asarray(train_label).reshape(-1)
  # Flatten the predictions as well: an (n, 1) prediction subtracted from a flat
  # (n,) label vector would broadcast to an (n, n) matrix and silently yield a
  # wrong score instead of an error.
  pred = np.asarray(model.predict(feat)).reshape(-1)
  mpe = 100*np.mean(np.abs((actual - pred)/actual))
  return mpe

In [None]:
def loss_fnc(train_label,pred):
    """Mean absolute percentage error (in percent) between labels and predictions.

    Parameters
    ----------
    train_label : array-like of true targets; flattened before use.
        Targets must be non-zero, otherwise the relative error is inf/nan.
    pred : array-like of predictions; flattened before use.

    Returns
    -------
    float
        ``100 * mean(|y - y_hat| / |y|)``.
    """
    actual = np.asarray(train_label).reshape(-1)
    # Flatten predictions too, so an (n, 1) prediction cannot broadcast against
    # the flat labels into an (n, n) matrix and corrupt the score.
    predicted = np.asarray(pred).reshape(-1)
    # Named `error_pct` rather than `mape` to avoid shadowing the sibling helper.
    error_pct = 100*np.mean(np.abs((actual - predicted)/actual))
    return error_pct

from sklearn.metrics import make_scorer
# Wrap the MAPE loss as a scikit-learn scorer. greater_is_better=False tells
# scikit-learn that lower values are better, so the scorer reports the negated
# loss and the hyper-parameter search maximising it minimises MAPE.
loss = make_scorer(loss_fnc, greater_is_better=False)

In [None]:
# Oracle baseline: at every step, try each remaining pool point, tune a random
# forest on (current train set + that point) and keep the point whose tuned
# model reaches the lowest MAPE on the test set.
print("Starting RF")
# Search space for the randomized hyper-parameter search.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': randint(10,50),
    # 1.0 (use all features) replaces the legacy 'auto' alias: for regressors it
    # selects the same number of features, but 'auto' was removed in
    # scikit-learn 1.3 and would make those candidates fail to fit.
    'max_features': [2, 3, 4, 'sqrt', 1.0],
    'min_samples_leaf': randint(1,10),
    'min_samples_split': randint(2,10),
    'n_estimators': randint(10,150)
}
print("params_initialised")
rf = RandomForestRegressor(random_state=0,verbose=0)
print("model_done")
grid_search_rf = RandomizedSearchCV(estimator = rf, param_distributions = param_grid,
                          cv=2,n_jobs = -1, verbose = 0, scoring=loss, n_iter=20,random_state=100)
print("grid_done")


print("Initialising Indexes")
indexAdded = []        # pool indices in the order the oracle picked them
recordedMPE = []       # test MAPE reached after each pick
best_param_dict = {}   # picked index -> tuned estimator that achieved it

# Work on copies: the global train_idx / query_idx describe the *initial* split
# and are needed unchanged by the random-sampling cells that follow.
original_train = list(train_idx)
oracle_pool = list(query_idx)

# The evaluation set never changes, so convert it once outside the loops.
X_oracle_test = np.float32(X[test_idx])
Y_oracle_test = np.float32(Y[test_idx])

while oracle_pool:
    print("Current length of Pool set is = {}".format(len(oracle_pool)))

    # Reset for each round. Start from +inf (not 100) so a winner is found even
    # when every candidate has a MAPE of 100 or more.
    lowest_mpe = np.inf
    idx_best_reduction = None
    final_grid = None

    for datapoint_idx in oracle_pool:
        candidate_train = original_train + [datapoint_idx]

        X_candidate = np.float32(X[candidate_train])
        Y_candidate = np.float32(Y[candidate_train])

        grid_search_rf.fit(X_candidate, Y_candidate)
        best_grid = grid_search_rf.best_estimator_
        grid_mpe = mape(best_grid, X_oracle_test, Y_oracle_test)

        # Strict '<' keeps the first candidate on ties.
        if grid_mpe < lowest_mpe:
            idx_best_reduction = datapoint_idx
            final_grid = best_grid
            lowest_mpe = grid_mpe

    if idx_best_reduction is None:
        # Only reachable when every candidate score is NaN/inf; fail loudly
        # instead of re-using the previous round's winner.
        raise RuntimeError("No pool candidate produced a finite MAPE")

    indexAdded.append(idx_best_reduction)
    recordedMPE.append(lowest_mpe)
    best_param_dict[idx_best_reduction] = final_grid
    print("Best Grid:",final_grid)
    print("Lowest MPE recorded = {}\n".format(lowest_mpe))

    # Move the winning point from the pool into the training set.
    oracle_pool.remove(idx_best_reduction)
    original_train.append(idx_best_reduction)

# Random Sampling

In [None]:
# Materialise the train, test and pool subsets from their index lists
X_train, Y_train = X[train_idx], Y[train_idx]
X_test, Y_test = X[test_idx], Y[test_idx]
X_pool, Y_pool = X[query_idx], Y[query_idx]

In [None]:
def GP_regression_std(regressor, X):
    """Pick the row of ``X`` whose prediction has the largest standard deviation.

    ``regressor.predict(X, return_std=True)`` must return a
    ``(prediction, std)`` pair.

    Returns a 3-tuple: the index of the largest std, the corresponding row of
    ``X``, and the mean of all std values.
    """
    _prediction, std = regressor.predict(X, return_std=True)
    most_uncertain = np.argmax(std)
    average_std = sum(std) / len(std)
    return most_uncertain, X[most_uncertain], average_std

In [None]:
# Single-member committee: one random forest trained on the initial train set.
learner_list_a = [
    ActiveLearner(
        estimator=RandomForestRegressor(random_state=0),
        X_training=X_train,
        y_training=Y_train,
    )
]
committee_a = CommitteeRegressor(learner_list=learner_list_a,query_strategy=GP_regression_std)

# Score of the initial model on the test set, before any pool point is added.
pred, std = committee_a.predict(X[test_idx], return_std=True)
# `mape` takes (model, features, labels); calling it with predictions and labels
# raised a TypeError. `loss_fnc(labels, predictions)` computes the same metric
# directly from the predictions already at hand.
initial_scores_0 = loss_fnc(Y_test, pred)

In [None]:
# Random baseline: move the pool points into the train set one at a time in a
# random order, re-fitting the committee and recording the test MAPE after each
# addition. The whole experiment is repeated 20 times with different orders.
random_scores=[]

# Number of points that can be queried in one run (the full pool).
n_pool = X_pool.shape[0]

for _run in range(20):
    # Start every run from the initial train / pool split. np.concatenate and
    # np.delete below return new arrays, so the originals are never modified.
    X_random_train = X_train
    Y_random_train = Y_train
    X_random_pool = X_pool
    Y_random_pool = Y_pool

    # Index 0 holds the score before any point has been queried.
    scores=[initial_scores_0]

    for _step in range(n_pool):
        # Draw one random position in what is left of the pool
        query_id = np.random.choice(range(X_random_pool.shape[0]), size=1, replace=False)
        # Append the queried point to the train set
        X_random_train = np.concatenate((X_random_train, X_random_pool[query_id]))
        Y_random_train = np.concatenate((Y_random_train, Y_random_pool[query_id]))
        # Re-train the committee on the enlarged train set
        committee_a.fit(X_random_train,Y_random_train)
        # Remove the queried point from the pool. axis=0 is required for the
        # labels too: without it np.delete flattens a 2-D label array and the
        # next concatenate fails on mismatched dimensions.
        X_random_pool = np.delete(X_random_pool, query_id, axis=0)
        Y_random_pool = np.delete(Y_random_pool, query_id, axis=0)
        # Test-set MAPE of the re-trained committee
        predn_, _ = committee_a.predict(X_test, return_std=True)
        scores.append(loss_fnc(Y_test, predn_))

    random_scores.append(scores)

In [None]:
# Stack the per-run score curves (one entry of random_scores per run), then
# take the mean and standard deviation across runs for every step.
random_scores_array = np.array(random_scores)
random_mean = np.array(random_scores_array.mean(axis=0))
random_std = np.array(random_scores_array.std(axis=0))

In [None]:
# Global matplotlib styling for the comparison figure.
p = plt.rcParams
p["axes.grid"] = True
p["grid.color"] = "#999999"
p["grid.linestyle"] = "--"

p["lines.marker"] = "o"
p["lines.markeredgecolor"] = "auto"
p["lines.markerfacecolor"] = "white"
p["lines.markersize"] = 3

# x-axis = number of queried points. The random curve starts at 0 queries (the
# initial model); its length is derived from the data instead of hard-coded.
x = list(range(len(random_mean)))
plt.plot(x,random_mean,color='#4caf50', label="(RF,Random)")
# The shaded band is mean +/- one standard deviation across runs, so it is
# labelled as such (raw string: "\p" is not a valid Python escape).
plt.fill_between(x, random_mean - random_std, random_mean + random_std, color="#4caf50",alpha=0.5, label=r"$\pm 1$ std")
# The first oracle score is recorded after one point has been added, so the
# oracle curve starts at x = 1 to line up with the random curve.
oracle_x = list(range(1, len(recordedMPE) + 1))
plt.plot(oracle_x, recordedMPE,color="#ff7f50", label = "(RF,Oracle)")
plt.xlabel('Number of Points queried')
plt.ylabel('MAPE')
plt.legend()
sns.despine()
savefig("oracle_vs_random")
