import dependencies

In [78]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, fbeta_score, accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve 
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import scipy
from sklearn.utils import resample
from sklearn import preprocessing
import time
import os
import json

### classification_pred_and_metric_profile

In [89]:
class classification_pred_and_metric_profile:
    '''
    Predicts the hard classes for a classification model.
    Uses the cut-off based on selection of metric.
    Stores and saves metrics.

    All files are written below "../output/<timestamp>", prefixed with
    algorithm_name.
    '''
    def __init__(self, model, algorithm_name):
        '''
        model: object whose predict_proba(data) returns a 2-D array; column 1
               is used as the score of the positive class.
        algorithm_name: prefix for every output file name.
        '''
        self.model = model
        self.metric_cutoff_dict = {}
        self.metric = {}
        self.algorithm_name = algorithm_name
        timestr = time.strftime("%Y%m%d-%H%M%S")
        self.output_path = os.path.join("../output", timestr)
        # exist_ok: the timestamp has one-second resolution, so a second
        # instance created within the same second reuses the directory
        # instead of raising FileExistsError.
        os.makedirs(self.output_path, exist_ok=True)

    def get_predictions(self,
                        data):
        '''
        Predict probabilities for data (column 1 of predict_proba).
        '''
        return self.model.predict_proba(data)[:, 1]

    def _record_cutoff_map(self,
                           metric_key,
                           file_stem,
                           cutoffs,
                           scores):
        '''
        Save the (cut-off, score) map of one metric and store its optimum.

        Writes "<algorithm_name>_<file_stem>_cutoff_map.json" and fills
        self.metric_cutoff_dict[metric_key] with {"map", "optimum"}.
        '''
        scores = np.asarray(scores, dtype=float)
        # Plain floats keep the map JSON-serialisable whatever scalar type
        # the metric functions returned.
        cutoff_map = [(float(cutoff), float(score))
                      for cutoff, score in zip(cutoffs, scores)]
        with open(os.path.join(self.output_path,
                               "%s_%s_cutoff_map.json" % (self.algorithm_name,
                                                          file_stem)),
                  "w") as outfile:
            json.dump(cutoff_map, outfile)
        # argmax returns the first maximum, i.e. the smallest cut-off among
        # ties. Comparing a Python list with `==` only worked when the
        # scores happened to be NumPy scalars.
        cutoff_optimum = float(cutoffs[int(np.argmax(scores))])
        self.metric_cutoff_dict[metric_key] = {"map": cutoff_map,
                                               "optimum": cutoff_optimum}

    def get_metric_cutoff(self,
                          x_train,
                          y_train):
        '''
        Estimate optimum metric cut_off for the metric of selection and save map.

        Scans 999 cut-offs between 0.001 and 0.999, scores the thresholded
        training predictions with F-beta (beta=2), accuracy, F1, precision
        and recall, and stores/saves one map plus optimum per metric.
        '''
        # np.asarray accepts pandas objects as well as plain arrays/lists.
        actual = np.asarray(y_train).ravel()
        cutoffs = np.linspace(0.001, 0.999, 999)
        predicted_train = self.get_predictions(x_train)

        # (key in metric_cutoff_dict, file-name stem, scoring function)
        scorers = (
            ("fbetas", "fbeta", lambda y, p: fbeta_score(y, p, beta=2)),
            ("accuracy_scores", "accuracy", accuracy_score),
            ("f1_scores", "f1_score", f1_score),
            ("precision_scores", "precision", precision_score),
            ("recall_scores", "recall", recall_score),
        )
        scores = {key: [] for key, _, _ in scorers}

        for cutoff in cutoffs:
            predicted = (predicted_train > cutoff).astype(int)
            for key, _, scorer in scorers:
                scores[key].append(scorer(actual, predicted))

        for key, file_stem, _ in scorers:
            self._record_cutoff_map(key, file_stem, cutoffs, scores[key])

        with open(os.path.join(self.output_path,
                               "%s_metric_cutoff_dict.json" % self.algorithm_name),
                  "w") as outfile:
            json.dump(self.metric_cutoff_dict, outfile)

    def prediction_hardclasses_metrics(self,
                                       x_data,
                                       y_data,
                                       metric_cutoff_type="accuracy_scores"):
        '''
        Predict hardclasses based on metric cutoff.
        Save the hard-class predictions and return the metrics.

        Returns (fbeta, roc_auc, confusion_matrix, classification_report,
        accuracy, f1score, precision, recall).
        Raises KeyError when no cut-off is stored for metric_cutoff_type.
        '''
        if metric_cutoff_type not in self.metric_cutoff_dict:
            raise KeyError("No cut-off stored for %r; call get_metric_cutoff first"
                           % metric_cutoff_type)
        predicted = self.get_predictions(x_data)
        cutoff = self.metric_cutoff_dict[metric_cutoff_type]["optimum"]
        pred_hardclasses = (predicted > cutoff).astype(int)
        # NOTE: the file name carries no data-split label, so every call
        # overwrites the predictions saved by the previous call.
        np.save(os.path.join(self.output_path,
                             "%s_pred_hardclasses.npy" % self.algorithm_name),
                pred_hardclasses)

        fbeta = fbeta_score(y_data, pred_hardclasses, beta=2)
        cm = confusion_matrix(y_data, pred_hardclasses)
        class_report = classification_report(y_data, pred_hardclasses)
        # AUROC represents the likelihood of the model distinguishing
        # observations from two classes; it is a ranking metric, so it is
        # computed from the probabilities rather than the thresholded classes.
        roc_auc = roc_auc_score(y_data, predicted)
        accuracy = accuracy_score(y_data, pred_hardclasses)
        f1score = f1_score(y_data, pred_hardclasses)
        precision = precision_score(y_data, pred_hardclasses)
        recall = recall_score(y_data, pred_hardclasses)

        return fbeta, roc_auc, cm, class_report, accuracy, f1score, precision, recall

    def call_metric(self,
                    x_train,
                    y_train,
                    x_val,
                    y_val,
                    x_test,
                    y_test):
        '''
        Compute and store the metrics for the train, validation and test
        data, then save them all to "<algorithm_name>_metrics.json".
        '''
        splits = (("train_data", x_train, y_train),
                  ("validation_data", x_val, y_val),
                  ("test_data", x_test, y_test))
        for key, x_data, y_data in splits:
            results = self.prediction_hardclasses_metrics(x_data, y_data)
            self.store_metric(*results, key)

        with open(os.path.join(self.output_path,
                               "%s_metrics.json" % self.algorithm_name),
                  "w") as outfile:
            json.dump(self.metric, outfile)

    def store_metric(self,
                     fbeta,
                     roc_auc,
                     cm,
                     class_report,
                     accuracy,
                     f1score,
                     precision,
                     recall,
                     key):
        '''
        Store the metrics under self.metric[key] and append the
        classification report to "<algorithm_name>_classification_report.txt".
        '''
        self.metric[key] = {
            "fbeta": fbeta,
            "roc_auc_score": roc_auc,
            # ndarray is not JSON-serialisable; keep it as nested lists.
            "confusion_matrix": cm.tolist(),
            "accuracy": accuracy,
            "f1score": f1score,
            "precision": precision,
            "recall": recall,
        }

        with open(os.path.join(self.output_path,
                               "%s_classification_report.txt" % self.algorithm_name),
                  "a+") as outfile:
            print(class_report, file=outfile)
  

#### calling classification_pred_and_metric_profile

In [90]:
# Build the profile for the "rf" model, derive the cut-offs from the
# training data, then compute and save metrics for all three data sets.
output = classification_pred_and_metric_profile(model, "rf")
output.get_metric_cutoff(x_train, y_train)
output.call_metric(x_train, y_train, x_val, y_val, x_test, y_test)

### regression_pred_and_metric_profile

In [None]:
class regression_pred_and_metric_profile:
    '''
    Predicts the target values for a regression model.
    Stores and saves error and fit metrics.

    All files are written below "../output/<timestamp>", prefixed with
    algorithm_name.
    '''
    def __init__(self, model, algorithm_name):
        '''
        model: object whose predict(data) returns the predictions.
        algorithm_name: prefix for every output file name.
        '''
        self.model = model
        self.metric = {}
        self.algorithm_name = algorithm_name
        timestr = time.strftime("%Y%m%d-%H%M%S")
        self.output_path = os.path.join("../output", timestr)
        # exist_ok: the timestamp has one-second resolution, so a second
        # instance created within the same second reuses the directory
        # instead of raising FileExistsError.
        os.makedirs(self.output_path, exist_ok=True)

    def get_predictions(self,
                        data):
        '''
        Predict target values for data.
        '''
        return self.model.predict(data)

    def prediction_and_metrics(self,
                               x_data,
                               y_data):
        '''
        Predict for x_data, save the predictions and compute the metrics.

        Returns (mae, mse, slope, r_value, p_value, std_err, r_squared),
        where slope/r_value/p_value/std_err come from a linear regression
        of the predictions on the actual values.
        '''
        predicted = self.get_predictions(x_data)
        # NOTE: the file name carries no data-split label, so every call
        # overwrites the predictions saved by the previous call.
        np.save(os.path.join(self.output_path,
                             "%s_predicted.npy" % self.algorithm_name),
                predicted)

        mae = mean_absolute_error(y_data, predicted)
        # The `squared` keyword is omitted on purpose: True was the default
        # and newer scikit-learn releases no longer accept it.
        mse = mean_squared_error(y_data, predicted)
        slope, _, r_value, p_value, std_err = scipy.stats.mstats.linregress(y_data,
                                                                            predicted)
        r_squared = r2_score(y_data, predicted)

        return mae, mse, slope, r_value, p_value, std_err, r_squared

    def call_metric(self,
                    x_train,
                    y_train,
                    x_val,
                    y_val,
                    x_test,
                    y_test):
        '''
        Compute and store the metrics for the train, validation and test
        data, then save them all to "<algorithm_name>_metrics.json".
        '''
        splits = (("train_data", x_train, y_train),
                  ("validation_data", x_val, y_val),
                  ("test_data", x_test, y_test))
        for key, x_data, y_data in splits:
            results = self.prediction_and_metrics(x_data, y_data)
            self.store_metric(*results, key)

        with open(os.path.join(self.output_path,
                               "%s_metrics.json" % self.algorithm_name),
                  "w") as outfile:
            json.dump(self.metric, outfile)

    def store_metric(self,
                     mae,
                     mse,
                     slope,
                     r_value,
                     p_value,
                     std_err,
                     r_squared,
                     key):
        '''
        Store the metrics under self.metric[key].

        mse is the mean squared error; its square root is stored as
        "root_mean_squared_error" so that the value matches the key.
        '''
        self.metric[key] = {
            "mean_absolute_error": mae,
            "mean_squared_error": mse,
            "root_mean_squared_error": float(np.sqrt(mse)),
            "slope": slope,
            "r_value": r_value,
            "p_value": p_value,
            "std_err": std_err,
            "r_squared": r_squared,
        }

#### calling regression_pred_and_metric_profile

In [None]:
# call_metric takes the train, validation and test data; leaving out the
# test pair raises a TypeError for the missing positional arguments.
output = regression_pred_and_metric_profile(model, "rf")
output.call_metric(x_train,
                   y_train,
                   x_val,
                   y_val,
                   x_test,
                   y_test)