<a href="https://colab.research.google.com/github/jsaj/dssc/blob/master/DSSC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from glob import glob
import pandas as pd
from pathlib import Path
import numpy as np
import re, sys, itertools, pickle

from warnings import filterwarnings
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# base classifiers
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# import ds techniques
from deslib.dcs import OLA, LCA, Rank, MCB
from deslib.des import KNORAE, KNORAU, KNOP, METADES

from sklearn.preprocessing import MinMaxScaler, StandardScaler

# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.metrics import f1_score, auc, roc_auc_score, precision_score, recall_score, accuracy_score
from scipy import *

# from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

from sklearn.metrics import confusion_matrix

In [None]:
class DSSC():
  """Dynamic Selection Supervised Cross-project defect prediction (DSSC).

    Parameters
    ----------

  References
    ----------

    R. M. Cruz, L. G. Hafemann, R. Sabourin, and G. D. Cavalcanti.
    "DESlib: A dynamic ensemble selection library in Python."
    Journal of Machine Learning Research, vol. 21, no. 8, pp. 1–5, 2020.
  
  """


  def __init__(self, url_dataset, with_PF=True):

    dataset_name = url_dataset.split('/')
    self.dataset_name = dataset_name[len(dataset_name)-1]
    self.url_dataset = url_dataset + '/*'
    # self.pf_has_been_called = False

    dataset_total = []

    for project in glob(self.url_dataset):
      project_name = project.split('/')
      project_name = project_name[len(project_name)-1].split('.csv')[0]
      ds = pd.read_csv(project)

      if 'name' in list(ds.columns):
        ds.pop('name')
        ds['name'] = project_name
      else:
        ds['name'] = project_name
      dataset_total.append(ds)
    
    dataset_total = pd.concat(dataset_total).reset_index(drop=True)

    #project filtering stage
    if with_PF == True:
      dataset_total = self._project_filtering(dataset_total)

    self.dataset_total = dataset_total
    self.train, self.test = [], []
    self.percent_bugs = 0
  
  def _project_filtering(self, dataset_total):

    """Project filtering stage (currently a pass-through).

    The filtering logic of this method is disabled, so the input is returned
    unchanged. A per-project filter (fewer than 100 instances, or less than
    5% of either class) is applied to the training projects in
    ``_target_definition`` instead.

    Parameters
    ----------
    dataset_total : DataFrame
        DataFrame containing all projects of the dataset.

    Returns
    ----------
    dataset_total : DataFrame
        The same DataFrame that was passed in.

    References
    ----------

    S. Herbold,  A. Trautsch,  and J. Grabowski. "A comparative study to
    benchmark cross-project  defect  prediction  approaches". IEEE  Transactions
    on  Software  Engineering, vol. 44, no. 9, pp. 811–833, 2017.

    """
    # The previous (disabled) implementation binarized the first column of
    # each project and dropped, in place, every project with fewer than 100
    # instances or with less than 5% defective / non-defective instances.
    return dataset_total

  def _target_definition(self, target_project, dataset_total):
    """ Selecting the best model to predict the target project

    Parameters
    ----------
    
    target_project : string
        String with name of target project.

    dataset_total : DataFrame
        DataFrame containing all projects for prediction

    Returns
    -------
    train : {DataFrame} of shape (n_samples, n_features)
        The training input samples.

    test : {DataFrame} of shape (n_samples, n_features)
        The target project input samples.

    """
    target_name = target_project.split('.csv')[0]

    if '-' in target_name:
      target_name = target_project.split('-')[0]

    if '.' in target_name:
      target_name = target_project.split('.')[0]

    for character in target_name:
      if character.isdigit():
        target_name = list(re.findall(r'(\w+?)(\d+)', target_name)[0])
        target_name = target_name[0]
        break

    test_data = dataset_total.loc[dataset_total['name'] == target_project]
    test_data = test_data.select_dtypes(exclude=['object']).reset_index(drop=True)
    bugs = list(np.unique(test_data[test_data.columns[0]]))[1:]
    test_data[test_data.columns[0]] = test_data[test_data.columns[0]].replace(bugs, 1)
    
    train_data = dataset_total[~dataset_total['name'].str.contains(target_name)]
    for project_name in list(np.unique(train_data['name'])):
      ds = train_data.loc[train_data['name'] == project_name]
      y = ds[ds.columns[0]]
      bugs = list(np.unique(y))[1:]
      y = y.replace(bugs, 1)
      defective = round((np.count_nonzero(np.array(y) == 1) / len(y)) * 100, 2)
      no_defective = round((np.count_nonzero(np.array(y) == 0) / len(y)) * 100, 2)
      # print(project_name, ' || bugs: ', defective, ' | nobugs: ', no_defective)
      if defective < 5.0 or no_defective < 5.0 or len(ds) < 100:
        # print('Removido do train :', project_name)
        train_data = train_data[train_data['name'] != project_name]
    train_data = train_data.select_dtypes(exclude=['object']).reset_index(drop=True)
    bugs = list(np.unique(train_data[train_data.columns[0]]))[1:]
    train_data[train_data.columns[0]] = train_data[train_data.columns[0]].replace(bugs, 1)

    def Diff(li1, li2):
      return list(set(li1) - set(li2)) + list(set(li2) - set(li1))

    diff_columns = Diff(list(train_data.columns), list(test_data.columns))
    
    if len(diff_columns) > 0:
      for col in diff_columns:
        if col in list(train_data.columns):
          train_data.pop(col)
        elif col in list(test_data.columns):
          test_data.pop(col)

   
    self.test = test_data
    self.train = train_data

    y = self.test[self.test.columns[0]]
    bugs = list(np.unique(y))[1:]
    y = y.replace(bugs, 1)
    self.percent_bugs = round((np.count_nonzero(np.array(y) == 1) / len(y)) * 100, 2)

    # return train, test

  def _calc_popt(self, defective, LOC, effort):
    """Compute the Popt value of a ranked list of instances.

    Parameters
    ----------
    defective : DataFrame
        Ranked instances; the first column holds the labels.

    LOC : string
        Name of the column used as inspection cost.

    effort : number
        Percentage of the summed ``LOC`` that bounds the inspected prefix.

    Returns
    -------
    popt : float
        ``1 - area_O_P / (area_O_P + area_P_R + area_R_W)``, capped at 1.0.
    """
    label_col = defective.columns[0]

    # Prefix of the ranking that fits into the effort budget.
    loc_budget = (effort * defective[LOC].sum())/100
    cutoff = defective[LOC].cumsum().searchsorted(loc_budget)
    inspected = defective[:cutoff]

    inspected_labels = inspected[inspected.columns[0]]
    positive_values = list(np.unique(inspected_labels))[1:]
    inspected_labels = inspected_labels.replace(positive_values, 1)

    # Fraction of the inspected defects found after each 1% of inspected LOC.
    inspected_loc_total = inspected[LOC].sum()
    inspected_loc_cumsum = inspected[LOC].cumsum()
    found_fraction = []
    for percent in range(0, 101):
      loc_limit = (percent * inspected_loc_total)/100
      n_rows = inspected_loc_cumsum.searchsorted(loc_limit)
      prefix = defective[:n_rows]

      if len(prefix) == 0:
        found_fraction.append(0.0)
      elif percent == 100:
        found_fraction.append(1.0)
      else:
        hits = np.count_nonzero(prefix[label_col] == 1)
        found_fraction.append(hits / np.count_nonzero(inspected_labels == 1))

    effort_axis = np.arange(0, 101, 1)

    # Reference curves; judging by the area names these presumably stand for
    # the optimal (O), random (R) and worst (W) orderings.
    ramp_y = np.arange(0.0, 1.01, 0.02)
    ramp_x = np.arange(0, 101, 2)
    zeros, ones, hundreds = [0] * 51, [1] * 51, [100] * 51

    upper_x = np.concatenate([zeros, ramp_x])
    upper_y = np.concatenate([ramp_y, ones])
    lower_x = np.concatenate([ramp_x, hundreds])
    lower_y = np.concatenate([zeros, ramp_y])

    model_area = auc(effort_axis, found_fraction)
    diagonal_area = auc(np.arange(0, 101, 1), np.arange(0.00, 1.01, 0.01))

    area_O_P = auc(upper_x, upper_y) - model_area
    area_P_R = model_area - diagonal_area
    area_R_W = diagonal_area - auc(lower_x, lower_y)

    popt = 1 - (area_O_P/ (area_O_P + area_P_R + area_R_W))
    return 1.0 if popt > 1 else popt
  
  def _calf_IFA(self, defective, LOC, model, scaler, effort):
    X_test = defective.drop(defective.columns[0], axis=1)
    if scaler != None:
      X_test = scaler.transform(X_test)
    y_pred = model.predict(X_test)

    defective['predict'] = y_pred
    effort_instances =  (effort * defective[LOC].sum())/100
    index = defective[LOC].cumsum().searchsorted(effort_instances)
    TargetList = defective[:index]
    y_test = TargetList[TargetList.columns[0]]
    bugs = list(np.unique(y_test))[1:]
    y_test = y_test.replace(bugs, 1) 
    
    IFA = 0
    for pred, true in zip(y_pred, y_test):
      if true == 1 and pred == 1:
        break
      elif true == 0 and pred == 1:
        IFA +=1

    return IFA
  
  def _calc_PIIL_CEL(self, defective, LOC, model, scaler, effort):
    
    if 'predict' in list(defective.columns):
      defective = defective.drop('predict', axis=1)
    X_test = defective.drop(defective.columns[0], axis=1)

    y_test = defective[defective.columns[0]]
    bugs = list(np.unique(y_test))[1:]
    y_test = y_test.replace(bugs, 1)

    if scaler != None:
      X_test = scaler.transform(X_test)
    y_pred = model.predict(X_test)

    defective['predict'] = y_pred
    if effort == 20:
      effort_instances =  (effort * defective[LOC].sum())/100
      index = defective[LOC].cumsum().searchsorted(effort_instances)
    elif effort == 1000:
      index = defective[LOC].cumsum().searchsorted(effort)
    else:
      index = defective[LOC].cumsum().searchsorted(effort)
 
    TargetList = defective[:index]
    
    # print(np.sum(defective[LOC]), np.sum(TargetList[LOC]))
    real_bugs = np.count_nonzero(y_test == 1)
    # PII = len(TargetList)/len(defective)
    PII = np.count_nonzero(TargetList['predict'] == 1)/ len(defective)

    CE = np.count_nonzero(TargetList['predict'] == 1)/ real_bugs

    if PII > 1.0:
      PII = 1.0
    if CE > 1.0:
      CE = 1.0  
    return PII, CE
  
  def _model_evaluating(self, model, scaler):
    """Evaluate a fitted model on ``self.test``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict`` and ``predict_proba``.

    scaler : object or None
        Fitted scaler applied to the features before predicting.

    Returns
    -------
    NPM : list
        Non-effort-aware measures ``[F1, AUC, PF]``.

    EPM : list
        Effort-aware measures
        ``[IFA, PII20, PII1000, PII2000, CE20, CE1000, CE2000, Popt]``.

    Notes
    -----
    The first column of ``self.test`` holds the labels and the second one is
    used as the LOC / effort column. ``self.test`` is not modified.

    NOTE(review): both rankings put every truly defective instance before
    every clean one (the score only orders instances inside each group), so
    the effort-aware measures use the true labels. Confirm this is intended.
    """
    # Private copy: the helper columns must not leak into self.test.
    test_data = self.test.copy()
    label_col = test_data.columns[0]
    LOC = test_data.columns[1]

    def rank_by(frame, key):
      # Defective rows first, then clean rows, each sorted by `key` descending.
      defective = frame.loc[frame[label_col] == 1].sort_values(by=key, ascending=False)
      clean = frame.loc[frame[label_col] == 0].sort_values(by=key, ascending=False)
      ranked = pd.concat([defective, clean]).reset_index(drop=True)
      return ranked.replace({np.inf: 1.0})

    X_test = test_data.drop(label_col, axis=1)
    if scaler is not None:
      X_test = scaler.transform(X_test)
    test_data['score'] = model.predict_proba(X_test)[:, 1]

    # ---- non-effort-aware measures -------------------------------------
    test_data['score*loc'] = test_data['score'] * test_data[LOC]
    ranked = rank_by(test_data, 'score*loc')

    X_test = ranked.drop([label_col, 'score*loc', 'score'], axis=1)
    y_test = ranked[label_col]
    bugs = list(np.unique(y_test))[1:]
    y_test = y_test.replace(bugs, 1)

    if scaler is not None:
      X_test = scaler.transform(X_test)

    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    F1 = round(f1_score(y_test, y_pred), 5)
    AUC = round(roc_auc_score(y_test, y_prob), 5)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never seen.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    PF = round(fp / (fp + tn), 5)

    # Only the three measures that dynamic_prediction reports.
    NPM = [F1, AUC, PF]

    # ---- effort-aware measures -----------------------------------------
    test_data = test_data.drop('score*loc', axis=1)
    test_data['score/loc'] = test_data['score'] / test_data[LOC]
    DEFECTIVE = rank_by(test_data, 'score/loc')
    DEFECTIVE = DEFECTIVE.drop(['score/loc', 'score'], axis=1)

    IFA = self._calf_IFA(DEFECTIVE, LOC, model, scaler, 20)
    PII20, CE20 = self._calc_PIIL_CEL(DEFECTIVE, LOC, model, scaler, 20)
    PII1000, CE1000 = self._calc_PIIL_CEL(DEFECTIVE, LOC, model, scaler, 1000)
    PII2000, CE2000 = self._calc_PIIL_CEL(DEFECTIVE, LOC, model, scaler, 2000)
    Popt = self._calc_popt(DEFECTIVE, LOC, 20)
    EPM = [IFA, PII20, PII1000, PII2000, CE20, CE1000, CE2000, Popt]

    return NPM, EPM
  
  def _model_building(self, ds,
                     base_estimator,
                     scaler,
                     resample_strategy,
                     dsel_size):
    """ Selecting the best model to predict the target project

    Parameters
    ----------
    DS: object (Default = None)
        The dynamic selection technique to fit and predict the target project.
        
        If None, then the dynamic selection technique is a
        :class:`~deslib.des.KNORAU`.

    base_estimator : object or list of base estimatos (Default = None)
        The base estimator used to generated the pool of classifiers. The base
        base_estimator should support the technique "predict_proba".
        
        If None, then the base estimator is a :class:`GaussianNB` from sklearn
        available on :class:`~sklearn.naive_bayes.GaussianNB`.

    scaler: object or list of scaler algorithms (Default = None)
        The scaler algorithm to transform features by scaling each
        feature to a given range.

    resample_strategy : {'over', 'under', None} (Default = None)
        The algorithm to perform random sampling

        - 'over' will use :class:`RandomOverSampler` from imblearn
          available on :class:`~imblearn.underesample_strategyampling.RandomOverSampler`

        - 'under' will use :class:`RandomUnderSampler` from imblearn
          available on :class:`~imblearn.underesample_strategyampling.RandomUnderSampler`

        - None, will not use algorithm to perform random sampling.   

    Returns
      -------
      NPM : list
          list with the array_npm values of non-effort-aware array_npm
          measures.
    
    """
    train_data = self.train
    X = train_data.drop(train_data.columns[0], axis=1)
    y = train_data[train_data.columns[0]]

    bugs = list(np.unique(y))[1:]
    y = y.replace(bugs, 1)

    if scaler != None:
      X = scaler.fit_transform(X)

    if dsel_size != None:
      X_train, X_dsel, y_train, y_dsel = train_test_split(X, y, test_size=dsel_size)
    else:
      X_train, y_train = X, y
      X_dsel, y_dsel = X, y 

    if resample_strategy not in ['over', 'smote', None]:
      raise ValueError("Value input is incorrect. Accept only three values: {'over', 'under', None}.")
                        
    if resample_strategy == 'over':
      resample_strategy = RandomOverSampler()
      X_train, y_train = resample_strategy.fit_resample(X_train, y_train)
      X_dsel, y_dsel = resample_strategy.fit_resample(X_dsel, y_dsel)

    elif resample_strategy == 'smote':
      resample_strategy = SMOTE()
      X_train, y_train = resample_strategy.fit_resample(X_train, y_train)
      X_dsel, y_dsel = resample_strategy.fit_resample(X_dsel, y_dsel)

    if base_estimator == None:
      base_estimator = GaussianNB()
    pool_classifiers = BaggingClassifier(base_estimator=base_estimator)
    pool_classifiers.fit(X_train, y_train)
    model = ds.set_params(pool_classifiers=pool_classifiers)
    model.fit(X_dsel, y_dsel)
    
    return model, scaler
  
  def dynamic_prediction(self, dynamic_algorithm=None,
                         base_estimator=None, preprocessing=None,
                         resample_strategy=None,
                         dsel_size=None):
      
    """ Selecting the best model to predict the target project

    Parameters
    ----------

    DS: object (Default = None)
        The dynamic selection technique to fit and predict the target project.
        
        If None, then the dynamic selection technique is a
        :class:`~deslib.des.KNORAU`.

    base_estimator : object or list of base estimatos (Default = None)
        The base estimator used to generated the pool of classifiers. The base
        base_estimator should support the technique "predict_proba".
        
        If None, then the base estimator is a :class:`GaussianNB` from sklearn
        available on :class:`~sklearn.naive_bayes.GaussianNB`.

    preprocessing: object or list of scaler algorithms (Default = None)
        The scaler algorithm to transform features by scaling each
        feature to a given range.

    resample_strategy : {'over', 'under', None} (Default = None)
        The algorithm to perform random sampling

        - 'over' will use :class:`RandomOverSampler` from imblearn
          available on :class:`~imblearn.underesample_strategyampling.RandomOverSampler`

        - 'under' will use :class:`RandomUnderSampler` from imblearn
          available on :class:`~imblearn.underesample_strategyampling.RandomUnderSampler`

        - None, will not use algorithm to perform random sampling.

    dsel_size : float (Default = None)
        The strategy to division of training data into TRAIN and DSEL
        
        If float, should be between 0.2 and 0.5 and represent the proportion of
        the training dataset to include in the DSEL split. If None, TRAIN and
        DSEL will receive all instances of training data.

    Returns
    -------
    best_model : object
        The best model to predict the target project.

    best_scaler : object
        The best scaler algorithm to transform features by scaling.

    Note: if  best_scaler = None, then doesn't use any pre-processing algorithm.
    """ 

 

    if dynamic_algorithm == None:
      dynamic_algorithm = [KNORAU()]
    
    if type(base_estimator) != list and base_estimator != None:
      base_estimator = [base_estimator]
    elif base_estimator == None:
      base_estimator = [LogisticRegression(solver='liblinear')]

    if preprocessing == None:
      preprocessing = [preprocessing]
    elif type(preprocessing) != list and preprocessing != None:
      preprocessing = [preprocessing]

    if resample_strategy == None:
      resample_strategy = [resample_strategy]
    elif type(resample_strategy) != list and resample_strategy != None:
      resample_strategy = [resample_strategy]

    if (type(dsel_size) == float and dsel_size < 0.2) or (type(dsel_size) == float and dsel_size > 0.5):
      raise ValueError('Value inputed for dsel_size is invalid. Accepts only float between 0.2 and 0.5 or None.')
    elif dsel_size != None and type(dsel_size) != float:
      raise ValueError('Value inputed for dsel is invalid. Accepts only float between 0.2 and 0.5 or None.')

    NPM, EPM = [], []
    projetos_preditos = ['szybkafucha', 'termoproject', 'tomcat', 'velocity-1.4', 'velocity-1.5', 'velocity-1.6', 'workflow', 'wspomaganiepi']
    list_projects = list(np.unique(self.dataset_total['name']))
    print(list_projects)
    aux = []
    performance_NPM, performance_EPM, = [], []
    for target_project in list_projects:
      
      if target_project in projetos_preditos:
        print(target_project)
        for ds in dynamic_algorithm:
          string_ds = str(type(ds))
          string_ds = string_ds.split("'")[1]
          string_ds = string_ds.split(".")

          name_ds = string_ds[len(string_ds)-1]
          print(name_ds)
          if 'deslib' not in string_ds:
            raise ValueError('Input dynamic selection technique invalid!')
          epm_data = []
          for classifier in base_estimator:
            for s in preprocessing:
              for resampling in resample_strategy:
                self._target_definition(target_project, self.dataset_total)
                model, scaler = self._model_building(ds=ds, base_estimator=classifier,
                                                    scaler=s, resample_strategy=None,
                                                    dsel_size=dsel_size)
                array_npm, array_epm = self._model_evaluating(model=model, scaler=scaler)
                
                array_npm.insert(0, self.dataset_name)
                array_npm.insert(1, target_project)
                array_npm.insert(2, self.percent_bugs)
                array_npm.insert(3, name_ds)
                array_npm.insert(4, scaler)
                # array_npm.insert(5, resampling)
                cols = ['Dataset', 'Project', 'Percent_Bugs', 'DS', 'scaler',  'f1', 'auc', 'pf']
                array_npm = pd.DataFrame([array_npm], columns=cols)         
                performance_NPM.append(array_npm)
                ds_data.append(array_npm)
                aux.append(array_npm)

                array_epm.insert(0, self.dataset_name)
                array_epm.insert(1, target_project)
                array_epm.insert(2, self.percent_bugs)
                array_epm.insert(3, name_ds)
                array_epm.insert(4, s)
                array_epm.insert(5, resampling)
                cols = ['Dataset', 'Project', 'Percent_Bugs', 'DS', 'scaler', 'resampling','IFA', 'PII20', 'PII1000', 'PII2000', 'CE20', 'CE1000', 'CE2000', 'Popt']
                array_epm = pd.DataFrame([array_epm], columns=cols) 
                epm_data.append(array_epm)
                performance_EPM.append(array_epm)
          epm_data = pd.concat(epm_data).reset_index(drop=True)
          # epm_data.to_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/EPM/{}/{}_EPM_{}_{}.csv'.format(self.dataset_name, self.dataset_name, name_ds, target_project), index=False)

    performance_NPM = pd.concat(performance_NPM).sort_values(by='Percent_Bugs').reset_index(drop=True)
    dict_npm = dict()
    for metric in list(['f1', 'auc', 'pf']):
      array = []
      project_bugs = []
      for project in list(np.unique(performance_NPM['Project'])):
        p_data = performance_NPM.loc[performance_NPM['Project'] == project]
        metric_data = p_data[metric]
        if metric == 'pf':
          max = np.min(metric_data)
        else:
          max = np.max(metric_data)  
        array.append(max)
        aux = [project, list(np.unique(p_data['Percent_Bugs']))[0]]
        project_bugs.append(pd.DataFrame([aux], columns=['Project', '%']))
      dict_npm[metric] = array
    project_bugs = pd.concat(project_bugs).reset_index(drop=True)

    f1 = pd.DataFrame(dict_npm['f1'], columns=['f1'])
    auc = pd.DataFrame(dict_npm['auc'], columns=['auc'])
    pf = pd.DataFrame(dict_npm['pf'], columns=['pf'])
    # precision = pd.DataFrame(dict_npm['precision'], columns=['precision'])
    # recall = pd.DataFrame(dict_npm['recall'], columns=['recall'])
    # accuracy = pd.DataFrame(dict_npm['accuracy'], columns=['accuracy'])
    
    # projects = list(np.unique(NPM['Project']))
    NPM = pd.concat([f1, auc, pf, precision, recall, accuracy], axis=1)
    # NPM.insert(0, 'Project', projects)

    NPM = pd.concat([project_bugs, NPM], axis=1).reindex(project_bugs.index)
  
    performance_EPM = pd.concat(performance_EPM).reset_index(drop=True)

    dict_epm = dict()
    for metric in list(['IFA', 'PII20', 'PII1000', 'PII2000', 'CE20', 'CE1000', 'CE2000', 'Popt']):
      array = []
      project_bugs = []
      for project in list(np.unique(performance_EPM['Project'])):
        p_data = performance_EPM.loc[performance_EPM['Project'] == project]
        metric_data = p_data[metric]
        if metric in ['IFA', 'PII20', 'PII1000', 'PII2000']:
          max = np.min(metric_data)
        else:
          max = np.max(metric_data)  
        array.append(max)
        aux = [project, list(np.unique(p_data['Percent_Bugs']))[0]]
        project_bugs.append(pd.DataFrame([aux], columns=['Project', '%']))
      dict_epm[metric] = array
    project_bugs = pd.concat(project_bugs).reset_index(drop=True)

    IFA = pd.DataFrame(dict_epm['IFA'], columns=['IFA']).reset_index(drop=True)
    PII20 = pd.DataFrame(dict_epm['PII20'], columns=['PII20']).reset_index(drop=True)
    PII1000 = pd.DataFrame(dict_epm['PII1000'], columns=['PII1000']).reset_index(drop=True)
    PII2000 = pd.DataFrame(dict_epm['PII2000'], columns=['PII2000']).reset_index(drop=True)
    CE20 = pd.DataFrame(dict_epm['CE20'], columns=['CE20']).reset_index(drop=True)
    CE1000 = pd.DataFrame(dict_epm['CE1000'], columns=['CE1000']).reset_index(drop=True)
    CE2000 = pd.DataFrame(dict_epm['CE2000'], columns=['CE2000']).reset_index(drop=True)
    Popt = pd.DataFrame(dict_epm['Popt'], columns=['Popt']).reset_index(drop=True)

    EPM = pd.concat([IFA, PII20, PII1000, PII2000, CE20, CE1000, CE2000, Popt], axis=1).reindex(project_bugs.index)
    EPM = pd.concat([project_bugs, EPM], axis=1).reindex(project_bugs.index)

    return NPM, EPM

In [None]:
!pip install deslib
!git clone https://github.com/jsaj/dssc.git

Collecting deslib
  Downloading DESlib-0.3.5-py3-none-any.whl (158 kB)
[?25l[K     |██                              | 10 kB 21.4 MB/s eta 0:00:01[K     |████▏                           | 20 kB 15.7 MB/s eta 0:00:01[K     |██████▏                         | 30 kB 10.2 MB/s eta 0:00:01[K     |████████▎                       | 40 kB 8.5 MB/s eta 0:00:01[K     |██████████▎                     | 51 kB 4.2 MB/s eta 0:00:01[K     |████████████▍                   | 61 kB 4.7 MB/s eta 0:00:01[K     |██████████████▍                 | 71 kB 4.5 MB/s eta 0:00:01[K     |████████████████▌               | 81 kB 4.3 MB/s eta 0:00:01[K     |██████████████████▌             | 92 kB 4.8 MB/s eta 0:00:01[K     |████████████████████▋           | 102 kB 4.2 MB/s eta 0:00:01[K     |██████████████████████▊         | 112 kB 4.2 MB/s eta 0:00:01[K     |████████████████████████▊       | 122 kB 4.2 MB/s eta 0:00:01[K     |██████████████████████████▉     | 133 kB 4.2 MB/s eta 0:00:01[K  

In [None]:
# Experiment driver: evaluates every configuration on each dataset directory
# and prints the aggregated effort-aware measures.
filterwarnings('ignore')

# Other dataset directories used previously:
#   '/content/ml/Datasets/AEEEM', '/content/ml/Datasets/NASA'
datasets = ['/content/ml/Datasets/PROMISE']

dynamic_algorithm = [
    KNORAE(), KNORAU(), KNOP(), METADES(),
    LCA(), OLA(), MCB(), Rank(),
]

base_estimator = [
    LogisticRegression(solver='liblinear'),
    RandomForestClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
]

preprocessing = [None, MinMaxScaler(), StandardScaler()]

resample_strategy = [None, 'over']

for data in datasets:
  NPM, EPM = [], []
  dataset_name = data.split('/')[-1]
  model = DSSC(data, with_PF=True)

  npm, epm = model.dynamic_prediction(dynamic_algorithm=dynamic_algorithm,
                                      base_estimator=base_estimator,
                                      preprocessing=preprocessing,
                                      resample_strategy=resample_strategy)

  # The non-effort-aware results (npm) can be saved with
  #   npm.to_csv('/content/sample_data/DSSC-{}.csv'.format(dataset_name))

  # IFA is summed over the projects; every other measure is averaged.
  print('IFA: ', round(np.sum(epm['IFA']), 3))
  for metric in ('PII20', 'PII1000', 'PII2000', 'CE20', 'CE1000', 'CE2000', 'Popt'):
    print('{}: '.format(metric), round(np.mean(epm[metric]), 3))

['ant-1.3', 'ant-1.4', 'ant-1.5', 'ant-1.6', 'ant-1.7', 'arc', 'berek', 'camel-1.0', 'camel-1.2', 'camel-1.4', 'camel-1.6', 'ckjm', 'e_learning', 'forrest-0.7', 'ivy-1.1', 'ivy-1.4', 'ivy-2.0', 'jedit-3.2', 'jedit-4.0', 'jedit-4.1', 'jedit-4.2', 'jedit-4.3', 'kalkulator', 'log4j-1.0', 'log4j-1.1', 'log4j-1.2', 'lucene-2.0', 'lucene-2.2', 'lucene-2.4', 'nieruchomosci', 'pbeans-1', 'pbeans-2', 'pdftranslator', 'poi-1.5', 'poi-2.0', 'poi-2.5', 'poi-3.0', 'redaktor', 'serapion', 'skarbonka', 'sklebagd', 'synapse-1.0', 'synapse-1.1', 'synapse-1.2', 'systemdata', 'szybkafucha', 'termoproject', 'tomcat', 'velocity-1.4', 'velocity-1.5', 'velocity-1.6', 'workflow', 'wspomaganiepi', 'xalan-2.4', 'xalan-2.5', 'xalan-2.6', 'xalan-2.7', 'xerces-1.2', 'xerces-1.3', 'xerces-1.4', 'xerces-init', 'zuzel']
szybkafucha
KNORAE
KNORAU
KNOP
METADES
LCA
OLA
MCB
Rank
termoproject
KNORAE
KNORAU
KNOP
METADES
LCA
OLA
MCB
Rank
tomcat
KNORAE
KNORAU
KNOP
METADES
LCA
OLA
MCB
Rank
velocity-1.4
KNORAE
KNORAU
KNOP
META

KeyboardInterrupt: ignored

In [None]:
# Aggregate the saved effort-aware results: best value per project, then the
# overall summary across projects.
# NOTE(review): the directory component before '/*' is blank in this pattern
# (presumably a dataset name is missing) -- confirm the intended folder.
epm_files = glob('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/EPM/  /*')
if not epm_files:
  raise FileNotFoundError('No EPM result files matched the glob pattern.')

data = [pd.read_csv(epm_file) for epm_file in epm_files]
performance_EPM = pd.concat(data).reset_index(drop=True)

# Optional filters used in earlier runs:
# minmax = performance_EPM.loc[performance_EPM['scaler'] == 'MinMaxScaler()']
# stantard =  performance_EPM.loc[performance_EPM['scaler'] == 'StandardScaler()']
# performance_EPM = pd.concat([minmax, stantard])
# performance_EPM = performance_EPM.loc[performance_EPM['resampling'] != 'over']

# Lower is better for IFA / PII, higher is better for CE / Popt.
epm_aggregation = {'Percent_Bugs': 'min',
                   'IFA': 'min', 'PII20': 'min', 'PII1000': 'min', 'PII2000': 'min',
                   'CE20': 'max', 'CE1000': 'max', 'CE2000': 'max', 'Popt': 'max'}

# groupby keeps one row per project (sorted by name) without rebinding the
# builtin `max` at module level.
epm = performance_EPM.groupby('Project', sort=True).agg(epm_aggregation).reset_index()
epm = epm.rename(columns={'Percent_Bugs': '%'})

print('IFA: ', round(np.sum(epm['IFA']), 3))
for epm_metric in ('PII20', 'PII1000', 'PII2000', 'CE20', 'CE1000', 'CE2000', 'Popt'):
  print('{}: '.format(epm_metric), round(np.mean(epm[epm_metric]), 3))


# IFA: 0
# PII20:  0.052
# PII1000:  0.003
# PII2000:  0.009
# CE20:  0.465
# CE1000:  0.302
# CE2000:  0.408
# Popt:  0.724

# IFA:  0
# PII20:  0.052
# PII1000:  0.003
# PII2000:  0.009
# CE20:  0.613
# CE1000:  0.429
# CE2000:  0.57
# Popt:  0.731

# IFA:  0
# PII20:  0.021
# PII1000:  0.0
# PII2000:  0.001
# CE20:  0.575
# CE1000:  0.138
# CE2000:  0.185
# Popt:  0.747

ValueError: ignored

In [None]:
# Summarize the saved NASA results: best f1 / auc (highest) and pf (lowest)
# per project, averaged over the projects.
ds = pd.read_csv('/content/sample_data/DSSC-NASA.csv')

# Stored in one frame instead of lists named f1 / auc / pf: rebinding the
# global `auc` would hide sklearn's auc() used by DSSC._calc_popt.
best_npm = ds.groupby('Project').agg({'f1': 'max', 'auc': 'max', 'pf': 'min'})

for npm_metric in ('f1', 'auc', 'pf'):
  print('{}: '.format(npm_metric), round(np.mean(best_npm[npm_metric].to_numpy()), 3))

#remove < 100
# f1:  0.226
# auc:  0.729
# pf:  0.015

#remove < 100
# f1:  0.226
# auc:  0.729
# pf:  0.015

In [None]:
# Gather every AEEEM result file found in sample_data into a single frame,
# tagging each row with the label parsed from the file name.
frames = []
for file_path in glob('/content/sample_data/*'):
  if 'AEEEM' not in file_path:
    continue
  # The file name is the 4th '/'-separated component of the path; the label is
  # the 4th '_'-separated field of that name with the '.csv' suffix cut off.
  file_name = file_path.split('/')[3]
  label = file_name.split('_')[3].split('.csv')[0]
  frame = pd.read_csv(file_path)
  frame['name'] = label
  frames.append(frame)
dataset = pd.concat(frames).reset_index(drop=True)
dataset

# One summary row per label: mean and variance, over the projects in `dataset`,
# of the best F1 / best AUC / lowest false-alarm rate reached on each project.
for projeto in ['RELINK']:
  print('------- {} --------'.format(projeto))
  projeto_values = dataset

  # Best value per project. The AUC list is not called `auc` so that the
  # sklearn.metrics.auc function imported at the top of the notebook survives.
  by_project = projeto_values.groupby('Project')
  f1 = by_project['f1'].max().tolist()
  auc_best = by_project['auc'].max().tolist()
  pf = by_project['pf'].min().tolist()

  f1_mean, f1_var = round(np.mean(f1), 3), round(np.var(f1), 3)
  auc_mean, auc_var = round(np.mean(auc_best), 3), round(np.var(auc_best), 3)
  pf_mean, pf_var = round(np.mean(pf), 3), round(np.var(pf), 3)

  # Raw string: '\p' is not a valid escape sequence, so the plain literal
  # triggers an invalid-escape warning. The emitted text is unchanged.
  # NOTE(review): the number after the +/- sign is the variance, not the
  # standard deviation -- confirm that this is what the table should show.
  pm = r'$\pm$'
  array = [projeto,
           str(f1_mean) + pm + str(f1_var),
           str(auc_mean) + pm + str(auc_var),
           str(pf_mean) + pm + str(pf_var)]

  res = pd.DataFrame([array], columns=['Project', 'F1-score', 'AUC', 'False Alarm'])
  print(res)

In [None]:
# Stack every RELINK result file from sample_data and save the combined table.
path = '/content/sample_data/*'
relink = [pd.read_csv(file_path) for file_path in glob(path) if 'RELINK' in file_path]
relink = pd.concat(relink).reset_index(drop=True)
relink.to_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/RELINK.csv', index=False)

In [None]:
# For every project keep one configuration: the one with the highest accuracy,
# ties broken by the lowest 'bsl', remaining ties by first occurrence.
res = []
for project in np.unique(relink['Project']):
  data_project = relink.loc[relink['Project'] == project]
  best_model = data_project.loc[data_project['acc'] == data_project['acc'].max()]
  if len(best_model) > 1:
    # Named lowest_bsl rather than `min` so the builtin min() is not shadowed
    # for the rest of the notebook session.
    lowest_bsl = best_model['bsl'].min()
    best_model = best_model.loc[best_model['bsl'] == lowest_bsl].reset_index(drop=True)
  # Taking the first row is a no-op when only one candidate is left.
  res.append(best_model[:1])
res = pd.concat(res).reset_index(drop=True)

print('F1: ', round(res['f1'].mean(), 2))
print('auc: ', round(res['auc'].mean(), 2))
print('pf: ', round(res['pf'].mean(), 2))
# res.to_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Estatistica/DSSC_{}_by_project.csv'.format(name_data), index=False)

In [None]:
from collections import Counter, OrderedDict

In [None]:
# Win/tie table: for every project of every benchmark, credit the dynamic
# selection (DS) technique(s) with the best F1, the best AUC and the lowest PF.
# A sole winner earns 1 point; when several techniques tie, each earns 0.5.

# ds = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/DSSC_dictory_NPM.csv')

results = dict()
results['promise'] = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/DSSC_PROMISE_NPM.csv')
results['relink'] = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/DSSC_RELINK_NPM.csv')
results['aeeem'] = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/DSSC_AEEEM_NPM.csv')
results['nasa'] = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/DSSC_NASA_NPM.csv')

ds_names = ['KNE', 'KNU', 'METADES', 'KNOP', 'OLA', 'LCA', 'RANK', 'MCB']

# One score board per metric. The AUC board is not called `auc`, so the
# sklearn.metrics.auc function imported at the top of the notebook stays usable.
f1 = dict.fromkeys(ds_names, 0)
auc_wins = dict.fromkeys(ds_names, 0)
pf = dict.fromkeys(ds_names, 0)

dict_ds = {'F1': f1,
           'AUC': auc_wins,
           'PF': pf}

for ds in results.values():
  for m, wins in dict_ds.items():
    for _, p_data in ds.groupby('Project'):
      scores = p_data[m]
      # PF (false alarm) is better when lower; F1 and AUC when higher.
      best_score = scores.min() if m == 'PF' else scores.max()
      best_ds = list(p_data.loc[scores == best_score, 'DS'])

      credit = 1 if len(best_ds) == 1 else 0.5
      for x in best_ds:
        wins[x] = wins[x] + credit

# Rows are the DS techniques, columns the three metrics.
res = pd.DataFrame(dict_ds)
res.columns = ['F1-score', 'AUC', 'False Alarm']
res['Total'] = res[list(res.columns)].sum(axis=1)
res = res.sort_values(by=['Total'], ascending=False)
# The technique names live in the index, so the index has to be written out;
# with index=False the file held the scores without saying whose they are.
res.to_csv('/content/drive/MyDrive/Colab Notebooks/DSSC/Resultados/Win_Tie.csv', index_label='DS')
res
  
# metrics_count['F1'] = f1
# metrics_count['AUC'] = auc
# metrics_count['PF'] = pf

# metrics_count['F1'] = dict(Counter(metrics_count['F1']))
# metrics_count['AUC']  = dict(Counter(metrics_count['AUC']))
# metrics_count['PF'] = dict(Counter(metrics_count['PF']))

# all_df = []
# for i in metrics_count:
#   print(i)
#   d = metrics_count[i]
#   # print(d)
#   print(pd.DataFrame(list(d.items()), columns = ['Method', i]))
  # all_df.append(pd.DataFrame(list(d.items()), columns = ['Method', i]))
  # print(sorted(d.items(), key=lambda x: x[1], reverse=True))

# result = pd.concat([all_df[0], all_df[1], all_df[2]], axis=1)  
# result

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.linear_model import LogisticRegression

# Leave-one-project-out grid: every project in `dataset` is used once as the
# test target while all the others form the training set. Each combination of
# DS technique, base classifier, sampler and scaler is trained and scored.
filterwarnings('ignore')

cols = ['Target', 'DS', 'f1', 'auc', 'pf', 'clf', 'sampling', 'scaler']
rows = []
for target in np.unique(dataset['name']):
  test = dataset.loc[dataset['name'] == target].select_dtypes(exclude=['object']).reset_index(drop=True)
  train = dataset.loc[dataset['name'] != target].select_dtypes(exclude=['object']).reset_index(drop=True)

  # The first non-object column is used as the label, the rest as features.
  X, y = train.drop(train.columns[0], axis=1), train[train.columns[0]]
  X_test, y_test = test.drop(test.columns[0], axis=1), test[test.columns[0]]

  sampling = [None, RandomOverSampler(), SMOTE()]
  print(target)
  for DS in [KNORAU(), METADES(), LCA()]:
    name_ds = type(DS).__name__
    for clf in [LogisticRegression(),
                RandomForestClassifier(),
                GaussianNB(),
                DecisionTreeClassifier()]:

      for sample in sampling:
        for scaler in [None, MinMaxScaler(), StandardScaler()]:
          # Start every configuration from the untouched data. Overwriting
          # X / y / X_test here made each run inherit the scaling and the
          # resampling applied by all previous runs.
          X_fit, y_fit, X_eval = X, y, X_test
          if scaler is not None:
            X_fit = scaler.fit_transform(X_fit)
            X_eval = scaler.transform(X_eval)

          if sample is not None:
            X_fit, y_fit = sample.fit_resample(X_fit, y_fit)

          # The pool and the DS technique are fitted on the same data
          # (no separate DSEL split).
          # The estimator is passed positionally because the keyword was
          # renamed from `base_estimator` to `estimator` in scikit-learn.
          pool = BaggingClassifier(clf)
          pool.fit(X_fit, y_fit)
          model = DS
          model.set_params(pool_classifiers=pool)
          model.fit(X_fit, y_fit)

          y_pred = model.predict(X_eval)
          y_prob = model.predict_proba(X_eval)[:, 1]

          F1 = round(f1_score(y_test, y_pred), 5)
          AUC = round(roc_auc_score(y_test, y_prob), 5)
          tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

          # False-alarm rate: share of clean instances flagged as defective.
          PF = round(fp / (fp + tn), 5)
          rows.append([target, name_ds, F1, AUC, PF, clf, sample, scaler])
res = pd.DataFrame(rows, columns=cols)

In [None]:
import locale
# locale.format() was removed in Python 3.12; format_string() is the
# replacement and prints the same text.
print(locale.format_string("%d", 1255000, grouping=True))

# Per benchmark: one row per project with its size, defect count, defect
# percentage and total effort, plus a totals row, written to a CSV file.
datasets = ['AEEEM', 'NASA', 'PROMISE', 'RELINK']
cols = ['Dataset', 'Project', 'Instances', 'Deffects', '%', 'Effort', 'Effort Metric']
for data_name in datasets:
  path = '/content/ml/Datasets/{}/*'.format(data_name)
  rows = []
  for csv_path in glob(path):
    # File name without the directory part, independent of the path depth.
    name_csv = Path(csv_path).name.split('.csv')[0]
    ds = pd.read_csv(csv_path)
    # Column 0 is used as the label and column 1 as the effort (size) metric.
    label_col, loc_metric = ds.columns[0], ds.columns[1]
    # value_counts() sorts by frequency, so entry 1 is the second most frequent
    # label. NOTE(review): this assumes exactly two classes with the defective
    # one in the minority -- confirm for every dataset.
    bugs = list(ds[label_col].value_counts())[1]
    instances = len(ds)
    percent = round((bugs / instances)*100, 2)
    effort = np.sum(ds[loc_metric])
    print(loc_metric)
    rows.append([data_name, name_csv, instances, bugs, percent, effort, loc_metric])
  data = pd.DataFrame(rows, columns=cols)
  data = data.sort_values(by='Project', ascending=True)
  data.loc['Column_Total'] = data.sum(numeric_only=True, axis=0)
  data.to_csv('/content/sample_data/{}_Instance_Effort.csv'.format(data_name), index=False)

In [None]:
# Popt-style score for one project: files are inspected in ascending size
# order and the resulting effort/defect curve is compared with an optimal, a
# random and a worst curve.

# Earlier cells reuse the name `auc` for result containers, so bind it to the
# area-under-curve function again before calling it.
from sklearn.metrics import auc

LOC = 'CountLineCodeExe'

df = pd.read_csv('/content/ml/Datasets/RELINK/Apache.csv')
df = df.sort_values(by=LOC, ascending=True).reset_index(drop=True)

# Rows that fit into the first 20% of the total effort.
effort = (20 * df[LOC].sum())/100
index = df[LOC].cumsum().searchsorted(effort)
TargetList = df[:index]

effort_percent = np.arange(0, 101, 1)
defective_list = []

# Loop invariants. The loop used to read `target_list`, a name that is never
# defined (the variable is TargetList), and so raised NameError.
target_effort = TargetList[LOC].sum()
target_cumulative = TargetList[LOC].cumsum()
# Column 0 is used as the label; the value 1 is counted as defective.
target_defects = np.count_nonzero(TargetList[TargetList.columns[0]] == 1)

for percent in range(0, 101):
  effort_loc = (percent * target_effort)/100
  index = target_cumulative.searchsorted(effort_loc)
  data = df[:index]

  if len(data) != 0 and target_defects != 0:
    bugs = np.count_nonzero(data[data.columns[0]] == 1)
    defective_list.append(bugs / target_defects)
  else:
    # Nothing inspected yet, or no defect to find at all.
    defective_list.append(0.0)


x = effort_percent
y = defective_list

# linspace guarantees the number of points; a float-step arange does not.
y_a = np.linspace(0.0, 1.0, 51)
x_b = np.arange(0, 101, 2)

x_a, y_b, h = [0] * 51, [1] * 51, [100] * 51

# Optimal curve: straight up at effort 0, then flat at 1.
x1 = np.concatenate([x_a, x_b])
y1 = np.concatenate([y_a, y_b])

# Worst curve: flat at 0, then straight up at effort 100.
x2 = np.concatenate([x_b, h])
y2 = np.concatenate([x_a, y_a])

# Random curve: the diagonal over the same 0..100 effort range as the other
# curves. It was previously evaluated on 0..99 in one place and 0..100 in
# another, and area_P_R was computed twice.
area_random = auc(np.arange(0, 101, 1), np.linspace(0.0, 1.0, 101))
area_predicted = auc(x, y)

area_O_P = auc(x1, y1) - area_predicted
area_P_R = area_predicted - area_random
area_R_W = area_random - auc(x2, y2)

popt = 1 - (area_O_P/ (area_O_P + area_P_R + area_R_W))


print(popt)

Unnamed: 0,isDefective,CountLineCodeExe,AvgCyclomatic,AvgCyclomaticModified,AvgCyclomaticStrict,AvgEssential,AvgLine,AvgLineBlank,AvgLineCode,AvgLineComment,CountLine,CountLineBlank,CountLineCode,CountLineCodeDecl,CountLineComment,CountSemicolon,CountStmt,CountStmtDecl,CountStmtExe,MaxCyclomatic,MaxCyclomaticModified,MaxCyclomaticStrict,RatioCommentToCode,SumCyclomatic,SumCyclomaticModified,SumCyclomaticStrict,SumEssential
0,1,907,12,11,14,4,82,5,61,7,2195,165,1185,191,328,719,916,178,738,51,51,56,0.28,221,197,246,81
1,0,0,0,0,0,0,0,0,0,0,644,1,0,0,16,0,0,0,0,0,0,0,0.00,0,0,0,0
2,1,1096,8,7,9,4,54,4,49,1,1945,209,1607,185,65,19001,1049,172,877,45,26,45,0.04,256,217,277,128
3,0,522,5,5,6,2,37,5,27,5,1159,171,727,137,236,397,526,112,414,25,25,30,0.32,134,122,146,63
4,0,26,2,2,2,1,21,1,9,3,166,15,55,13,40,20,30,10,20,4,5,4,0.73,9,10,9,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,1,354,5,5,6,3,35,3,26,7,1072,126,607,119,280,309,428,107,321,18,18,22,0.46,122,122,148,79
190,0,0,0,0,0,0,0,0,0,0,697,1,0,0,16,0,0,0,0,0,0,0,0.00,0,0,0,0
191,0,205,17,13,21,8,100,7,83,3,375,34,268,49,44,144,196,35,161,46,33,56,0.16,51,38,62,25
192,1,866,5,5,6,2,42,3,29,8,2136,224,1322,289,474,666,962,341,621,29,29,34,0.36,205,191,229,89
