In [None]:
import pandas as pd
import random
import copy
import warnings
warnings.filterwarnings('ignore')
from os.path import exists
import itertools
import ast

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import TheilSenRegressor
from sklearn.dummy import DummyRegressor
from sklearn import linear_model
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import RANSACRegressor
import xgboost as xg
from sklearn.preprocessing import QuantileTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import f1_score as F1
from sklearn.metrics import matthews_corrcoef as MCC 
from imblearn.metrics import geometric_mean_score as Gm
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer


import os
import numpy as np
import lightgbm as lgb
import time
from yellowbrick.regressor import ResidualsPlot
from sklearn.linear_model import LinearRegression
from yellowbrick.regressor import CooksDistance
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest
import plotly.express as px
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from scipy.special import boxcox1p
from scipy.special import inv_boxcox1p
from flaml import AutoML
from scipy.stats import kurtosis, skew

from sklearn.linear_model import HuberRegressor


In [None]:
# --- Notebook-wide configuration -------------------------------------------
# Organization whose dataset is used; prepare_data() reads f'{ORGANIZATION}.csv'
# from DATA_PATH, and set_global_variables() rebinds this name.
ORGANIZATION = 'Eclipse'
# Directory holding the per-organization CSV files.
DATA_PATH = './datasets'
# Name of the target column used by the filtering and regression helpers.
# NOTE(review): filter_data() compares it against "*_hours" bounds, so it is
# presumably expressed in hours -- confirm against the dataset.
REGRESSION_VARIABLE = 'date_updated_date_created_diff'
# Repetition count; not referenced by the code visible in this part of the notebook.
N_REPITIONS = 100
# Default splitter handed to the validation helpers.
VALIDATION_SCHEMA = TimeSeriesSplit(n_splits=10)
# Default regressor instance; not referenced by the code visible in this part of the notebook.
REGRESSOR_MODEL = RandomForestRegressor()

class RandomGuess:
    """Baseline "model" that predicts by drawing training targets at random."""

    def __init__(self):
        # Both attributes stay None until fit() is called.
        self.X = None
        self.y = None

    def fit(self, X, y):
        """Remember the training inputs and keep the targets as a plain list."""
        self.X = X
        self.y = list(y)

    def predict(self, X):
        """Return one randomly chosen training target for every row of ``X``."""
        guesses = []
        for _ in range(len(X)):
            guesses.append(random.choice(self.y))
        return guesses
    
# Display name -> regressor instance with (mostly) default hyper-parameters.
# The two Dummy* entries are constant-prediction baselines (median / mean).
# The instances are created once here, so every user of this dict shares them.
MODELS = {
    'ExtraTrees' : ExtraTreesRegressor(),
    'Random_forest' : RandomForestRegressor(), 
    'LightGBM' : lgb.LGBMRegressor(),
    'Decision_tree' : DecisionTreeRegressor(max_depth=10),
    'XGBOOST' : xg.XGBRegressor(),
    'AdaBoost' :AdaBoostRegressor(),
    'Dummy_median' : DummyRegressor(strategy="median"), 
    'Dummy_mean' : DummyRegressor(strategy= 'mean')
    
    
    
}
# Display name -> {'default': estimator instance, 'grid': hyper-parameter grid}.
# Each 'grid' maps a constructor argument name to the list of candidate values.
# NOTE(review): the grids look intended for GridSearchCV (imported above), but no
# search is run in the visible part of the notebook -- confirm where they are used.
ALL_MODELS = {
    'ExtraTrees' : {
        'default' : ExtraTreesRegressor(),
        'grid' : {
            'criterion' : ['absolute_error'],
            'max_depth' : [5,10,None],
            'n_estimators' : [100,500]
        }
    },
    'Random_forest' : {
        'default' :RandomForestRegressor(n_jobs = -1) , 
        'grid' : {
            'criterion' : ['squared_error', 'poisson'],
            'max_depth' : [5,10,None],
            'n_estimators' : [100,500]
        }
    },
    'Decision_tree': {
        'default' :DecisionTreeRegressor() , 
        'grid' :{
            'criterion' : ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            'max_depth' : [5,10,None],
            'splitter' : ['best', 'random']
        }
    },
    'XGBOOST' : {
        'default' : xg.XGBRegressor(n_jobs = -1),
        'grid' : {
            'max_depth' : [5,10,None],
            'n_estimators' : [100,500],
            'learning_rate': [0.01,0.1,0.3]
        }
    },
    'AdaBoost' : {
        'default' :AdaBoostRegressor(),
        'grid' : {
            'n_estimators' : [100,500],
            'learning_rate': [0.01,0.1,0.3],
            'loss' :['linear', 'square', 'exponential']
        }
    }
}


# Full list of predictor columns, grouped by dimension. It is the default
# `features` argument of preprocess(), validate_regression() and
# cross_project_validation(), and the 'all' entry of DIMS.
FEATURES =[
    # Time
    'created_weekday_utc',
    'created_weekday_owner_tz',
    'created_hours_utc',
    'created_hours_owner_tz',
     # Collaboration graph
     'degree_centrality',
     'closeness_centrality',
     'betweenness_centrality',
     'eigenvector_centrality',
     'clustering_coefficient',
     'core_number',
     # Change
     'insertions',
     'deletions',
     'num_files',
     'num_files_type',
     'num_directory',
     'num_programming_languages',
     'modify_entropy',
     'code_churn',
     'sum_loc',
     'sum_complexity',
     'num_binary_files',
      # Text
     'subject_length',
     'subject_word_count',
     'msg_length',
     'msg_word_count',
     'is_non_fonctional',
     'is_refactoring',
     'is_corrective',
     'is_preventive',
     'has_feature_addition',
     'is_merge',
     # Files
     'avg_num_dev_modified_files',
     'num_files_changes',
     'num_files_changes_avg',
     'files_changes_duration_avg',
     'files_revisions_duration_avg',
     'files_num_recent_branch_changes',
     'num_file_changes_for_revrs_avg_recent',
     'num_file_changes_for_revrs_max_recent',
     'file_changes_time_revrs_avg_recent',
     'file_changes_time_revrs_max_recent',
     # NOTE(review): DIMS files the next two entries under 'Owner', not 'files'.
     'num_owner_prior_changes',
     'num_owner_open_changes_recent',
      # Owner
     'owner_age',
     'prior_owner_rate',
     'owner_changes_messages',
     'owner_changes_messages_avg',
     'owner_time_between_message_avg',
     'owner_merged_ratio',
     'num_prior_owner_project_changes',
     'prior_owner_project_changes_ratio',
     # NOTE(review): DIMS files the next two entries under 'reviewers', not 'Owner'.
     'owner_revrs_commons_changes_recent',
     'owner_revrs_commons_msgs_avg_recent',
     # History
     'num_prior_changes',
     'num_prior_project_changes',
     'num_open_changes_recent',
     'num_project_open_changes_recent',
      # Reviewers
     'revrs_changes_recent',
     'revrs_changes_avg_recent',
     'revrs_merged_changes_recent',
     'revrs_merged_changes_avg_recent',
     'revrs_abandoned_changes_recent',
     'revrs_abandoned_changes_avg_recent',
     'revrs_open_changes_recent',
     'revrs_open_changes_avg_recent',
     'revrs_num_review_recent',
     'revrs_num_review_avg_recent',
     'revrs_previous_msgs_recent',
     'revrs_previous_msgs_avg_recent'
]

# Columns treated as categorical: the numeric transforms (log1p, Box-Cox, scaling,
# quantile transform, PCA) skip every name in ALL_CATEGORICAL_FEATURES.
# NOTE(review): 'is_author_core', 'is_master_branch' and 'code_churn_size' do not
# appear in FEATURES, while the 0/1-looking 'has_feature_addition' and 'is_merge'
# are in FEATURES but not listed here -- confirm this is intended.
BINARY_VARIABLES = [
    'is_non_fonctional','is_refactoring','is_corrective','is_preventive','is_author_core','is_master_branch'
]
MULTICATEGORICAL_FEATURES = [
    'code_churn_size','created_weekday_utc','created_weekday_owner_tz'
]
# Union of both lists (multi-valued categoricals first, then the binary flags).
ALL_CATEGORICAL_FEATURES = MULTICATEGORICAL_FEATURES + BINARY_VARIABLES

# Dimension name -> list of feature columns belonging to that dimension.
# 'all' is the complete FEATURES list (same list object, not a copy).
# NOTE(review): the collaboration-graph features present in FEATURES
# ('degree_centrality', ..., 'core_number') have no dimension of their own here.
DIMS = {
    'all': FEATURES,
    
    'time':['created_weekday_utc','created_weekday_owner_tz','created_hours_utc',
            'created_hours_owner_tz'],
    
    'change': ['insertions','deletions','num_files','num_files_type','num_directory',
               'num_programming_languages','modify_entropy','code_churn','sum_loc',
               'sum_complexity','num_binary_files'],
    
    'text': ['subject_length','subject_word_count','msg_length','msg_word_count',
             'is_non_fonctional','is_refactoring','is_corrective','is_preventive',
             'has_feature_addition','is_merge'],
    
    'files': ['avg_num_dev_modified_files','num_files_changes','num_files_changes_avg',
              'files_changes_duration_avg','files_revisions_duration_avg',
              'files_num_recent_branch_changes','num_file_changes_for_revrs_avg_recent',
              'num_file_changes_for_revrs_max_recent','file_changes_time_revrs_avg_recent',
              'file_changes_time_revrs_max_recent'],
    
    'Owner': ['num_owner_prior_changes','num_owner_open_changes_recent','owner_age',
           'prior_owner_rate','owner_changes_messages','owner_changes_messages_avg',
           'owner_time_between_message_avg','owner_merged_ratio','num_prior_owner_project_changes',
           'prior_owner_project_changes_ratio'],
    
    'history': ['num_prior_changes','num_prior_project_changes','num_open_changes_recent',
             'num_project_open_changes_recent'],
    
    'reviewers': ['revrs_changes_recent','revrs_changes_avg_recent','revrs_merged_changes_recent',
               'revrs_merged_changes_avg_recent','revrs_abandoned_changes_recent',
               'revrs_abandoned_changes_avg_recent','revrs_open_changes_recent',
               'revrs_open_changes_avg_recent','revrs_num_review_recent','revrs_num_review_avg_recent',
               'revrs_previous_msgs_recent','revrs_previous_msgs_avg_recent','owner_revrs_commons_changes_recent',
               'owner_revrs_commons_msgs_avg_recent']
    }

# Organization name -> feature subset selected for that organization.
# NOTE(review): presumably the output of a feature-selection step for "RQ3";
# the selection procedure is not visible in this part of the notebook.
# Every list has fewer than 40 entries, which matters for any PCA step that
# asks for a fixed 40 components.
RQ3_SELECTED_FEATURES = {
    'Openstack': ['created_weekday_owner_tz','created_hours_utc','created_hours_owner_tz',
                  'betweenness_centrality','core_number','insertions','deletions','num_files_type',
                  'sum_complexity','num_binary_files','subject_length','msg_word_count','is_non_fonctional',
                  'is_refactoring','is_corrective','is_preventive','has_feature_addition','is_merge',
                  'num_files_changes','num_owner_open_changes_recent','owner_age','owner_changes_messages_avg',
                  'num_prior_owner_project_changes','num_project_open_changes_recent','revrs_abandoned_changes_recent'],
    
    'Android': ['created_weekday_owner_tz','created_hours_utc','created_hours_owner_tz','clustering_coefficient',
                'insertions','deletions','num_directory','sum_complexity','num_binary_files','subject_length',
                'msg_length','is_non_fonctional','is_preventive','is_corrective','has_feature_addition','is_merge',
                'files_changes_duration_avg','files_revisions_duration_avg','num_file_changes_for_revrs_avg_recent',
                'owner_age','num_owner_open_changes_recent','owner_changes_messages_avg','owner_time_between_message_avg',
                'owner_merged_ratio','prior_owner_project_changes_ratio','num_open_changes_recent',
                'num_prior_project_changes','revrs_abandoned_changes_avg_recent','revrs_open_changes_recent'],
    
    'Qt': ['created_weekday_owner_tz','created_hours_utc','eigenvector_centrality','core_number','code_churn',
           'deletions','num_files_type','num_programming_languages','sum_complexity','num_binary_files','subject_length',
           'msg_word_count','is_non_fonctional','is_refactoring','is_corrective','is_preventive','has_feature_addition',
           'is_merge','num_file_changes_for_revrs_max_recent','files_changes_duration_avg','files_revisions_duration_avg',
           'files_num_recent_branch_changes','owner_revrs_commons_msgs_avg_recent','num_owner_open_changes_recent',
           'owner_changes_messages_avg','owner_time_between_message_avg','owner_merged_ratio','prior_owner_project_changes_ratio',
           'num_prior_project_changes','num_open_changes_recent','revrs_abandoned_changes_avg_recent','revrs_open_changes_avg_recent'],
    
    'Eclipse': ['created_weekday_owner_tz','created_hours_utc','created_hours_owner_tz','betweenness_centrality',
                'clustering_coefficient','num_directory','deletions','num_files_type','sum_complexity','num_binary_files',
                'subject_length','msg_word_count','is_non_fonctional','is_refactoring','is_corrective','is_preventive',
                'has_feature_addition','is_merge','num_file_changes_for_revrs_max_recent','files_changes_duration_avg',
                'files_revisions_duration_avg','files_num_recent_branch_changes','owner_age','num_owner_open_changes_recent',
                'prior_owner_rate','owner_changes_messages_avg','owner_time_between_message_avg','owner_merged_ratio',
                'num_prior_owner_project_changes','prior_owner_project_changes_ratio','num_prior_changes','num_project_open_changes_recent',
                'num_open_changes_recent','revrs_open_changes_recent'],
    
    'Libreoffice': ['created_weekday_utc','created_hours_utc','created_hours_owner_tz','closeness_centrality',
                    'clustering_coefficient','code_churn','deletions','modify_entropy','num_programming_languages',
                    'sum_complexity','num_binary_files','msg_length','is_non_fonctional','is_refactoring','is_corrective',
                    'is_preventive','has_feature_addition','is_merge','num_file_changes_for_revrs_avg_recent',
                    'files_changes_duration_avg','files_revisions_duration_avg','files_num_recent_branch_changes',
                    'num_owner_open_changes_recent','revrs_abandoned_changes_avg_recent','owner_changes_messages_avg',
                    'owner_time_between_message_avg','owner_merged_ratio','num_open_changes_recent',
                    'revrs_open_changes_recent']}


In [None]:
def set_global_variables(org):
    """Rebind the module-level ``ORGANIZATION`` to ``org``.

    ``prepare_data`` builds its CSV path from ``ORGANIZATION``, so calling this
    selects which organization's dataset is loaded next.
    """
    global ORGANIZATION
    ORGANIZATION = org
    
    
def guess_n_times_mae(X_train,y_train,X_test,y_test,n_guesses = 1000) :
    """Mean absolute error of the random-guess baseline, pooled over many rounds.

    Each round predicts every test row by drawing a random training target
    (see ``RandomGuess``); the guesses of all rounds are concatenated and
    compared with the correspondingly repeated true targets.

    Parameters
    ----------
    X_train, y_train : training inputs / targets handed to ``RandomGuess.fit``.
    X_test : only its length is used (one guess per row).
    y_test : true targets of the test rows.
    n_guesses : number of guessing rounds.

    Returns
    -------
    float : ``mean_absolute_error`` over all ``n_guesses * len(X_test)`` guesses.
    """
    # Fitting merely stores the training targets, so one fit serves every round.
    random_guess_model = RandomGuess()
    random_guess_model.fit(X_train,y_train)
    y_test_list = list(y_test)

    all_guesses = []
    all_trues = []
    for _ in range(n_guesses):
        all_guesses.extend(random_guess_model.predict(X_test))
        all_trues.extend(y_test_list)
    return mean_absolute_error(all_trues,all_guesses)

def SAE(X_train,y_train,y_true,y_pred,random_guess_mae = None,random_guess_runs = 1000) :
    """Standardized accuracy: ``1 - MAE(model) / MAE(random guessing)``.

    Parameters
    ----------
    X_train, y_train : training data used to build the random-guess baseline.
    y_true, y_pred : true and predicted targets of the evaluated model.
    random_guess_mae : pre-computed baseline MAE; when given, no guessing is run.
    random_guess_runs : number of guessing rounds used when the baseline MAE
        has to be computed here.

    Returns
    -------
    float : 0 means "as good as random guessing", 1 means a perfect prediction;
    negative values mean worse than random guessing.
    """
    if random_guess_mae is None:
        # y_true is passed as X_test too: the baseline only needs its length.
        # The number of rounds honours random_guess_runs (it was previously
        # ignored in favour of a hard-coded 1000).
        baseline_mae = guess_n_times_mae(X_train,y_train,y_true,y_true,n_guesses = random_guess_runs)
    else:
        baseline_mae = random_guess_mae

    return 1 - (mean_absolute_error(y_true,y_pred)/baseline_mae)
    
def get_data():
    """Load the base feature table and overlay the time-aware feature table.

    Every column that exists in both CSV files (except the ``id`` key) is taken
    from the time-aware file; the two tables are inner-joined on ``id``.
    Unparseable CSV lines are skipped.

    NOTE(review): relies on the globals FEATURES_PATH, DATA_VERSION and
    DATA_TIME_VERSION; only DATA_VERSION has a setter in the visible part of the
    notebook -- confirm where the other two are defined.
    """
    id_col = "id"
    base_table = pd.read_csv(os.path.join(FEATURES_PATH,DATA_VERSION),on_bad_lines='skip')
    time_aware_data = pd.read_csv(os.path.join(FEATURES_PATH,DATA_TIME_VERSION),on_bad_lines='skip')

    # Columns of the base table that the time-aware table supersedes.
    superseded = [col for col in base_table.columns
                  if col != id_col and col in time_aware_data]
    trimmed_base = base_table.drop(columns=superseded)

    return pd.merge(trimmed_base,time_aware_data,how = 'inner',on='id')

def load_full_data(path):
    """Read the CSV file at ``path`` into a DataFrame using pandas defaults."""
    frame = pd.read_csv(path)
    return frame
    
def filter_data(data,min_delay_hours = 24,max_delay_hours = 14*24,projects=None):
    """Keep the rows whose target lies in a delay window, then drop box-plot outliers.

    Parameters
    ----------
    data : input DataFrame (left untouched).
    min_delay_hours, max_delay_hours : inclusive bounds on ``REGRESSION_VARIABLE``.
    projects : optional collection of values of the ``project`` column to keep;
        ``None`` keeps every project.

    Returns
    -------
    DataFrame restricted to the window and to the 1.5*IQR whiskers of
    ``REGRESSION_VARIABLE`` (see ``boxplot_filtering``).
    """
    kept = data.copy()
    if projects is not None:
        kept = kept[kept['project'].isin(projects)]

    delay = kept[REGRESSION_VARIABLE]
    inside_window = (delay >= min_delay_hours) & (delay <= max_delay_hours)
    kept = kept[inside_window]

    return boxplot_filtering(kept,REGRESSION_VARIABLE)

def select_data_between_dates(data,first_date,second_date):
    """Return a copy of the rows with ``first_date <= closed < second_date``."""
    closed = data['closed']
    in_interval = (closed >= first_date) & (closed < second_date)
    return data[in_interval].copy()

def select_data_before_date(data,date):
    """Return a copy of the rows whose ``closed`` value is strictly before ``date``."""
    closed_earlier = data['closed'] < date
    return data[closed_earlier].copy()

def select_data_after_date(data,date):
    """Return a copy of the rows whose ``closed`` value is on or after ``date``."""
    closed_on_or_after = data['closed'] >= date
    return data[closed_on_or_after].copy()

def boxplot_filtering(data,col):
    """Drop the rows whose ``col`` value falls outside the box-plot whiskers.

    A row is kept when ``Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR``, where Q1/Q3 are
    the 25th/75th percentiles of ``col``. The input frame is not modified.
    """
    frame = data.copy()
    lower_quartile = frame[col].quantile(0.25)
    upper_quartile = frame[col].quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)
    within_whiskers = (
        (frame[col] >= lower_quartile - whisker)
        & (frame[col] <= upper_quartile + whisker)
    )
    return frame.loc[within_whiskers]

def select_data_by_indicies(data,indicies):
    """Return the rows of ``data`` at the given integer positions (``iloc`` semantics)."""
    rows_at_positions = data.iloc[indicies]
    return rows_at_positions

def set_data_version(data_version):
    """Rebind the module-level ``DATA_VERSION`` (the file name read by ``get_data``)."""
    global DATA_VERSION
    DATA_VERSION = data_version

    
def compare_two_data_folds(first_fold,second_fold,first_fold_name,second_fold_name):
    """Show the statistics of two DataFrames side by side.

    NOTE(review): ``tfdv`` is not imported in the visible import cell; it is
    presumably ``tensorflow_data_validation`` -- confirm it is imported before
    this function is called.
    """
    lhs_stats = tfdv.generate_statistics_from_dataframe(first_fold)
    rhs_stats = tfdv.generate_statistics_from_dataframe(second_fold)
    tfdv.visualize_statistics(
        lhs_statistics = lhs_stats,
        rhs_statistics = rhs_stats,
        lhs_name = first_fold_name,
        rhs_name = second_fold_name,
    )
    
def apply_IsolationForest_filtering(data,cols,contamination = 0.1) :
    """Keep only the rows an IsolationForest fitted on ``cols`` predicts as 1.

    The forest uses 1000 trees and a ``RandomState`` seeded with 42. The
    returned frame is a filtered copy of ``data`` that additionally carries the
    columns ``scores`` (``decision_function`` output) and ``anomaly_score``
    (``predict`` output, equal to 1 for every returned row).
    """
    scored = data.copy()
    detector = IsolationForest(
        n_estimators=1000,
        max_samples='auto',
        contamination=contamination,
        random_state=np.random.RandomState(42),
    )
    detector.fit(scored[cols])
    scored['scores'] = detector.decision_function(scored[cols])
    scored['anomaly_score'] = detector.predict(scored[cols])
    predicted_inlier = scored['anomaly_score'] == 1
    return scored[predicted_inlier]
    
def apply_log1p(data,cols):
    """Return a copy of ``data`` with ``log(1 + x)`` applied to the given columns.

    Columns listed in ``ALL_CATEGORICAL_FEATURES`` are left unchanged.
    """
    result = data.copy()
    numeric_cols = [col for col in cols if col not in ALL_CATEGORICAL_FEATURES]
    for col in numeric_cols:
        result[col] = np.log1p(result[col])
    return result

def apply_quantile_transoformation(df,cols,outcome,transformor = QuantileTransformer(n_quantiles=1000, random_state=0
                                                                             ,output_distribution = 'uniform')):
    """Quantile-transform every column in ``cols`` and the ``outcome`` column.

    Each column gets its own deep copy of ``transformor``, fitted on that column
    only, so the shared default instance is never fitted itself.

    Returns
    -------
    (DataFrame, transformer) : the transformed copy of ``df`` and the transformer
    fitted on ``outcome`` (needed to map predictions back to the original scale).
    """
    result = df.copy()
    target_transformer = None
    for col in cols+[outcome]:
        column_transformer = copy.deepcopy(transformor)
        column_values = result.loc[:,col].to_numpy().reshape(-1, 1)
        result.loc[:,col]  = column_transformer.fit_transform(column_values)
        if col == outcome:
            target_transformer = column_transformer
    return result, target_transformer
def apply_data_scaling(data,cols,sklearn_scaler = RobustScaler()):
    """Return a copy of ``data`` with each column in ``cols`` scaled independently.

    Every column is fitted with its own deep copy of ``sklearn_scaler``; the
    fitted scalers are not returned.
    """
    result = data.copy()
    for col in cols:
        column_scaler = copy.deepcopy(sklearn_scaler)
        column_values = result.loc[:,col].to_numpy().reshape(-1,1)
        result.loc[:,col]  = column_scaler.fit_transform(column_values)
    return result
def apply_boxcox_transformation(data,cols) :
    """Box-Cox transform ``x + 1`` for every requested column present in ``data``.

    Names in ``cols`` that are not columns of ``data`` are silently skipped.

    Returns
    -------
    (DataFrame, dict) : the transformed copy of ``data`` and a mapping
    ``column -> fitted lambda``. Because the data is shifted by one before the
    transform, the exact inverse is ``inv_boxcox(y, lambda) - 1``.
    """
    df = data.copy()
    lambdas = {}
    present_cols = [col for col in cols if col in df.columns]
    for col in present_cols:
        print('processing ',col)
        shifted_values = np.array(df[col]) + 1
        transformed, fitted_lambda = boxcox(shifted_values,lmbda=None)
        df[col] = transformed
        print('done with ',col)
        lambdas[col] = fitted_lambda

    return df,lambdas

def prepare_data(filtering_function,
                 date_1 = pd.to_datetime("2015-01-01"),
                 date_2 = None,contamination = 'auto'
                ):
    """Load, date-restrict, filter, sort and clean the current organization's data.

    Steps, in order:
      1. read ``DATA_PATH/{ORGANIZATION}.csv`` and parse ``created`` / ``closed``;
      2. keep rows closed on or after ``date_1`` and before ``date_2`` (each bound
         is skipped when ``None``);
      3. apply ``filtering_function`` (a callable DataFrame -> DataFrame);
      4. sort by ``change_number`` if present, else by ``change_id``;
      5. recompute ``code_churn`` as ``insertions + deletions`` and drop rows with NaN;
      6. unless ``contamination`` is ``None``, drop IsolationForest outliers of
         ``REGRESSION_VARIABLE``.

    Returns the resulting DataFrame.
    """
    csv_path = os.path.join(DATA_PATH,f'{ORGANIZATION}.csv')
    full_data = load_full_data(path=csv_path)
    for date_column in ('created','closed'):
        full_data[date_column] = full_data[date_column].apply(pd.to_datetime)

    if date_1 is not None:
        full_data = select_data_after_date(full_data,date_1)
    if date_2 is not None:
        full_data = select_data_before_date(full_data,date_2)

    ready_data = filtering_function(full_data)
    sort_column = 'change_number' if 'change_number' in ready_data.columns else 'change_id'
    ready_data = ready_data.sort_values(by= [sort_column])
    ready_data['code_churn'] = ready_data['insertions'] + ready_data['deletions']

    ready_data = ready_data.dropna()
    if contamination is not None:
        print('starting isolation')
        ready_data = apply_IsolationForest_filtering(ready_data,[REGRESSION_VARIABLE],contamination = contamination)
        print('isolation done')
    return ready_data

def preprocess(data,features = FEATURES,
               outcome = REGRESSION_VARIABLE,
               do_scale_data = False, 
               do_log_features_log_transform = False,
               do_boxcox = False,
               quantile_transform = True,
               do_pca = True,
               contamination = 'auto'
              ):
    """Apply the optional transformation pipeline to ``data``.

    The enabled steps run in this fixed order: Box-Cox -> log1p -> scaling ->
    quantile transform -> PCA. Features listed in ``ALL_CATEGORICAL_FEATURES``
    are skipped by every step. Box-Cox, log1p and the quantile transform are
    applied to ``outcome`` as well; scaling and PCA only touch features.

    Parameters
    ----------
    data : input DataFrame (not modified; every step works on a copy).
    features : names of the predictor columns.
    outcome : name of the target column.
    do_scale_data, do_log_features_log_transform, do_boxcox, quantile_transform,
    do_pca : switches for the individual steps.
    contamination : accepted for signature compatibility; not used here.

    Returns
    -------
    (data, features, transformers)
        ``data`` is the transformed frame, ``features`` the predictor names to
        use with it (PCA replaces the numeric features by ``component_<i>``),
        and ``transformers`` may contain ``'boxplot_lambdas'`` (column -> fitted
        Box-Cox lambda) and ``'target_quantile_transformer'``.
    """
    # Computed once; every step below works on the non-categorical features.
    numeric_features = [f for f in features if not (f in ALL_CATEGORICAL_FEATURES)]
    transformers = {}
    if do_boxcox: 
        print('boxcox started')
        data,lambdas = apply_boxcox_transformation(data,numeric_features + [outcome])
        transformers['boxplot_lambdas'] = lambdas
        print('boxcox done')
        
    if do_log_features_log_transform : 
        print('log started')
        data = apply_log1p(data,numeric_features + [outcome])
        print('log done')
        
    if do_scale_data: 
        print('scaling started')
        data = apply_data_scaling(data,numeric_features) 
        print('scaling done')
        
    if quantile_transform: 
        print('quantile transformation started')
        data, target_transformer = apply_quantile_transoformation(data,numeric_features,outcome=outcome)
        transformers['target_quantile_transformer'] =  target_transformer
        print('quantile transformation done')
        
    if do_pca: 
        print('PCA started')
        # PCA cannot extract more components than min(n_samples, n_features).
        # Asking for a fixed 40 raised for smaller feature sets (for example the
        # per-organization RQ3 selections), so cap the request at what exists.
        n_requested = min(40, len(numeric_features), len(data))
        pca = PCA(n_components = n_requested,whiten =True,svd_solver = 'full')
        pca_data = pca.fit_transform(data[numeric_features])
        n_components = pca_data.shape[1]
        pca_data_dataframe = pd.DataFrame(pca_data,columns = [f'component_{i}' for i in range(n_components)])
        new_features = list(pca_data_dataframe.columns)
        replaced_by_pca = set(numeric_features)
        for col in data : 
            if col in replaced_by_pca: 
                continue 
            # Carry over every other column (categorical features, outcome, ids, ...)
            # positionally, since the PCA frame has a fresh 0..n-1 index.
            pca_data_dataframe[col] = data[col].values
            if col in features: 
                new_features.append(col)
        data = pca_data_dataframe
        features=new_features
        print('PCA done')
    
    return data,features, transformers

def cross_project_validation(source_data,target_data ,features = FEATURES,
                        outcome = REGRESSION_VARIABLE,
                        regression_model = RandomForestRegressor(),
                        validation_schema = VALIDATION_SCHEMA,  
                        do_log_features_log_transform = True,
                        do_boxcox=False,
                        do_scale_data = True, 
                        quantile_transform = True,
                        do_pca = True,
                        contamination = 'auto',
                        is_processed = False,
                        source_transformers = None,
                        target_transformers = None
                       ): 
    """Train on one project's data and evaluate on another project's data.

    Unless ``is_processed`` is true, both frames are run through ``preprocess``
    with the same options. The model is fitted on the source data, and all
    predictions and targets are mapped back to the original scale (quantile
    inverse -> expm1 -> Box-Cox inverse, i.e. the reverse of ``preprocess``)
    before the metrics are computed.

    Parameters
    ----------
    source_data, target_data : training / evaluation DataFrames.
    features, outcome : predictor column names and target column name.
    regression_model : estimator with ``fit`` / ``predict``; it is fitted in place.
    validation_schema, contamination : accepted for signature compatibility;
        not used by this function.
    do_log_features_log_transform, do_boxcox, do_scale_data, quantile_transform,
    do_pca : preprocessing switches; they also decide which inverse transforms
        are applied.
    is_processed : set when both frames were already preprocessed by the caller.
        ``features`` is then used as-is for both frames.
    source_transformers, target_transformers : transformer dictionaries as
        returned by ``preprocess``; only read when ``is_processed`` is true.

    Returns
    -------
    DataFrame with a single row holding train/test MSE, MAE, MAPE and test SAE.

    Raises
    ------
    ValueError : when an inverse transform is requested but the transformer
        dictionaries lack the fitted object it needs.

    NOTE(review): source and target are preprocessed independently, so each gets
    its own fitted quantile transformers and its own PCA basis; confirm that
    evaluating target PCA components with a model trained on source PCA
    components is intended.
    """
    if is_processed:
        # Previously this path failed with NameError because the feature lists
        # and transformer dictionaries were only bound in the other branch.
        source_features = features
        target_features = features
        source_transformers = source_transformers or {}
        target_transformers = target_transformers or {}
    else:
        preprocess_options = dict(
            features = features,
            outcome = outcome,
            do_scale_data = do_scale_data,
            do_log_features_log_transform = do_log_features_log_transform,
            do_boxcox = do_boxcox,
            quantile_transform = quantile_transform,
            do_pca = do_pca,
            contamination = contamination,
        )
        source_data,source_features,source_transformers = preprocess(source_data,**preprocess_options)
        target_data,target_features,target_transformers = preprocess(target_data,**preprocess_options)

    # Fail early, before fitting, if a requested inverse transform is impossible.
    required_keys = []
    if quantile_transform:
        required_keys.append('target_quantile_transformer')
    if do_boxcox:
        required_keys.append('boxplot_lambdas')
    for key in required_keys:
        if key not in source_transformers or key not in target_transformers:
            raise ValueError(
                f"'{key}' is required in both source_transformers and target_transformers "
                "to map predictions back to the original scale"
            )

    X_train = source_data.loc[:,source_features]
    y_train = source_data[outcome]
    X_test = target_data.loc[:,target_features]
    y_test = target_data[outcome]

    regression_model.fit(X_train,y_train)
    y_train_pred = regression_model.predict(X_train)
    y_test_pred = regression_model.predict(X_test)

    if quantile_transform :
        source_transformer = source_transformers['target_quantile_transformer']
        target_transformer = target_transformers['target_quantile_transformer']
        # inverse_transform needs a column vector; ravel() restores the 1-D shape.
        y_train_pred = source_transformer.inverse_transform(np.array(y_train_pred).reshape(-1,1)).ravel()
        y_test_pred = target_transformer.inverse_transform(np.array(y_test_pred).reshape(-1,1)).ravel()
        y_train = source_transformer.inverse_transform(np.array(y_train).reshape(-1,1)).ravel()
        y_test = target_transformer.inverse_transform(np.array(y_test).reshape(-1,1)).ravel()
            
    if do_log_features_log_transform : 
        y_train_pred = np.expm1(y_train_pred)
        y_test_pred = np.expm1(y_test_pred)
        y_train = np.expm1(y_train)
        y_test = np.expm1(y_test)

    if do_boxcox: 
        # The forward step was boxcox(x + 1), whose inverse is inv_boxcox1p.
        # The lambdas come from the transformer dictionaries (the old code read
        # an undefined name `lambdas`).
        source_lambda = source_transformers['boxplot_lambdas'][outcome]
        target_lambda = target_transformers['boxplot_lambdas'][outcome]
        y_train_pred = inv_boxcox1p(y_train_pred, source_lambda)
        y_test_pred = inv_boxcox1p(y_test_pred, target_lambda)
        y_train = inv_boxcox1p(y_train, source_lambda)
        y_test = inv_boxcox1p(y_test, target_lambda)

    metrics_row = {
        "Train MSE": mean_squared_error(y_train,y_train_pred),
        "Test MSE": mean_squared_error(y_test,y_test_pred),
        "Train MAE": mean_absolute_error(y_train,y_train_pred),
        "Test MAE": mean_absolute_error(y_test,y_test_pred),
        'Train MAPE' : mean_absolute_percentage_error(y_train,y_train_pred),
        'Test MAPE' : mean_absolute_percentage_error(y_test,y_test_pred),
        'Test SAE':SAE(X_train,y_train,y_test,y_test_pred)
    }
    return pd.DataFrame([metrics_row])

def validate_regression(data,features = FEATURES,
                        outcome = REGRESSION_VARIABLE,
                        regression_model = RandomForestRegressor(),
                        validation_schema = VALIDATION_SCHEMA,  
                        do_log_features_log_transform = True,
                        do_boxcox=False,
                        do_scale_data = True, 
                        quantile_transform = True,
                        do_pca = True,
                        contamination = 'auto',
                        is_processed = False,
                        transformers = None
                       ): 
    """Cross-validate ``regression_model`` on ``data`` and collect per-fold metrics.

    Unless ``is_processed`` is true the data is first run through ``preprocess``.
    For every split of ``validation_schema`` the model is refitted, and all
    predictions and targets are mapped back to the original scale (quantile
    inverse -> expm1 -> Box-Cox inverse) before the metrics are computed.

    Parameters
    ----------
    data : input DataFrame.
    features, outcome : predictor column names and target column name.
    regression_model : estimator with ``fit`` / ``predict``; refitted on each fold.
    validation_schema : splitter exposing ``split(X)``.
    do_log_features_log_transform, do_boxcox, do_scale_data, quantile_transform,
    do_pca : preprocessing switches; they also decide which inverse transforms
        are applied.
    contamination : forwarded to ``preprocess``.
    is_processed : set when ``data`` was already preprocessed by the caller.
    transformers : transformer dictionary as returned by ``preprocess``; only
        read when ``is_processed`` is true.

    Returns
    -------
    (results_df, features_importance)
        ``results_df`` has one row per fold (train/test MAE and MAPE, test SAE,
        fit time). ``features_importance`` maps each feature to its per-fold
        importances; the lists stay empty when PCA is enabled or the model has
        no ``feature_importances_`` attribute.

    Raises
    ------
    ValueError : when an inverse transform is requested but ``transformers``
        lacks the fitted object it needs.
    """
    if not is_processed:
        # Forward the caller's outcome/contamination instead of the module default
        # and a literal 'auto'.
        data,features,transformers = preprocess(data,features = features,
               outcome = outcome,
               do_scale_data = do_scale_data, 
               do_log_features_log_transform = do_log_features_log_transform,
               do_boxcox = do_boxcox,
               quantile_transform = quantile_transform,
               do_pca = do_pca,
               contamination = contamination
              )
    elif transformers is None:
        transformers = {}

    # Fail early with a clear message instead of a TypeError/NameError mid-loop.
    if quantile_transform and 'target_quantile_transformer' not in transformers:
        raise ValueError("quantile_transform=True requires transformers['target_quantile_transformer']")
    if do_boxcox and 'boxplot_lambdas' not in transformers:
        raise ValueError("do_boxcox=True requires transformers['boxplot_lambdas']")
        
    X = data.loc[:,features]
    y = data[outcome]
    results = []
    features_importance = {
        feature: [] for feature in features 
    }
    for index,(train_index, test_index) in enumerate(validation_schema.split(X)): 
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]
        start = time.time()
        regression_model.fit(X_train,y_train)
        fit_time = time.time() - start
        y_train_pred = regression_model.predict(X_train)
        y_test_pred = regression_model.predict(X_test)
        if quantile_transform : 
            transformer = transformers['target_quantile_transformer']
            # inverse_transform needs a column vector; ravel() restores the 1-D shape.
            y_train_pred = transformer.inverse_transform(np.array(y_train_pred).reshape(-1,1)).ravel()
            y_test_pred = transformer.inverse_transform(np.array(y_test_pred).reshape(-1,1)).ravel()
            y_train = transformer.inverse_transform(np.array(y_train).reshape(-1,1)).ravel()
            y_test = transformer.inverse_transform(np.array(y_test).reshape(-1,1)).ravel()
            
        if do_log_features_log_transform : 
            y_train_pred = np.expm1(y_train_pred)
            y_test_pred = np.expm1(y_test_pred)
            y_train = np.expm1(y_train)
            y_test = np.expm1(y_test)
            
        if do_boxcox: 
            # The forward step was boxcox(x + 1), whose inverse is inv_boxcox1p.
            # The lambda comes from the transformer dictionary (the old code read
            # an undefined name `lambdas`).
            outcome_lambda = transformers['boxplot_lambdas'][outcome]
            y_train_pred = inv_boxcox1p(y_train_pred, outcome_lambda)
            y_test_pred = inv_boxcox1p(y_test_pred, outcome_lambda)
            y_train = inv_boxcox1p(y_train, outcome_lambda)
            y_test = inv_boxcox1p(y_test, outcome_lambda)

        # Record the fold's metrics unconditionally: models without
        # feature_importances_ (e.g. DummyRegressor) used to lose their rows.
        results.append(
            {
            "Train MAE": mean_absolute_error(y_train,y_train_pred),
            "Test MAE": mean_absolute_error(y_test,y_test_pred),
            'Train MAPE' : mean_absolute_percentage_error(y_train,y_train_pred),
            'Test MAPE' : mean_absolute_percentage_error(y_test,y_test_pred),
            'Test SAE':SAE(X_train,y_train,y_test,y_test_pred),
            'fit_time' :fit_time
        })
        if not do_pca : 
            # Importances are only meaningful on the original (non-PCA) features.
            if hasattr(regression_model, 'feature_importances_'):
                for feature, importance in zip(X_train.columns, regression_model.feature_importances_):
                    features_importance[feature].append(importance) 
            else:
                print('model does not support gini feature importance')
        print(f'Iteration {index + 1} is done.')
    results_df = pd.DataFrame(results)
    return results_df,features_importance

def run_cross_projects(projects,model_name = 'ExtraTrees',model = ExtraTreesRegressor(),
                       repetations = 10,results_path='./results/cross_project_validation_results') :
    """Run the cross-project experiment for every ordered (source, target) pair.

    For each source project the model is trained on the source data and
    evaluated on every other project, ``repetations`` times per pair. One CSV
    per source project plus one combined CSV are written to ``results_path``.

    Parameters
    ----------
    projects : iterable of organization names (each must have a CSV in DATA_PATH).
    model_name : label of the model; currently not written to the outputs.
    model : regressor handed to ``cross_project_validation``.
    repetations : number of repetitions per (source, target) pair.
    results_path : output directory, created if missing.

    Side effects: rebinds the global ``ORGANIZATION`` (via
    ``set_global_variables``) and leaves it at the last project that was loaded.
    """
    def _load_current_organization():
        # Same filtering for source and target: 12 hours to 30 days, all projects.
        return prepare_data(lambda data:filter_data(data,min_delay_hours=12,max_delay_hours=24*30,projects = None))

    results = []
    os.makedirs(results_path,exist_ok= True)
    for source_project in projects : 
        source_results = []
        set_global_variables(source_project)
        source_data = _load_current_organization()
        for target_project in projects : 
            if source_project == target_project :
                continue 
                
            print('source:',source_project,'traget:',target_project)
            set_global_variables(target_project)
            target_data = _load_current_organization()
            for repetation in range(repetations) :
                print('Repetation:',repetation)
                # Pass the requested model; it used to be dropped, so the default
                # RandomForestRegressor was evaluated whatever `model` was.
                result = cross_project_validation(source_data,target_data,
                                                  regression_model = model,
                                                  contamination='auto')
                for index,row in result.iterrows() :
                    dict_row = dict(row)
                    dict_row['Source project'] = source_project
                    dict_row['Target project'] = target_project
                    dict_row['repitation'] = repetation
                    source_results.append(dict_row)
                    # Collect the row itself; appending `results` to itself made
                    # the combined table a self-referencing list.
                    results.append(dict_row)
        source_results = pd.DataFrame(source_results)
        source_results.to_csv(os.path.join(results_path,f'Disc_corss_project_{source_project}.csv'),index=False)
    results = pd.DataFrame(results)
    results.to_csv(os.path.join(results_path,f'Disc_corss_project_all.csv'),index=False)
                

def run_RQ1(orgs,models,nb_repetations,features= FEATURES,features_per_org = {
    'Libreoffice' : FEATURES,
    'Openstack' : FEATURES,
    'Android' : FEATURES,
    'Eclipse' : FEATURES, 
    'Qt' : FEATURES
},report_feature_importance = False,run_random_guess = True,
            results_folder = "./results/RQ1_results_exp") : 
    """Evaluate every model on every organization and collect per-fold results.

    Parameters
    ----------
    orgs : iterable of str
        Organization names passed to ``set_global_variables``.
    models : dict
        Maps a model name to the estimator handed to ``validate_regression``.
    nb_repetations : int
        Number of validation runs per (organization, model) pair.
    features :
        Default feature list, used for organizations absent from
        ``features_per_org``.
    features_per_org : dict
        Per-organization feature list overriding ``features``.
    report_feature_importance : bool
        When True, PCA is disabled in ``validate_regression`` and the per-fold
        feature importances are written to
        ``<org>_<model>_features_importances.csv``.
    run_random_guess : bool
        NOTE(review): not used by this function body.
    results_folder : str
        Directory (created if missing) that receives the CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per fold, tagged with 'Organization', 'model', 'repitation'
        and 'fold'. The same rows are written to ``<org>_RQ1.csv`` per
        organization.
    """
    os.makedirs(results_folder,exist_ok=True)
    all_results = []
    for org in orgs:

        print('working on:',org)
        set_global_variables(org)

        # Fix: resolve the feature list into a per-organization local. The
        # original rebound `features` itself, so one organization's override
        # leaked into every later organization missing from `features_per_org`.
        org_features = features_per_org[org] if org in features_per_org else features
        org_results = []

        data = prepare_data( lambda data:filter_data(data,min_delay_hours=12,max_delay_hours=24*30,projects = None))
        print(data[REGRESSION_VARIABLE].describe())
        for model_name, model in models.items():
            feature_importances_results = []
            print('running model:',model_name)
            for repetation in range(nb_repetations):
                print('repetation:',repetation)
                results,features_importances  = validate_regression(data=data,regression_model=model,
                                                do_boxcox = False,do_scale_data=True,
                                               do_log_features_log_transform=True,do_pca=not(report_feature_importance),
                                               quantile_transform=True,contamination = 'auto',features = org_features)

                for index,row in results.iterrows() :
                    dict_row = dict(row)
                    dict_row['Organization'] = org
                    dict_row['model'] = model_name
                    dict_row['repitation'] = repetation
                    dict_row['fold'] = index
                    all_results.append(dict_row)
                    org_results.append(dict_row)

                    if report_feature_importance:
                        # One long-format row per (fold, feature); key order
                        # matches the original CSV column order.
                        for feature,importances in features_importances.items():
                            feature_importances_results.append({
                                'repitation' : repetation,
                                'model' : model_name,
                                'fold' : index,
                                'feature_name' : feature,
                                'importance_value' : importances[index],
                            })

            if report_feature_importance:
                model_features_importances = pd.DataFrame(feature_importances_results)
                model_features_importances.to_csv(os.path.join(results_folder,f'{org}_{model_name}_features_importances.csv'),index=False)

        org_results = pd.DataFrame(org_results)
        org_results.to_csv(os.path.join(results_folder,f'{org}_RQ1.csv'),index=False)
    return pd.DataFrame(all_results)

def tune_params(data,org_name,model,model_name,grid) : 
    """Evaluate every hyper-parameter combination of ``grid`` for one model.

    Parameters
    ----------
    data :
        NOTE(review): overwritten inside the function. The data is always
        reloaded through ``prepare_data`` after ``set_global_variables(org_name)``,
        so the value passed by the caller is not used.
    org_name : str
        Organization name; passed to ``set_global_variables`` and stored in
        the 'Organization' column.
    model :
        Estimator template. A fresh deep copy is configured with
        ``set_params`` for each combination, so the argument is not mutated.
    model_name : str
        Stored in the 'model' column.
    grid : dict
        Maps a parameter name to an iterable of candidate values. An empty
        grid yields a single run with the template's own parameters.

    Returns
    -------
    pandas.DataFrame
        One row per combination: 'Organization', 'model', the parameter
        values, and the 'Mean Test SA' / 'Mean Test MRE' / 'Mean Test MAE'
        means over the validation results.
    """
    # Build the Cartesian product without `zip(*grid.items())`, which raised
    # on an empty grid; product() of nothing yields one empty combination.
    param_names = list(grid.keys())
    combinations_dicts = [dict(zip(param_names, combo))
                          for combo in itertools.product(*grid.values())]

    set_global_variables(org_name)
    data = prepare_data( lambda data:filter_data(data,min_delay_hours=6,max_delay_hours=24*30,projects = None))
    # Preprocess once; each validation run below is told the data is already
    # processed (is_processed = True).
    preprocessed_data,final_features = preprocess(data,outcome = REGRESSION_VARIABLE,
               do_scale_data = True, 
               do_log_features_log_transform = True,
               do_boxcox = False,
               quantile_transform = True,
               do_pca = True,
               contamination = 'auto')

    all_results = []
    for combination in combinations_dicts : 
        print(combination)
        # Copy the caller's template each time instead of rebinding `model`.
        candidate = copy.deepcopy(model)
        candidate.set_params(**combination)
        outcome = validate_regression(preprocessed_data,features = final_features,
                        outcome = REGRESSION_VARIABLE,
                        regression_model = candidate,
                        validation_schema = VALIDATION_SCHEMA,  
                        do_log_features_log_transform = True,
                        do_boxcox=False,
                        do_IsolationForest_filtering = True,
                        do_scale_data = True, 
                        quantile_transform = True,
                        do_pca = True,
                        contamination = 'auto',
                        is_processed = True
                       )
        # Fix: the other callers in this file unpack a (results, importances)
        # pair from validate_regression; indexing that tuple with a column
        # name fails. Accept either a tuple or a bare frame.
        results = outcome[0] if isinstance(outcome, tuple) else outcome

        new_row = {
            'Organization':org_name,
            'model' : model_name
        }
        new_row.update(combination)
        new_row.update({
            'Mean Test SA' : results['Test SAE'].mean(),
            'Mean Test MRE' : results['Test MAPE'].mean(),
            'Mean Test MAE' : results['Test MAE'].mean()
        })
        all_results.append(new_row)
    return pd.DataFrame(all_results)

def run_RQ2(orgs,model = ExtraTreesRegressor(n_jobs=-1),model_name = 'ExtraTrees_dim_validation',dims = DIMS
            ,nb_repetitions = 10,results_path = './results/RQ2_results') : 
    """Validate one model on each feature dimension of each organization.

    Parameters
    ----------
    orgs : iterable of str
        Organization names passed to ``set_global_variables``.
    model :
        Estimator template; a deep copy is used for every validation run.
    model_name : str
        Stored in the 'model' column of the results.
    dims : dict
        Maps a dimension name to an iterable of feature names.
    nb_repetitions : int
        Number of validation runs per (organization, dimension) pair.
    results_path : str
        Directory (created if missing) that receives ``<org>_RQ2.csv``.

    Returns
    -------
    pandas.DataFrame
        All per-fold rows over every organization and dimension, tagged with
        'Organization', 'model', 'dim', 'fold' and 'repitation'. The original
        built this frame and then discarded it; returning it is
        backward-compatible and matches ``run_RQ1``.
    """
    all_results = []
    os.makedirs(results_path,exist_ok=True)
    for org in orgs : 
        org_results = []
        print('working on:',org)
        set_global_variables(org)
        data = prepare_data( lambda data:filter_data(data,min_delay_hours=12,max_delay_hours=24*30,projects = None))
        print('validating dims with ',model_name)
        for dim_name, dim in dims.items() : 
            print('validating dim ',dim_name)
            # De-duplicate while keeping first-occurrence order. list(set(...))
            # gave an order that depends on string hashing and can differ
            # between interpreter runs.
            dim_features = list(dict.fromkeys(dim))
            for repetation in range(nb_repetitions) : 
                if repetation % 5 == 0 :
                    print('repetation:',repetation)
                results,_  = validate_regression(data=data,
                                               regression_model=copy.deepcopy(model),do_boxcox = False,do_scale_data=True,
                                               do_log_features_log_transform=True,do_pca=False,
                                               quantile_transform=True,contamination = 'auto',features = dim_features)

                for index,row in results.iterrows() : 
                    dict_row = dict(row)
                    dict_row['Organization'] = org
                    dict_row['model'] = model_name
                    dict_row['dim'] = dim_name
                    dict_row['fold'] = index
                    dict_row['repitation'] = repetation
                    all_results.append(dict_row)
                    org_results.append(dict_row)

        org_results = pd.DataFrame(org_results)
        org_results.to_csv(os.path.join(results_path,f'{org}_RQ2.csv'),index=False)
    return pd.DataFrame(all_results)

## Parameter Tuning

In [None]:
# Grid-search every model in ALL_MODELS for every organization.
# Warning: this cell may take a long time to run.
PT_results = "./PT_results"
for org in ['Android', 'Qt', 'Openstack', 'Libreoffice', 'Eclipse']:
    print('Processing:', org)
    set_global_variables(org)

    # One output directory per organization.
    project_results_path = os.path.join(PT_results, org)
    os.makedirs(project_results_path, exist_ok=True)

    data = prepare_data(
        lambda data: filter_data(data, min_delay_hours=6, max_delay_hours=24*30, projects=None)
    )
    for model_name, model_params in ALL_MODELS.items():
        print('Tunning model:', model_name)
        print('Grid:', model_params['grid'])
        tunning_results = tune_params(
            data, org, model_params['default'], model_name, grid=model_params['grid']
        )
        # One CSV per (organization, model).
        tunning_results.to_csv(
            os.path.join(project_results_path, f'{model_name}_tunning_results.csv'),
            index=False,
        )

## RQ1: model performance

In [None]:
# The hyper-parameters below were chosen from the parameter-tuning (PT) results.
ml_models_results = run_RQ1(
    ['Android', 'Libreoffice', 'Eclipse', 'Qt', 'Openstack'],
    models={
        'ExtreTrees': ExtraTreesRegressor(n_jobs=-1, max_depth=10, n_estimators=500),
        'RandomForest': RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=500),
        'AdaBoost': AdaBoostRegressor(n_estimators=100, learning_rate=0.01, loss='linear'),
        'XGBOOST': xg.XGBRegressor(n_jobs=-1, max_depth=5, learning_rate=0.01, n_estimators=500),
        'DecisionTree': DecisionTreeRegressor(criterion='friedman_mse', max_depth=5, splitter='best'),
        # Baselines the learned models are compared against.
        'Median': DummyRegressor(strategy="median"),
        'Mean': DummyRegressor(strategy='mean'),
        'RandomGuess': RandomGuess(),
    },
    nb_repetations=1,
    report_feature_importance=False,
    run_random_guess=False,
)


## RQ2: dimension validation

In [None]:
# Validate each feature dimension in DIMS with a default ExtraTrees model.
run_RQ2(
    ['Libreoffice', 'Android', 'Openstack', 'Eclipse', 'Qt'],
    model=ExtraTreesRegressor(n_jobs=-1),
    model_name='ExtraTrees',
    nb_repetitions=1,
    dims=DIMS,
    results_path='./results/RQ2_val',
)

## RQ3: feature importance

In [None]:
# Re-run the RQ1 pipeline on the per-organization feature subsets and export
# the per-fold feature importances.
results = run_RQ1(
    ['Libreoffice', 'Android', 'Qt', 'Openstack', 'Eclipse'],
    models={'ExtreTrees_feature_ranking_reduced': ExtraTreesRegressor()},
    nb_repetations=1,
    features_per_org=RQ3_SELECTED_FEATURES,
    results_folder='./results/RQ3_results_validation',
    report_feature_importance=True,
)

## Cross project validation

In [None]:
# Cross-project validation over every ordered pair of distinct projects.
run_cross_projects(
    ['Libreoffice', 'Android', 'Eclipse', 'Openstack', 'Qt'],
    model_name='ExtraTrees',
    model=ExtraTreesRegressor(),
    repetations=1,
)