In [10]:
import os
import os.path as osp
import pandas as pd
import numpy as np
from datetime import datetime

from datetime import timedelta

from xgboost import XGBClassifier
from sklearn.metrics import  precision_score, recall_score, roc_auc_score, brier_score_loss
from sklearn.base import clone
from sklearn.model_selection import TimeSeriesSplit
from imblearn.under_sampling import RandomUnderSampler

# %matplotlib inline
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from alibi.explainers import ALE, plot_ale

from utils import helpers as hpr
from utils import constants
import utils.classifier_util as clas_util

from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform

### Constants

In [2]:
METRICS = constants.get_metrics()
# metric_imp = pd.read_csv(osp.join('.', 'Files', 'third_metric_importances.csv'))
# metric_imp = metric_imp.set_index('m')['imp'].to_dict()

### Dependent changes

In [4]:
df_dependent_changes = pd.read_csv(osp.join('.', 'Files', 'all_dependencies.csv'))

In [5]:
dependent_changes = set(hpr.flatten_list(df_dependent_changes[['Source', 'Target']].values))
# cross_pro_changes = set(hpr.flatten_list(df_dependent_changes.loc[df_dependent_changes['is_cross']==True, ['Source', 'Target']].values))
# within_pro_changes = dependent_changes.difference(cross_pro_changes)

### Load changes

In [26]:
df_changes = hpr.combine_openstack_data()
df_changes['changed_files'] = df_changes['changed_files'].map(hpr.combine_changed_file_names)
df_changes['commit_message'] = df_changes['commit_message'].map(hpr.preprocess_change_description)
# df_changes = df_changes[df_changes['number'].isin(dependent_changes)]
# all_change_ids = df_changes['number'].unique()
# df_changes['reviewers'] = df_changes['reviewers'].map(ast.literal_eval)
# df_changes['reviewers'] = df_changes['reviewers'].map(lambda x: [rev['_account_id'] for rev in x])

Reading OpenStack changes...
OpenStack changes loaded successfully...


In [8]:
min_date = datetime(2014, 1, 1)

In [9]:
# df_changes = df_changes[(df_changes['status']!='NEW')&(df_changes['created']>=min_date)]
df_changes = df_changes.drop_duplicates(subset=['change_id'], keep='last')

In [None]:
changed_files = dict(zip(df_changes['number'], df_changes['changed_files']))
changes_description = dict(zip(df_changes['number'], df_changes['commit_message']))
added_lines = dict(zip(df_changes['number'], df_changes['added_lines']))
deleted_lines = dict(zip(df_changes['number'], df_changes['deleted_lines']))

### Import pairs of changes with depends-on and needed-by tags

In [48]:
df_dependencies = pd.read_csv(osp.join(".", "Files", "Preliminary", "deps_ident.csv"))
df_dependencies = df_dependencies.loc[(df_dependencies['Source_status']!='NEW')&(df_dependencies['Target_status']!='NEW')]
df_dependencies['related'] = 1

In [51]:
df_deps_red = df_dependencies[['when_identified']]

# Calculate Z-scores
z_scores = np.abs((df_deps_red - df_deps_red.mean()) / df_deps_red.std())

# Set a threshold for identifying outliers
threshold = 3

# Filter out the outliers
df_clean = df_deps_red[(z_scores < threshold).all(axis=1)]

In [61]:
df_dependencies = df_dependencies[df_dependencies.index.isin(df_clean.index)]

In [None]:
# df_changes[df_changes['status']=="MERGED"]
df_dependencies[(df_dependencies['Source_status']=="MERGED")&(df_dependencies['Target_status']=="MERGED")]

### Combine features

In [6]:
def compute_pctg_cross_project_changes(row):
    dominator = row['cross_project_changes'] + row['within_project_changes']
    if dominator == 0:
        return 0
    return row['cross_project_changes'] / dominator

def compute_pctg_whole_cross_project_changes(row):
    dominator = row['whole_cross_project_changes'] + row['whole_within_project_changes']
    if dominator == 0:
        return 0
    return row['whole_cross_project_changes'] / dominator

def compute_ptg_cross_project_changes_owner(row):
    dominator = row['cross_project_changes_owner'] + row['within_project_changes_owner']
    if dominator == 0:
        return 0
    return row['cross_project_changes_owner'] / dominator

def combine_features():
    metric_path = osp.join('.', 'Files', 'Metrics')
    metric_list = hpr.list_file(metric_path)
    df = pd.read_csv(f'{metric_path}/{metric_list[0]}')
    for metric_file in metric_list[1:]:
        df_metric = pd.read_csv(f'{metric_path}/{metric_file}') 
        # Join source and target changes with features of changes
        df = pd.merge(
            left=df, 
            right=df_metric, 
            left_on='number', 
            right_on='number', 
            how='inner',
            suffixes=('_target', '_source')
        )

    df['project_age'] /= (60 * 60 * 24)

    df['pctg_cross_project_changes'] = df.apply(compute_pctg_cross_project_changes, axis=1)
    # df['pctg_whole_cross_project_changes'] = df.apply(compute_pctg_whole_cross_project_changes, axis=1)
    df['ptg_cross_project_changes_owner'] = df.apply(compute_ptg_cross_project_changes_owner, axis=1)

    df.drop(columns=['num_build_failures'], inplace=None)
    
    return df

def is_cross_project(number):
    if number in cross_pro_changes:
        return 1
    elif number in within_pro_changes:
        return 0
    else:
        return 2

In [4]:
df = clas_util.combine_features()
df = df.drop(columns=['num_build_failures'])
# df = pd.merge(
#     left=df, 
#     right=df_changes[['number', 'created', 'project', 'owner_account_id']], 
#     left_on='number', 
#     right_on='number', 
#     how='inner',
#     suffixes=('_source', '_target')
# )
# df['is_dependent'] = df['number'].map(lambda nbr: 1 if nbr in dependent_changes else 0)
# df['is_cross'] = df['number'].map(is_cross_project)

### Build dataset for the 3rd model

In [6]:
def assign_past_changes(row, sampling=True):
    days_offset = row['created'] - timedelta(days=38.06)
    source_changes = df_changes.loc[
        (df_changes['status'] == 'MERGED') &
        (df_changes['created'] < row['created']) &
        (df_changes['created'] >= days_offset),
        ['number']
    ]
    if (len(source_changes) > 0) and (sampling == True):
        source_changes = source_changes.sample(n=30, replace=True, random_state=42)

    source_changes = source_changes['number'].tolist()

    # if len(source_changes) >= 60:
    #     source_changes = random.sample(source_changes, 60)
    
    source_changes += df_dependencies.loc[
        (df_dependencies['Target']==row['Target']), 
        'Source'].tolist()
    return set(source_changes)

In [7]:
def compute_common_dev_pctg(row):
    dev_source = df.loc[
        (df['project'] == row['project_source']) &
        (df['created'] < row['created_target']),
        'owner_account_id'
    ].unique()
    dev_target = df.loc[
        (df['project'] == row['project_target']) &
        (df['created'] < row['created_target']),
        'owner_account_id'
    ].unique()

    union = len(set(dev_source).union(dev_target))
    intersect = len(set(dev_source).intersection(dev_target))
    return intersect/union if union != 0 else 0

def count_dev_in_src_change(row):
    changes_nbr = df.loc[
        (df['project'] == row['project_source']) &
        (df['created'] < row['created_target']) &
        (df['owner_account_id'] == row['owner_account_id_target']),
        'number'
    ].nunique()

    return changes_nbr

def count_rev_in_src_change(row):
    account_id = row['owner_account_id_target']
    reviewers = df.loc[
        (df['project'] == row['project_source']) &
        (df['created'] < row['created_target']) & 
        (df['owner_account_id'] != account_id), 'reviewers'].values
    rev_exists = [account_id in reviewers_list for reviewers_list in reviewers]
    return sum(rev_exists)

def count_src_trgt_co_changed(row):
    return len(df_dependent_changes[
        (df_dependent_changes['project_source'] == row['project_source']) &
        (df_dependent_changes['project_target'] == row['project_target']) &
        (df_dependent_changes['created_target'] < row['created_target'])
    ])

def get_features_labels():
    X = pd.DataFrame({})
    X_path = osp.join('.', 'Files', 'Data', 'Model3')
    for f in hpr.list_file(X_path):
        X_item = pd.read_csv(f'{X_path}/{f}')
        X = pd.concat((X, X_item))
    X.sort_values(by=['Target', 'Source'], inplace=True)
    
    y = X['related']

    # X.drop(columns=['Source', 'Target', 'project', 'owner_account_id', 'number_target', 'number_source', 'number', 'created_target', 'created_source', 'related'], inplace=True)
    X.drop(columns=['Source', 'Target', 'related'], inplace=True)

    return X, y

### Base estimators

In [5]:
def assign_features(X, fold):
    print(f'******** Started assigning pairs of changes for Fold {fold}')
    X = pd.merge(
        left=X, 
        right=df, 
        left_on='Source', 
        right_on='number', 
        how='inner',
        suffixes=('_target', '_source')
    )
    X = pd.merge(
        left=X, 
        right=df, 
        left_on='Target', 
        right_on='number', 
        how='inner',
        suffixes=('_target', '_source')
    )
    # y = X['related'].values
    # X.drop(columns=['related', 'number_source', 'number_target'], axis=1, inplace=True)
    X.drop(columns=['number_source', 'number_target'], inplace=True)

    # if X.empty == False:
    #     X.to_csv(osp.join('.', 'Files', 'Data', 'Pairs', f'{target}.csv'), index=None)

    return X

    
def assign_pair_features(X):
    # print(f'******** Assigning pairs\'s features')
    X = pd.merge(
        left=X, 
        right=df[['number', 'owner_account_id', 'project']], 
        left_on='Source', 
        right_on='number', 
        how='left',
        suffixes=('_target', '_source')
    )

    X.drop(columns=['owner_account_id_source', 'project_source', 'owner_account_id_target', 'project_target'], axis=1, inplace=True)

    # print(f'******** Pairs\'s features were assigned successfully...')

    return X

In [9]:
test_path = osp.join('.', 'Files', 'Data', 'Test')
test_files = hpr.list_file(test_path)

for fn in test_files:
    file_path = f'{test_path}/{fn}'
    # print(f'Processing {file_path}')
    df_test = pd.read_csv(file_path)
    df_test = df_test[['Source', 'Target', 'related']+constants.PAIR_METRICS]
    df_test = assign_features(df_test, fn)
    df_test.to_csv(file_path, index=None)
    print(f'{file_path} processed successfully...')

******** Started assigning pairs of changes for Fold 6.csv
./Files/Data/Test/6.csv processed successfully...
******** Started assigning pairs of changes for Fold 7.csv
./Files/Data/Test/7.csv processed successfully...
******** Started assigning pairs of changes for Fold 5.csv
./Files/Data/Test/5.csv processed successfully...
******** Started assigning pairs of changes for Fold 4.csv
./Files/Data/Test/4.csv processed successfully...
******** Started assigning pairs of changes for Fold 0.csv
./Files/Data/Test/0.csv processed successfully...
******** Started assigning pairs of changes for Fold 1.csv
./Files/Data/Test/1.csv processed successfully...
******** Started assigning pairs of changes for Fold 3.csv
./Files/Data/Test/3.csv processed successfully...
******** Started assigning pairs of changes for Fold 2.csv
./Files/Data/Test/2.csv processed successfully...
******** Started assigning pairs of changes for Fold 9.csv
./Files/Data/Test/9.csv processed successfully...
******** Started as

In [None]:
test_path = osp.join('.', 'Files', 'Data', 'Pipeline')
test_files = hpr.list_file(test_path)

for fn in test_files:
    file_path = f'{test_path}/{fn}'
    print(f'Processing {file_path}')
    df_test = pd.read_csv(file_path)
    df_test['num_shrd_file_tkns'] = df_test[['Source', 'Target']].apply(clas_util.compute_filenames_shared_tokens, args=(changed_files,), axis=1)
    df_test['num_shrd_desc_tkns'] = df_test[['Source', 'Target']].apply(clas_util.compute_shared_desc_tokens, args=(changes_description,), axis=1)
    df_test['related'] = df_test['related'].fillna(False)
    df_test.to_csv(file_path, index=None)
    print(f'{file_path} processed successfully...')

In [3]:
ensemble_classifiers = clas_util.load_classifiers()

In [11]:
df_features = pd.read_csv(osp.join(".", "Results", "Correlation", "second_model.csv"))

In [18]:
df_remai_feat = X_train.columns.tolist()
df_remai_feat = [f for f in df_remai_feat if f not in df_features['Feat'].tolist()]
df_remai_feat = [{"Feat": f}|{f"Fold{i}": 1 for i in range(10)} for f in df_remai_feat]
df_remai_feat = pd.DataFrame(df_remai_feat)

In [60]:
training_results = {key: None for key in ensemble_classifiers.keys()}
tscv = TimeSeriesSplit(n_splits = 10)
clf_path = osp.join('.', 'Results')

# df_feat_impo = pd.DataFrame()
# df_feat_impact = pd.DataFrame()

if not os.path.exists(clf_path):
    os.makedirs(clf_path)
    
for label, ens_clf in ensemble_classifiers.items():
    print(f'Start training with {label} classifier...')

    # if label != 'RF':
    #     continue

    auc_scores = []
    precision_scores = []
    recall_scores = []
    brier_scores = []
    feature_importances = []
    features = []
    corr_features = []
    redundant_features = []


    for fold in range(0, 10):

        if fold not in [9]:
            continue

        clone_clf = clone(ens_clf)
        
        # Split training data into features and labels
        X_train = pd.read_csv(osp.join(".", "Files", "Data", "Train", f"{fold}.csv"))
        y_train = X_train['related']
        # X_train = X_train.drop(columns=["related"])
        # pd.DataFrame({'col': X_train.columns.tolist()}).to_csv("test.csv", index=None)
        # df_test = pd.concat((df_test, X_train.iloc[:1]))
        # desc_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold)
        # subject_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, "subject")
        # add_lines_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, 'added_lines')
        # del_lines_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, 'deleted_lines')

        # X_train = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_train, 'commit_message', 'desc')
        # X_train = clas_util.compute_embdedding_similarity(df_changes, subject_model, X_train, 'subject', 'subject')
        # X_train = clas_util.compute_embdedding_similarity(df_changes, add_lines_model, X_train, 'added_lines', 'add_lines')
        # X_train = clas_util.compute_embdedding_similarity(df_changes, del_lines_model, X_train, 'deleted_lines', 'del_lines')

        X_train = X_train.drop(columns=['Source', 'Target', 'related'])
        # X_train.to_csv(osp.join(".", "Files", "Data", "Train", f"{fold}.csv"), index=None)

        # corr_path = osp.join('.', 'Results', 'Correlation')
        # if fold == 0:
        #     df_features = pd.DataFrame({'Feat': X_train.columns.tolist()})
        #     for f in range(0, 10):
        #         df_features[f"Fold{f}"] = [1]*len(df_features)
        #     df_features.to_csv(f"{corr_path}/second_model.csv", index=None)
        

        # conduct the correlation analysis
        # if fold == 0:
        

       
        # if not os.path.exists(corr_path):
        #     os.makedirs(corr_path)
            
        # pd.DataFrame({'Features': corr_features}).to_csv(f'{corr_path}/{fold+1}.csv', index=None)
        # plt.figure(figsize=(6,12))
        # dissimilarity = 1 - abs(X_train.corr())
        # Z = linkage(squareform(dissimilarity), 'complete')

        # dendrogram(Z, labels=X_train.columns, orientation='left')

        # threshold = 0.3 * max(Z[:, 2])  # Scale threshold based on the maximum distance in the dendrogram
        # plt.axvline(x=threshold, color='r', linestyle='--')

        # # # Adjust the layout to make sure labels fit
        # plt.tight_layout()
        # plt.savefig(f'{corr_path}/second_model/{fold}.pdf')

        corr_features = df_features.loc[df_features[f'Fold{fold}']==0, 'Feat'].tolist()
        # Remove highly correlated features from the training set
        X_train = X_train.drop(columns=corr_features)

        # Conduct redundancy analysis
        redundant_features = clas_util.redundancy_analysis(X_train)

        # Remove indepandent variables explained by others
        X_train = X_train.drop(columns=redundant_features)

        # if fold == 0:
        #     break

        # Instantiate the OverSampler class then fit it on the each fold training dataset
        # features = X_train.columns.tolist()
        # print(f'len(X_train) {len(X_train)}')

        # Columns to exclude in the test set
        # cols_exluded = corr_features + redundant_features 

        # Train the Random Forest Classifier on the training fold set 
        clone_clf.fit(X_train, y_train)

        # rf_ale = ALE(clone_clf.predict, feature_names=X_train.columns.tolist())
        # rf_exp = rf_ale.explain(X_train.to_numpy())
        # plot_ale(rf_exp, features=ale_features, n_cols=4, fig_kw={'figwidth':14, 'figheight': 7})
        # plt.tight_layout()
        # plt.savefig(f'./Results/ALE/third_model/ale_{fold+1}_model.pdf', format='pdf')
        # continue

        X_test = pd.read_csv(osp.join(".", "Files", "Data", "Test", f"{fold}.csv"))
        y_test = X_test['related']
        # X_test = X_test.drop(columns=["related"])
        # X_test_pairs = X_test[['Source', 'Target', 'related']]

        # X_test = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_test, 'commit_message', 'desc')
        # X_test = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_test, 'subject', 'subject')
        # X_test = clas_util.compute_embdedding_similarity(df_changes, add_lines_model, X_test, 'added_lines', 'add_lines')
        # X_test = clas_util.compute_embdedding_similarity(df_changes, del_lines_model, X_test, 'deleted_lines', 'del_lines')
        
        # X_test = X_test.drop(columns=cols_dropped+corr_features+redundant_features)
        # X_test = X_test.drop(columns=['Source', 'Target'])
        X_test = X_test[X_train.columns.tolist()]
        # X_test.to_csv(osp.join(".", "Files", "Data", "Test", f"{fold}.csv"), index=None)

        # print(X_train.columns, )
        # Test the Random Forest Classifier on the test fold set 
        y_pred = clone_clf.predict(X_test)

        # y_pred_prob = clone_clf.predict_proba(X_test)[:, 1]
        # X_test_pairs['pred'] = y_pred_prob
        # compute_top_k_prec_recal(X_test_pairs, label, fold+1)

        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred))
        brier_scores.append(brier_score_loss(y_test, y_pred))

        print(f"{label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, AUC: {auc_scores[-1]}, Brier: {brier_scores[-1]}")

        # logging.debug(msg=f"{label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, AUC: {auc_scores[-1]}")
        if label == 'XGBoost': 
            df_feat_imp_item = pd.DataFrame({name: [val] for name, val in zip(X_train.columns.to_list(), clone_clf.feature_importances_)})
            df_feat_impo = pd.concat((df_feat_impo, df_feat_imp_item))

        if label not in ['MLP']:

            binary_cols = list(constants.DESCRIPTION_METRICS.keys())
            binary_cols = hpr.flatten_list([[f'{c}_source', f'{c}_target'] for c in binary_cols])

            # Step 1: Calculate the median of each column
            medians = X_train.median()

            # Step 2: Calculate the standard deviation of each column
            std_devs = X_train.std()

            # Step 3: Create a new dataframe starting with the median row
            df_feat_impact_item = pd.DataFrame([medians])
            for col in binary_cols:
                if col in X_train.columns.tolist():
                    df_feat_impact_item[col] = 0
            df_feat_impact_item = pd.concat([df_feat_impact_item] * (len(X_train.columns.to_list()) + 1), ignore_index=True)


            # Step 4: Double the number of rows according to the number of features and add standard deviation to each column
            for idx, col in zip(range(1, len(df_feat_impact_item.columns)+1), df_feat_impact_item.columns):
                # df_feat_impact_item.iloc[idx, idx-1] += std_devs[col]

                if col in binary_cols:
                    df_feat_impact_item.iloc[idx, idx-1] = 1
                else:
                    df_feat_impact_item.iloc[idx, idx-1] += std_devs[col]

            df_feat_impact_item['pred'] = clone_clf.predict_proba(df_feat_impact_item)[:,1]
            proba1 = df_feat_impact_item.iloc[0, -1]
            df_feat_impact_item['impact'] = None

            df_feat_impact_item.iloc[1:, -1] = 0 if proba1 == 0 else (df_feat_impact_item.iloc[1:, -2] - proba1) / proba1

            df_feat_impact_item['fold'] = fold
            df_feat_impact_item['Classifier'] = label

            df_feat_impact = pd.concat((df_feat_impact, df_feat_impact_item))

    # feature_importances /= (fold+1)
    prec_avg = np.average(precision_scores)
    recall_avg = np.average(recall_scores)
    auc_avg = np.average(auc_scores)
    brier_avg = np.average(brier_scores)

    print(f"{label}, Precision: {prec_avg}, Recall: {recall_avg}, AUC: {auc_avg}, Brier: {brier_avg}")

    training_results[label] = {
        'Classifier': label,
        'Precision': prec_avg,
        'Recall': recall_avg,
        'AUC': auc_avg,
        'Brier': brier_avg,
        'AUC Scores': auc_scores,
        'Precision Scores': precision_scores,
        'Recall Scores': recall_scores,
        'Brier Scores': brier_scores
    }
    # training_results[label]['Precision Scores'] += precision_scores
    # training_results[label]['AUC Scores'] += auc_scores
    # training_results[label]['Recall Scores'] += recall_scores
    # training_results[label]['Brier Scores'] += brier_scores

Start training with ET classifier...
ET, Fold: 10, Precision: 0.00729877677907684, Recall: 0.7346938775510204, AUC: 0.7002898011174263, Brier: 0.3338849720024484
ET, Precision: 0.00729877677907684, Recall: 0.7346938775510204, AUC: 0.7002898011174263, Brier: 0.3338849720024484
Start training with RF classifier...
RF, Fold: 10, Precision: 0.01635124905374716, Recall: 0.7346938775510204, AUC: 0.7934569378656748, Brier: 0.14817165786311803
RF, Precision: 0.01635124905374716, Recall: 0.7346938775510204, AUC: 0.7934569378656748, Brier: 0.14817165786311803
Start training with XGBoost classifier...
XGBoost, Fold: 10, Precision: 0.018187025633681797, Recall: 0.8639455782312925, AUC: 0.8539999022081765, Brier: 0.1558794858425336
XGBoost, Precision: 0.018187025633681797, Recall: 0.8639455782312925, AUC: 0.8539999022081765, Brier: 0.1558794858425336
Start training with AdaBoost classifier...
AdaBoost, Fold: 10, Precision: 0.009922534598311428, Recall: 0.7755102040816326, AUC: 0.7583878925057421, B

In [98]:
# training_results['RF']#['AUC Scores']
for met in ['Precision', 'Recall', 'Brier', 'AUC']:
    df_result[met] = df_result[f'{met} Scores'].map(np.mean)

In [28]:
df_result = pd.DataFrame(training_results.values())
df_result.to_csv(osp.join('.', 'Results', 'second_model_perf.csv'), index=None)

### Feature importance

In [61]:
df_feat_impo.fillna(0, inplace=True)
df_feat_impo.to_csv(osp.join('.', 'Results', 'Feature_importance', f'second_feat_impo.csv'), index=None)

### Feature Impact

In [62]:
df_feat_impact = df_feat_impact.reset_index(drop=True)
df_feat_impact = df_feat_impact[df_feat_impact['impact'].notnull()]
df_feat_impact.to_csv(osp.join('.', 'Results', 'Impact', 'second_model_feat_impact.csv'), index=None)

In [2]:
df_feat_impact = pd.read_csv(osp.join('.', 'Results', 'Impact', 'second_model_feat_impact.csv'))

In [63]:
# Creating a DataFrame from the additional data
data = {
    'Feature': [
        'num_shrd_desc_tkns', 'num_shrd_file_tkns', 'desc_sim',
        'subject_sim', 'description_length_source', 'whole_within_project_changes_source',
        'description_length_target', 'projects_contributed_owner_target', 'projects_contributed_owner_source',
        'project_age_target', 'ratio_dep_chan_owner_source', 'project_age_source',
        'whole_changes_owner_source', 'whole_changes_owner_target', 'ratio_dep_chan_owner_target',
        'num_file_changes_source', 'insertions_source', 'num_file_changes_target',
        'insertions_target', 'pctg_cross_project_changes_source', 'add_lines_sim',
        'pctg_cross_project_changes_target', 'last_mth_dep_proj_nbr_target', 'last_mth_cro_proj_nbr_target',
        'last_mth_dep_proj_nbr_source', 'cross_project_changes_target', 'src_trgt_co_changed_nbr',
        'pctg_inter_dep_cha', 'deletions_target', 'deletions_source',
        'cross_project_changes_owner_source', 'project_changes_owner_target', 'dev_in_src_change_nbr',
        'del_lines_sim', 'last_mth_cro_proj_nbr_source', 'cross_project_changes_source',
        'rev_in_src_change_nbr', 'pctg_cross_project_changes_owner_source', 'is_preventive_source',
        'pctg_cross_project_changes_owner_target', 'project_changes_owner_source', 'num_file_types_target',
        'is_corrective_target', 'num_directory_files_target', 'num_file_types_source',
        'has_feature_addition_source', 'is_preventive_target', 'is_corrective_source',
        'is_refactoring_source', 'is_refactoring_target', 'has_feature_addition_target',
        'num_directory_files_source', 'is_merge_source', 'is_non_functional_target',
        'is_non_functional_source', 'is_merge_target'
    ],
    'Ranking': [
        1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 10, 10, 10, 11, 11, 12, 13, 13, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 21, 21, 22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 27, 28, 29, 30, 30, 31, 31, 32, 33, 33, 34, 34, 35
    ]
}

# Creating the dataframe
# Creating the dataframe
df_most_impo_feat = pd.DataFrame(data)
classifiers = df_feat_impact['Classifier'].unique().tolist()
df_most_impo_feat['Classifier'] = [classifiers for _ in range(len(df_most_impo_feat))]
df_most_impo_feat = df_most_impo_feat.explode("Classifier")
df_most_impo_feat.reset_index(drop=True, inplace=True)

In [67]:
def map_impact_to_fold(row):
    # print(row)
    for i in range(10):
        row[f'fold{i}'] = df_feat_impact.loc[(df_feat_impact['Classifier']==row['Classifier'])&(df_feat_impact['fold']==i), [row['Feature'],  'fold', 'impact']].sort_values(by=row['Feature']).iloc[-1, -1]
    return row

def retrieve_impact(row, func):
    df_sub = [row[f'fold{i}'] for i in range(10)]
    return func(df_sub)

In [57]:
df_feat_impact.loc[(df_feat_impact['Classifier']=='ET')&(df_feat_impact['fold']==0), ['num_shrd_desc_tkns',  'fold', 'impact']].sort_values(by='num_shrd_desc_tkns').iloc[-1, -1]

0

In [68]:
df_most_impo_feat = df_most_impo_feat.apply(map_impact_to_fold, axis=1)
# df_most_impo_feat['mean'] = df_most_impo_feat.apply(retrieve_impact, args=(np.mean,), axis=1) 
# df_most_impo_feat['median'] = df_most_impo_feat.apply(retrieve_impact, args=(np.median,), axis=1) 
# df_most_impo_feat['max'] = df_most_impo_feat.apply(retrieve_impact, args=(max,), axis=1) 

In [7]:
for idx, row in df_most_impo_feat[:10].iterrows():
    print(row["Feature"].replace("_", "\\_")+" & "+str(row["Ranking"])+" & "+str(round(row['min'], 2))+" & "+str(round(row['median'], 2))+" & "+str(round(row['max'], 2))+" \\\\")

num\_shrd\_desc\_tkns & 1 & 2.06 & 2.5 & 3.48 \\
num\_shrd\_file\_tkns & 2 & 1.0 & 1.33 & 2.24 \\
desc\_sim & 3 & -0.1 & 0.0 & 0.15 \\
subject\_sim & 4 & -0.03 & 0.16 & 0.43 \\
description\_length\_source & 5 & -0.07 & 0.16 & 0.42 \\
whole\_within\_project\_changes\_source & 6 & -0.21 & -0.06 & 0.37 \\
description\_length\_target & 7 & -0.17 & 0.15 & 0.33 \\
projects\_contributed\_owner\_target & 8 & -0.21 & -0.12 & 0.0 \\
projects\_contributed\_owner\_source & 8 & -0.33 & -0.12 & 0.08 \\
project\_age\_target & 9 & -0.12 & -0.02 & 0.1 \\


In [69]:
df_most_impo_feat.to_csv(osp.join('.', 'Results', 'second_feat_impact_import.csv'), index=None)

### Dimension importance

In [11]:
M1_METRICS = df.columns.tolist()
CHANGE_METRICS = [col for col in constants.CHANGE_METRICS if col in M1_METRICS]
TEXT_METRICS = [col for col in constants.TEXT_METRICS if col in M1_METRICS]
DEVELOPER_METRICS = [col for col in constants.DEVELOPER_METRICS if col in M1_METRICS]
PROJECT_METRICS = [col for col in constants.PROJECT_METRICS if col in M1_METRICS]
FILE_METRICS = [col for col in constants.FILE_METRICS if col in M1_METRICS]
CHANGE_METRICS = [f'{cm}_source' for cm in CHANGE_METRICS] + [f'{cm}_target' for cm in CHANGE_METRICS]
TEXT_METRICS = [f'{cm}_source' for cm in TEXT_METRICS] + [f'{cm}_target' for cm in TEXT_METRICS]
DEVELOPER_METRICS = [f'{cm}_source' for cm in DEVELOPER_METRICS] + [f'{cm}_target' for cm in DEVELOPER_METRICS]
PROJECT_METRICS = [f'{cm}_source' for cm in PROJECT_METRICS] + [f'{cm}_target' for cm in PROJECT_METRICS]
FILE_METRICS = [f'{cm}_source' for cm in FILE_METRICS] + [f'{cm}_target' for cm in FILE_METRICS]

dimensions = {
    'Change': CHANGE_METRICS,
    'Text': TEXT_METRICS,
    'Developer': DEVELOPER_METRICS,
    'Project': PROJECT_METRICS,
    'File': FILE_METRICS,
    'Pairs': constants.PAIR_METRICS
}

In [16]:
dimension_results = {key: [] for key in dimensions.keys()}
tscv = TimeSeriesSplit(n_splits = 10)
clf_path = osp.join('.', 'Results')
dimension_type = 'discard'

if not os.path.exists(clf_path):
    os.makedirs(clf_path)
    
for dim_label, dim_feats in dimensions.items():
    print(f'Start training with {dim_label} dimension...')

    # if dim == 'Pairs':
    #     continue

    features = []
    if dimension_type == 'keep':
        features = dim_feats
    else:
        for lab, dim in dimensions.items():
            if lab != dim_label:
                features += dim

    auc_scores = []
    precision_scores = []
    recall_scores = []
    brier_scores = []
    feature_importances = []
    corr_features = []
    redundant_features = []

    for fold in range(0, 10):

        # if fold in [3, 4]:
        #     continue

        clone_clf = XGBClassifier(random_state=42)
        
        # Split training data into features and dims
        X_train = pd.read_csv(osp.join(".", "Files", "Data", "Train", f"{fold}.csv"))
        y_train = X_train['related']

        # df_test = pd.concat((df_test, X_train.iloc[:1]))
        # if (dimension_type == 'keep' and dim_label == 'Pairs') or (dimension_type == 'discard' and dim_label != 'Pairs'):
        #     desc_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold)
        #     subject_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, "subject")
        #     add_lines_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, 'added_lines')
        #     del_lines_model = clas_util.doc2vec_model(df_changes, X_train[['Source', 'Target']].values, fold, 'deleted_lines')

        #     X_train = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_train, 'commit_message', 'desc')
        #     X_train = clas_util.compute_embdedding_similarity(df_changes, subject_model, X_train, 'subject', 'subject')
        #     X_train = clas_util.compute_embdedding_similarity(df_changes, add_lines_model, X_train, 'added_lines', 'add_lines')
        #     X_train = clas_util.compute_embdedding_similarity(df_changes, del_lines_model, X_train, 'deleted_lines', 'del_lines')

        corr_features = df_features.loc[df_features[f'Fold{fold}']==0, 'Feat'].tolist()
        features = [c for c in features if c not in corr_features]

        X_train = X_train[features]

        # X_train = X_train.drop(columns=cols_dropped)

        # ros = RandomUnderSampler(random_state=0)
        
        # Perform under-sampling of the majority class(es)
        # X_train, y_train = ros.fit_resample(X_train, y_train)

        # conduct the correlation analysis
        # if fold == 0:
        

        # # Remove highly correlated features from the training set
        # X_train = X_train.drop(columns=corr_features)

        # Conduct redundancy analysis
        # if len(X_train.columns) > 1:
        #     redundant_features = clas_util.redundancy_analysis(X_train)

        # Remove indepandent variables explained by others
        # if len(redundant_features) != 0:
        #     X_train = X_train.drop(columns=redundant_features)

        # Instantiate the OverSampler class then fit it on the each fold training dataset
        # features = X_train.columns.tolist()
        # print(f'len(X_train) {len(X_train)}')

        # Columns to exclude in the test set
        # cols_exluded = corr_features + redundant_features 

        X_test = pd.read_csv(osp.join(".", "Files", "Data", "Test", f"{fold}.csv"))
        # X_test_pairs = X_test[['Source', 'Target', 'related']]
        y_test = X_test['related']

        # if (dimension_type == 'keep' and dim_label == 'Pairs') or (dimension_type == 'discard' and dim_label != 'Pairs'):
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, desc_model, X_test, 'commit_message', 'desc')
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, subject_model, X_test, 'subject', 'subject')
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, add_lines_model, X_test, 'added_lines', 'add_lines')
        #     X_test = clas_util.compute_embdedding_similarity(df_changes, del_lines_model, X_test, 'deleted_lines', 'del_lines')
        
        X_test = X_test[X_train.columns.tolist()]

        # Train the Random Forest Classifier on the training fold set 
        clone_clf.fit(X_train, y_train)

        # Test the Random Forest Classifier on the test fold set 
        y_pred = clone_clf.predict(X_test)

        # y_pred_prob = clone_clf.predict_proba(X_test)[:, 1]
        # X_test_pairs['pred'] = y_pred_prob
        # compute_top_k_prec_recal(X_test_pairs, dim, fold+1)

        precision_scores.append(precision_score(y_test, y_pred))
        recall_scores.append(recall_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_pred))
        brier_scores.append(brier_score_loss(y_test, y_pred))

        print(f"{dim_label}, Fold: {fold+1}, Precision: {precision_scores[-1]}, Recall: {recall_scores[-1]}, AUC: {auc_scores[-1]}, Brier: {brier_scores[-1]}")

    # feature_importances /= (fold+1)
    prec_avg = np.average(precision_scores)
    recall_avg = np.average(recall_scores)
    auc_avg = np.average(auc_scores)
    brier_avg = np.average(brier_scores)

    print(f"{dim_label}, Precision: {prec_avg}, Recall: {recall_avg}, AUC: {auc_avg}, Brier: {brier_avg}")

    dimension_results[dim_label] += [{
        'Dimension': dim_label,
        'Precision': prec_avg,
        'Recall': recall_avg,
        'AUC': auc_avg,
        'Brier': brier_avg
    }]

Start training with Change dimension...
Change, Fold: 1, Precision: 0.009780246317314658, Recall: 0.6230769230769231, AUC: 0.7802770258224332, Brier: 0.06283416350591783
Change, Fold: 2, Precision: 0.008028352379574714, Recall: 0.7872340425531915, AUC: 0.847103421602174, Brier: 0.09314160641318416
Change, Fold: 3, Precision: 0.011615989067304407, Recall: 0.6538461538461539, AUC: 0.7964330013917226, Brier: 0.061292382844028326
Change, Fold: 4, Precision: 0.010235870048954161, Recall: 0.7379679144385026, AUC: 0.8346911940613565, Brier: 0.06877127760633027
Change, Fold: 5, Precision: 0.01065552808797204, Recall: 0.8223684210526315, AUC: 0.8790119896285096, Brier: 0.06443982827863177
Change, Fold: 6, Precision: 0.007017133500964856, Recall: 0.7547169811320755, AUC: 0.8112238299438321, Brier: 0.13240911460156682
Change, Fold: 7, Precision: 0.01100784986014617, Recall: 0.782051282051282, AUC: 0.8379847800613145, Brier: 0.10625036238186351
Change, Fold: 8, Precision: 0.009901738473167044, Rec

In [17]:
dim_imp = pd.DataFrame([item[0] for item in list(dimension_results.values())])
dim_imp.to_csv(osp.join('.', 'Results', 'Feature_importance', f'second_model_{dimension_type}_dim.csv'), index=None)

### Impact

In [11]:
df_feat_impact = pd.read_csv(osp.join('.', 'Results', 'Impact', f'third_model_feat_impact.csv'))

In [26]:
# attr = "last_mth_mod_uniq_proj_nbr_target"
# test = df_feat_impact#.loc[(df_feat_impact['fold']!=0)&(df_feat_impact[attr].duplicated()==False), [attr, "fold", "impact"]].sort_values("impact")
# test = test[test['impact'].notnull()==True].iloc[:-1, -1]
# test = f" & {round(test.min(), 2)} & {round(test.median(), 2)} & {round(test.max(), 2)}"