In [None]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve, auc, roc_auc_score, confusion_matrix, matthews_corrcoef, f1_score, precision_score
from numpy import interp
import matplotlib.pyplot as plt
import pickle
from imblearn.over_sampling import SMOTE


Main Filepaths

In [None]:
# Directory where save_variables / load_variables write and read the pickled results.
ml_vars_dir = "/Users/jorismachon/Documents/thesis/ML_data/vars"
# NOTE: data_in_path is assigned twice here, so the JSON path below is immediately
# overridden by the CSV path. The run cell assigns data_in_path once more.
data_in_path = "/Users/jorismachon/Documents/thesis/tabulated_commits_v14_nov.json"
data_in_path = "/Users/jorismachon/Documents/thesis/ML_data/subsets/allValid.csv"

Data Preprocessing Functions

In [None]:
def bool_cols_to_bin_cols(df):
    """Cast the fixed set of flag columns of ``df`` to integers, in place.

    Args:
        df: DataFrame containing the columns "fix", "is_vulnerable",
            "neutral", "new_author", "oop_php_files_exist" and
            "authorPresent". A missing column raises ``KeyError``.

    Returns:
        None; ``df`` itself is modified.
    """
    flag_columns = (
        "fix",
        "is_vulnerable",
        "neutral",
        "new_author",
        "oop_php_files_exist",
        "authorPresent",
    )
    # True/False become 1/0 so the columns can be used as numeric features.
    for flag in flag_columns:
        df[flag] = df[flag].astype(int)

In [None]:
def create_time_diff_col(df):
    """Add ``commit_time`` and ``time_difference`` columns to ``df`` in place.

    ``commit_time`` is ``author_date`` (parsed as epoch milliseconds) shifted
    by ``author_timezone`` (parsed as an offset in seconds).
    ``time_difference`` is the number of whole days between
    ``author_info.created_at`` and ``commit_time``. It is -1 when ``author_x``
    is missing or when the difference cannot be computed (for example when the
    creation date is -1 or unparsable).

    The columns ``author_info.created_at``, ``author_date`` and
    ``author_timezone`` are overwritten with their parsed values.

    Args:
        df: DataFrame with the columns ``author_info.created_at``,
            ``author_date``, ``author_timezone`` and ``author_x``.

    Returns:
        None; ``df`` itself is modified.
    """
    # -1 is treated as "missing" for the creation date. The result is assigned
    # back instead of using replace(..., inplace=True) on the column selection:
    # that is chained assignment and does not update the frame under pandas
    # Copy-on-Write.
    df['author_info.created_at'] = df['author_info.created_at'].replace(-1, np.nan)
    df['author_date'] = pd.to_datetime(df['author_date'], unit='ms')
    df['author_timezone'] = pd.to_timedelta(df['author_timezone'], unit='s')
    # Combine author_date and author_timezone into the actual date and time of the commit.
    df['commit_time'] = df['author_date'] + df['author_timezone']

    df['author_info.created_at'] = pd.to_datetime(df['author_info.created_at'], errors='coerce', utc=True)
    df['commit_time'] = pd.to_datetime(df['commit_time'], errors='coerce', utc=True)

    # `df['author_x'] != None` compares element-wise and is True for every row
    # (missing values included), so it never filtered anything. notna() is the
    # check that actually excludes rows without an author.
    author_known = df['author_x'].notna()
    elapsed_days = (df['commit_time'] - df['author_info.created_at']).dt.days
    # Rows without an author, or with an undefined difference, get the -1 sentinel.
    df['time_difference'] = elapsed_days.where(author_known).fillna(-1)

In [None]:
# def balancing(df, desired_ratio):    
#     y = df['is_vulnerable']
#     X = df.drop(columns=['is_vulnerable'])
#     rus = RandomUnderSampler(sampling_strategy=desired_ratio, random_state=42)
#     X_balanced, y_balanced = rus.fit_resample(X, y)
#     return X_balanced, y_balanced

In [None]:
def apply_smote(X_train, y_train, desired_ratio):
    """Oversample the training data with SMOTE.

    Args:
        X_train: Training features.
        y_train: Training labels.
        desired_ratio: Value forwarded to SMOTE as ``sampling_strategy``.

    Returns:
        The resampled ``(X, y)`` pair produced by ``fit_resample``.
    """
    # Fixed seed so repeated runs generate the same synthetic samples.
    oversampler = SMOTE(sampling_strategy=desired_ratio, random_state=42)
    resampled_features, resampled_labels = oversampler.fit_resample(X_train, y_train)
    return resampled_features, resampled_labels

In [None]:
def calc_vif(X):
    """Compute the variance inflation factor of every column of ``X``.

    Args:
        X: DataFrame of features.

    Returns:
        DataFrame with one row per column of ``X``: the column name in
        "variables" and its VIF in "VIF".
    """
    column_count = X.shape[1]
    scores = [variance_inflation_factor(X.values, position) for position in range(column_count)]
    return pd.DataFrame({"variables": X.columns, "VIF": scores})

In [None]:
def feature_selection(df_in):
    """Drop the feature with the highest VIF, repeatedly, until no VIF exceeds 5.

    After each removal the VIFs of the remaining columns are recomputed with
    ``calc_vif``. A summary of the kept and dropped columns is printed.

    Args:
        df_in: DataFrame of candidate features.

    Returns:
        ``df_in`` restricted to the surviving columns.
    """
    initial_count = df_in.shape[1]
    vif_table = calc_vif(df_in)

    while vif_table.VIF.max() > 5:
        # First row holding the current maximum; column 0 is the feature name.
        top_rows = vif_table.loc[vif_table['VIF'] == vif_table['VIF'].max()]
        worst_feature = top_rows.iloc[0, 0]
        survivors = vif_table.loc[vif_table['variables'] != worst_feature, 'variables'].tolist()
        vif_table = calc_vif(df_in[survivors])

    X = df_in[vif_table.variables.tolist()]
    final_count = X.shape[1]
    print("Selected ", final_count, " out of ", initial_count, " features.")
    dropped = [col for col in df_in.columns if col not in X.columns]
    print("Dropped features: ", dropped)
    return X

In [None]:
def data_preprocessing(source):
    """Load a dataset and build the feature matrix and label vector.

    Steps: read the file, fill missing values in numeric columns with the
    column mean, split off the ``is_vulnerable`` label, drop ``appname`` and
    keep only the features selected by ``feature_selection`` (VIF based).

    Args:
        source: Path to a ``.csv`` or ``.json`` file.

    Returns:
        Tuple ``(X, y)``: the selected feature columns and the
        ``is_vulnerable`` column.

    Raises:
        ValueError: If ``source`` ends with neither ``.csv`` nor ``.json``.
    """
    print("Pre-processing data...")

    # Pick the reader from the file extension.
    if source.endswith('.csv'):
        tc_df = pd.read_csv(source)
    elif source.endswith('.json'):
        tc_df = pd.read_json(source)
    else:
        raise ValueError("Unsupported file format. Please provide a CSV or JSON file.")
    print("Non-numerical processing")

    # Only numeric columns are imputed, so count the nulls in those columns
    # only; counting over the whole frame overstated what was replaced.
    numeric_cols = tc_df.select_dtypes(include=[np.number]).columns
    numeric_nulls = tc_df[numeric_cols].isnull()
    num_rows_with_null = numeric_nulls.any(axis=1).sum()
    num_nulls = numeric_nulls.sum().sum()
    # NOTE(review): if 'is_vulnerable' is numeric it is part of numeric_cols, so
    # missing labels would be mean-imputed as well - confirm the label has no nulls.
    # NOTE(review): the mean is taken over the full dataset, i.e. before any
    # cross-validation split - confirm this is acceptable.
    tc_df[numeric_cols] = tc_df[numeric_cols].fillna(tc_df[numeric_cols].mean())
    print(f"Replaced {num_nulls} null values in ", num_rows_with_null, " rows with mean.")

    # Optional steps that are currently disabled: bool_cols_to_bin_cols,
    # create_time_diff_col, and dropping the identifier / 'BoW*' columns.

    y = tc_df['is_vulnerable']
    X = tc_df.drop(columns=['is_vulnerable', 'appname'])

    # Message kept for log compatibility; no resampling happens in this
    # function (train_cv_save_results oversamples each training fold).
    print("Balancing...")
    print("Feature selection with VIF...")
    selected_columns = list(feature_selection(X))

    X = X[selected_columns]
    print("Pre-processing done!\n")
    return X, y
    

In [None]:
def save_variables(file_name, variables):
    """Pickle ``variables`` to ``file_name`` inside ``ml_vars_dir``.

    Args:
        file_name: Name of the file to create (or overwrite) in ``ml_vars_dir``.
        variables: Object to serialise.
    """
    target_path = os.path.join(ml_vars_dir, file_name)
    with open(target_path, 'wb') as out_file:
        pickle.dump(variables, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def train_cv_save_results(clf_name, cv, X_final, y_final, save_file_name, Xy_are_np=False):
    """Cross-validate a classifier and pickle the per-fold metrics.

    For every train/test split produced by ``cv`` the training part is
    oversampled with ``apply_smote`` (ratio 1), the classifier is fitted on
    it and then evaluated on the untouched test part. ROC/AUC, confusion
    matrix counts and rates, precision, F1 and MCC are collected per fold and
    written with ``save_variables``.

    Args:
        clf_name: Classifier name; only "AdaBoostClassifier" is handled.
        cv: Splitter object exposing ``split(X, y)``.
        X_final: Features. ``.values`` is taken unless ``Xy_are_np`` is true.
        y_final: Labels. ``.values`` is taken unless ``Xy_are_np`` is true.
        save_file_name: File name forwarded to ``save_variables``.
        Xy_are_np: When true, ``X_final`` and ``y_final`` are used as given.

    Returns:
        None; the results are only written to disk.

    Raises:
        Exception: If ``clf_name`` is not "AdaBoostClassifier".
    """
    if not Xy_are_np:    
        X_final_np = X_final.values
        y_final_np = y_final.values
    else:
        X_final_np = X_final
        y_final_np = y_final
    if clf_name == "AdaBoostClassifier":
        # The same estimator instance is re-fitted in every fold below.
        clf = AdaBoostClassifier(n_estimators=1900, learning_rate=0.1, random_state=0)
    else:
        raise Exception("Clf not set for clf name", clf_name)
    splits_indices = cv.split(X_final_np, y_final_np)

    # Common FPR grid onto which every fold's ROC curve is interpolated.
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []

    # Number of rows and columns of the feature matrix.
    N, P = X_final_np.shape

    # Aggregate the importances over folds here:
    # NOTE: never updated below, so it is saved as an all-zero array.
    importances_random = np.zeros(P)

    # Loop over crossvalidation folds:
    # NOTE: never appended to below, so it is saved as an empty list.
    scores = []  # Collect accuracies here

    # Raw confusion-matrix counts per fold.
    TP = []
    FP = []
    TN = []
    FN = []
    # Confusion-matrix rates per fold (converted to percentages after the loop).
    tnList = []
    fpList = []
    fnList = []
    tpList = []
    precisionList = []
    f1List = []
    mccList = []

    # NOTE: i and count are incremented once per fold but never read.
    i = 1
    count = 0
    # for train, test in cv.split(X, y):
    train_splits = []
    test_splits = []
    train_anomaly_percentage = []
    test_anomaly_percentage = []
    train_anomaly_absolute = []
    test_anomaly_absolute = []
    counterfold = 1
    for train, test in splits_indices:
        print("Fold-repetition", counterfold)
        counterfold+=1
        train_splits.append(train)
        test_splits.append(test)
        count += 1

        X_train = X_final_np[train]
        y_train = y_final_np[train]
        X_test = X_final_np[test]
        y_test = y_final_np[test]

        # Oversample the training fold only; the test fold keeps its original distribution.
        X_train, y_train = apply_smote(X_train, y_train, 1)

        # Class counts after resampling. The two-name unpacking requires exactly
        # two distinct label values; b / d belong to the value that sorts last
        # (presumably the positive class 1 - TODO confirm the label encoding).
        a, b = np.unique(y_train, return_counts=True)[1]
        train_anomaly_percentage.append(b / (a + b))
        train_anomaly_absolute.append(b)
        c, d = np.unique(y_test, return_counts=True)[1]
        test_anomaly_percentage.append(d / (c + d))
        test_anomaly_absolute.append(d)

        clf.fit(X_train, y_train)

        # Predict for validation data_raw:

        
        probas_ = clf.predict_proba(X_test)
        y_pred = clf.predict(X_test)

        # Compute ROC curve and area under the curve
        
        fpr, tpr, thresholds = roc_curve(y_test, probas_[:, 1], pos_label=1)

        tprs.append(interp(mean_fpr, fpr, tpr))
        # Force the interpolated curve to start at TPR 0.
        tprs[-1][0] = 0.0
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

        # calculate confusion matrix, precision, f1 and Matthews Correlation Coefficient

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        precision = precision_score(y_test, y_pred)
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        TN.append(tn)
        TP.append(tp)
        FN.append(fn)
        FP.append(fp)

        # Rates: TN and FP are normalised by the actual negatives (tn + fp),
        # TP and FN by the actual positives (fn + tp).
        tnList.append(tn / (tn + fp))
        tpList.append(tp / (fn + tp))
        fpList.append(fp / (tn + fp))
        fnList.append(fn / (fn + tp))

        precisionList.append(precision)
        f1List.append(f1)
        mccList.append(mcc)

        i += 1
        
    # Convert the per-fold fractions to percentages.
    tnList = 100 * np.array(tnList)
    tpList = 100 * np.array(tpList)
    fnList = 100 * np.array(fnList)
    fpList = 100 * np.array(fpList)
    precisionList = 100 * np.array(precisionList)
    f1List = 100 * np.array(f1List)
    mccList = 100 * np.array(mccList)

    # NOTE: mean_tpr is computed but neither saved nor used afterwards.
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    # mean_auc = auc(mean_fpr, mean_tpr)
    mean_auc = np.mean(aucs)
    std_auc = np.std(aucs)
    auc_meanpercent = 100 * mean_auc
    auc_stdpercent = 100 * std_auc

    # Everything below is pickled as one dict; load_show_metrics and
    # load_plot_metrics read it back by these keys.
    variables_to_save = {
        'tprs': tprs,
        'aucs': aucs,
        'N': N,
        'P': P,
        'importances_random': importances_random,
        'scores': scores,
        'TP': TP,
        'FP': FP,
        'TN': TN,
        'FN': FN,
        'tnList': tnList,
        'fpList': fpList,
        'fnList': fnList,
        'tpList': tpList,
        'precisionList': precisionList,
        'f1List': f1List,
        'mccList': mccList,
        'train_splits': train_splits,
        'test_splits': test_splits,
        'train_anomaly_percentage': train_anomaly_percentage,
        'test_anomaly_percentage': test_anomaly_percentage,
        'train_anomaly_absolute': train_anomaly_absolute,
        'test_anomaly_absolute': test_anomaly_absolute,
        'auc_meanpercent': auc_meanpercent,
        'auc_stdpercent' : auc_stdpercent
    }
    save_variables(save_file_name, variables_to_save)
   

In [None]:
def load_variables(file_name):
    """Read back an object pickled into ``ml_vars_dir``.

    Args:
        file_name: Name of the pickle file inside ``ml_vars_dir``.

    Returns:
        The unpickled object.
    """
    source_path = os.path.join(ml_vars_dir, file_name)
    # pickle.load can execute arbitrary code; only open files written by this notebook.
    with open(source_path, 'rb') as in_file:
        return pickle.load(in_file)

In [None]:
def load_show_metrics(file_name):
    """Print the cross-validation summary stored in a results pickle.

    Reads the dict written by ``train_cv_save_results`` through
    ``load_variables`` and prints mean ± standard deviation over the folds for
    the TN/FN/FP/TP rates, precision, F1, MCC and AUC.

    Only the keys that are actually displayed are read ('tnList', 'fnList',
    'fpList', 'tpList', 'precisionList', 'f1List', 'mccList', 'aucs'), so a
    results file lacking unrelated entries can still be shown.

    Args:
        file_name: Name of the pickle file inside ``ml_vars_dir``.

    Returns:
        None; the metrics are printed.

    Raises:
        KeyError: If one of the displayed keys is missing from the file.
    """
    print("Showing metrics for file", file_name)
    loaded_variables = load_variables(file_name)

    def _mean_std(key):
        """Return (mean, std) of the per-fold values stored under ``key``."""
        values = loaded_variables[key]
        return np.mean(values), np.std(values)

    # The rate lists are stored as percentages already; the AUCs are stored as
    # fractions and are scaled here.
    aucs = loaded_variables['aucs']
    auc_meanpercent = 100 * np.mean(aucs)
    auc_stdpercent = 100 * np.std(aucs)

    # Clear a leftover figure, but do not call plt.clf() when no figure is
    # open: it would create a new, empty figure.
    if plt.get_fignums():
        plt.clf()

    print("TN: %.02f %% ± %.02f %% - FN: %.02f %% ± %.02f %%"
          % (_mean_std('tnList') + _mean_std('fnList')))
    print("FP: %.02f %% ± %.02f %% - TP: %.02f %% ± %.02f %%"
          % (_mean_std('fpList') + _mean_std('tpList')))

    print(
        "Precision: %.02f %% ± %.02f %% - F1: %.02f %% ± %.02f %% - MCC: %.02f %% ± %.02f %%"
        % (_mean_std('precisionList') + _mean_std('f1List') + _mean_std('mccList')))

    print("AUC: %.02f %% ± %.02f %%" % (auc_meanpercent, auc_stdpercent))
  

In [None]:
import matplotlib.pyplot as plt

def load_plot_metrics(file_name):
    """Draw box plots of the per-fold metrics stored in a results pickle.

    Shows a 2x2 grid of horizontal box plots (TN, FP, FN and TP rates, without
    outliers) followed by a separate box plot of the F1 scores.

    Args:
        file_name: Name of the pickle file inside ``ml_vars_dir``.
    """
    print("Showing metrics for file", file_name)
    metrics = load_variables(file_name)
    f1_values = metrics['f1List']
    # Panels in row-major order: top-left, top-right, bottom-left, bottom-right.
    panels = [
        (metrics['tnList'], 'TN List'),
        (metrics['fpList'], 'FP List'),
        (metrics['fnList'], 'FN List'),
        (metrics['tpList'], 'TP List'),
    ]

    fig, axs = plt.subplots(2, 2, figsize=(10, 10))
    for axis, (values, title) in zip(axs.flat, panels):
        axis.boxplot(values, vert=False, showfliers=False)
        axis.set_title(title)
    plt.tight_layout()
    plt.show()

    # F1 distribution in its own figure.
    plt.figure()
    plt.boxplot(f1_values)
    plt.title('F1 Distribution')
    plt.show()

In [None]:
def train_ml_from_file(source, clf_name, save_file_name, folds=10, repeats=1):
    """Run the whole pipeline: preprocess, cross-validate, save and print metrics.

    Args:
        source: Path of the dataset passed to ``data_preprocessing``.
        clf_name: Classifier name passed to ``train_cv_save_results``.
        save_file_name: Name of the results pickle to write and then display.
        folds: Number of stratified folds.
        repeats: Number of repetitions of the k-fold split.
    """
    features, labels = data_preprocessing(source)
    # Fixed random_state so the splits are the same on every run.
    splitter = RepeatedStratifiedKFold(
        n_splits=folds,
        n_repeats=repeats,
        random_state=1,
    )
    train_cv_save_results(clf_name, splitter, features, labels, save_file_name)
    load_show_metrics(save_file_name)

Run the training pipeline

In [None]:
# Name of the results pickle; save_variables writes it under ml_vars_dir.
output_filename = "abc_k5_r1_with_nulls_dataset13.pkl"
# Dataset for this run; replaces the data_in_path set in the file-path cell.
data_in_path = "/Users/jorismachon/Documents/thesis/ML_data/subsets/13_subsets_with_nulls.csv"
# AdaBoost with 5 stratified folds and 1 repetition; prints the summary metrics when done.
train_ml_from_file(data_in_path, "AdaBoostClassifier", output_filename, folds=5, repeats=1)

In [None]:
# Box plots of the per-fold rates and F1 scores read from the results pickle named above.
load_plot_metrics(output_filename)