In [None]:
# Debug verbosity level for the whole notebook: an integer from 0 (off) to 5.
# The value is not validated anywhere, so set it with care.
# Cells below test it with `DEBUG >= 1` (print shapes / counts / accuracy) and
# `DEBUG >= 4` (covariance heatmap and silhouette score). It is currently set to 2.

DEBUG = 2

In [None]:
#Read the data files
import pandas as pd

# Load the per-repository commit exports.
tensorflow_commits = pd.read_csv('/home/kc/Projects/data_files/tensorflow.csv')
vscode_commits = pd.read_csv('/home/kc/Projects/data_files/vscode.csv')
react_commits = pd.read_csv('/home/kc/Projects/data_files/react-native.csv')

# Stack the three frames into one with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; a single concat also avoids copying the data twice.
total_commits = pd.concat(
    [tensorflow_commits, vscode_commits, react_commits],
    ignore_index=True,
)

if DEBUG >= 1:
    print(total_commits.shape)

In [None]:
# Use h2o4gpu if you have it installed

# Notebook-wide switch read by the cells below: when True they import from h2o4gpu
# and configure XGBoost for the GPU; when False they use scikit-learn / CPU defaults.
# NOTE(review): the original note describes the default as False, but the flag is
# currently True. Set it to False unless your kernel has h2o4gpu installed.
h2o4gpu_enabled = True

In [None]:
# Creating various features for each modification

# Total number of lines changed
total_commits['total_changed'] = total_commits['lines_added'] + total_commits['lines_removed']

# Fraction of lines changed per total number of lines in the file.
# Newly added files have a recorded size of 0, which would make the ratio a division
# by zero, so for those rows the size is replaced by the number of changed lines.
# Series.mask is used instead of the chained `df['size'].loc[...] = ...` form: chained
# assignment writes through an intermediate object, triggers SettingWithCopyWarning and
# leaves the frame unchanged when pandas copy-on-write is enabled.
total_commits['size'] = total_commits['size'].mask(
    total_commits['size'] == 0,
    total_commits['total_changed'],
)
# Rows where both size and total_changed are 0 still evaluate to 0/0 (NaN) here.
total_commits['ratio_changed'] = total_commits['total_changed'] / total_commits['size']

# Need to weigh the complexity by quantum of change.
total_commits['rated_complexity'] = (
    total_commits['ratio_changed'] * total_commits['complexity'] * total_commits['total_changed']
)

# Weighing the dmm params by the total changed lines
total_commits['total_dmm_size'] = total_commits['total_changed'] * total_commits['dmm_unit_size']
total_commits['total_dmm_unit_complexity'] = total_commits['total_changed'] * total_commits['dmm_unit_complexity']
total_commits['total_dmm_unit_interfacing'] = total_commits['total_changed'] * total_commits['dmm_unit_interfacing']

# We picked the sqrt of no_of_mod_files to reduce weightage of this feature
total_commits['scaled_rated_complexity'] = (
    total_commits['rated_complexity'] * (total_commits['no._of_mod_files'] ** 0.5)
)

In [None]:
# Function to Remove outliers. 
from scipy import stats
import numpy as np

def filter_outliers(data_frame):
    """Split a frame into non-outlier and outlier rows using column-wise z-scores.

    A row is an outlier when the absolute z-score of any non-object column is
    3 or more. Object (text) columns are ignored for the computation but are
    kept in both returned frames.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame with at least one non-object column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(non_outliers, outliers)``, both independent copies, so callers can add
        columns to them without touching ``data_frame``.
    """
    numeric_part = data_frame.select_dtypes(exclude='object')

    # Compute the z-scores once. A zero-variance column divides 0 by 0; the
    # resulting NaNs are handled below, so the floating-point warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        abs_z_scores = np.abs(np.asarray(stats.zscore(numeric_part), dtype=float))

    # A NaN z-score (e.g. from a constant column) carries no evidence that the value
    # is extreme. Comparing it with `< 3` would be False and would wrongly classify
    # every row of the frame as an outlier, so NaN is treated as "within range".
    within_range = np.isnan(abs_z_scores) | (abs_z_scores < 3)
    is_inlier = within_range.all(axis=1)

    data_frame_non_outliers = data_frame[is_inlier].copy()
    data_frame_outliers = data_frame[~is_inlier].copy()

    return data_frame_non_outliers, data_frame_outliers

In [None]:
# Prepare frames for each file type extension

def prepare_frame(total_commits, file_ext):
    """Build the feature frame for a single file extension.

    Parameters
    ----------
    total_commits : pandas.DataFrame
        Frame containing the 'file_ext', 'hash' and 'Author' columns plus the six
        feature columns listed in ``feature_columns`` below.
    file_ext : str
        Value of the 'file_ext' column whose rows are selected.

    Returns
    -------
    pandas.DataFrame
        The matching rows re-indexed from 0: the six feature columns converted to
        numeric (values that cannot be parsed, and missing values, become 0),
        followed by the 'Author' and 'hash' columns.
    """
    feature_columns = ['total_changed', 'rated_complexity',
                       'total_dmm_unit_complexity', 'total_dmm_size',
                       'total_dmm_unit_interfacing', 'scaled_rated_complexity']

    # Filter the mods based on file type extension and keep only the needed columns
    is_requested_ext = total_commits['file_ext'] == file_ext
    ml_commits = total_commits.loc[is_requested_ext, ['hash', 'Author'] + feature_columns]

    # Start again from a clean 0..n-1 index so rows line up positionally with the
    # arrays produced later. drop=True discards the old index whatever it is called;
    # reset_index().drop(columns='index') raised KeyError for a named index.
    ml_commits = ml_commits.reset_index(drop=True)

    # Explicitly convert the features to numeric types and fill the NaNs with zeros
    ml_commits_all_columns = (
        ml_commits[feature_columns]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
    )

    # Add the text columns back after the numeric processing
    ml_commits_all_columns['Author'] = ml_commits['Author']
    ml_commits_all_columns['hash'] = ml_commits['hash']

    return ml_commits_all_columns

In [None]:
# Function to scale data in the frame

def scale_frame(data_frame):
    """Min-max scale the feature columns of a prepared frame.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame containing 'Author' and 'hash' columns; every other column is scaled.

    Returns
    -------
    tuple
        ``(scaled_data_frame, scaler)``: the output of ``fit_transform`` on the
        remaining columns, and the fitted scaler so callers can invert the
        transform later.
    """
    # The implementation is chosen by the notebook-level h2o4gpu_enabled flag.
    # The unused pandas/numpy/matplotlib imports and the `%matplotlib inline` magic
    # were removed: the magic is not valid Python inside a function body and was
    # re-run on every call without affecting the result.
    if h2o4gpu_enabled:
        from h2o4gpu.preprocessing import MinMaxScaler
    else:
        from sklearn.preprocessing import MinMaxScaler

    # Use minMax scaler since this does not distort
    # https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
    scaler = MinMaxScaler()
    data_frame_numeric = data_frame.drop(columns=['Author', 'hash'])
    scaled_data_frame = scaler.fit_transform(data_frame_numeric)

    return scaled_data_frame, scaler

In [None]:
def create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame, scaler, k,
                   folder='/home/kc/Projects/data_files/sav_files/'):
    """Fit a Gaussian mixture on the scaled features and label every row.

    Parameters
    ----------
    scaled_file_ext_frame : array-like
        Scaled feature matrix, one row per row of ``file_ext_frame``.
    file_ext : str
        File extension being processed; used to build the output file names.
    file_ext_frame : pandas.DataFrame
        Unscaled frame for the same rows. It is modified in place: a 'Cluster'
        column holding the predicted component index is added.
    scaler : object
        Fitted scaler exposing ``inverse_transform``; used to express the mixture
        means in the original feature units.
    k : int
        Number of mixture components.
    folder : str, optional
        Directory prefix (including the trailing separator) for the files written
        here. Defaults to the notebook's original hard-coded location.

    Returns
    -------
    pandas.DataFrame
        ``file_ext_frame`` with the added 'Cluster' column.

    Side effects
    ------------
    Writes ``<folder><file_ext>_gmm_pickle.sav`` (the pickled mixture model) and
    ``<folder><file_ext>centroids.csv`` (the component means in original units).
    """
    import pickle

    # We are currently using GMM from sklearn. We need to get the GPU version of GMM.
    # https://pypi.org/project/pycave/
    from sklearn.mixture import GaussianMixture

    # Initialize and learn the Gaussian mixture model from the data
    mix = GaussianMixture(n_components=k, random_state=42)
    mix.fit(scaled_file_ext_frame)

    # Save the fitted model; the context manager guarantees the handle is flushed
    # and closed even if pickling fails.
    vfilename = folder + file_ext + '_gmm_pickle.sav'
    with open(vfilename, 'wb') as model_file:
        pickle.dump(mix, model_file)

    # Predict the cluster label of every row and attach it to the unscaled frame
    cluster_frame = pd.DataFrame(scaled_file_ext_frame)
    gmm_hash_clusters = mix.predict(cluster_frame)
    file_ext_frame['Cluster'] = gmm_hash_clusters

    # Map the component means back to the original feature units
    real_centroids = scaler.inverse_transform(mix.means_)

    # Write these to dataframe
    real_centroids_dataFrame = pd.DataFrame(
        real_centroids,
        columns=['total_changed', 'rated_complexity',
                 'total_dmm_unit_complexity', 'total_dmm_size',
                 'total_dmm_unit_interfacing', 'scaled_rated_complexity'],
    )

    # Add a column summing all centroid coordinates ("the value of the individual clusters")
    real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis=1)

    # Save centroids of the clusters to a file for audit
    centroid_file = folder + file_ext + 'centroids.csv'
    real_centroids_dataFrame.to_csv(centroid_file)

    return file_ext_frame

In [None]:
def create_boost_model(file_ext_frame, file_ext, folder):
    """Train an XGBoost classifier that predicts the cluster label and save it.

    Parameters
    ----------
    file_ext_frame : pandas.DataFrame
        Clustered frame containing 'Author', 'hash' and 'Cluster' columns; every
        remaining column is used as a model feature.
    file_ext : str
        File extension being processed; used in the debug output and file name.
    folder : str
        Directory prefix (including the trailing separator) for the saved model.

    Returns
    -------
    None

    Side effects
    ------------
    Writes the pickled classifier to ``<folder><file_ext>_finalized_model.sav`` and,
    when ``DEBUG >= 1``, prints the frame size and the hold-out accuracy.
    """
    import pickle

    # `from sklearn... import` always resolves the real scikit-learn package, so the
    # former `import h2o4gpu as sklearn` alias had no effect on these names and was dropped.
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier

    if h2o4gpu_enabled:
        # We assume that if h2o4gpu is enabled then, GPU is available and we can use xgboost on GPU.
        # Instantiate the xgboost model with relevant params
        # https://gist.github.com/shreyasbapat/89c6d6e09ff3f763e21ea68f98d74f84
        # https://xgboost.readthedocs.io/en/latest/gpu/index.html
        xgboost_model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    else:
        xgboost_model = XGBClassifier()

    # Remove text fields before numeric manipulations
    file_ext_frame_numeric_xg = file_ext_frame.drop(columns=['Author', 'hash'])

    # Prepare the 'X' and 'Y' for the model
    X_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg.drop(columns=['Cluster'])
    Y_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg['Cluster']

    # Split the data for 'Training' and 'Testing' datasets (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        X_file_ext_frame_numeric_xg, Y_file_ext_frame_numeric_xg, random_state=7)

    # Training the xgboost classifier
    xgboost_model.fit(X_train, y_train)

    # The hold-out predictions are only needed for the debug report, so they are
    # computed only when that report is requested.
    if DEBUG >= 1:
        y_pred = xgboost_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(file_ext+'_size: ', file_ext_frame.shape[0])
        print(file_ext+'_accuracy: ', accuracy)

    # Save the model to a file; the context manager closes the handle reliably
    filename = folder + file_ext + '_finalized_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(xgboost_model, model_file)
    

In [None]:
import os
import pathlib
import re
import glob


# Derive the 'file_ext' column (the file 'type') from the path: take the path suffix,
# then keep only the part after the last character outside the allowed set.
path_suffixes = total_commits['file_path'].apply(lambda x: pathlib.Path(str(x)).suffix)
total_commits['file_ext'] = path_suffixes.apply(
    lambda x: re.split(r"[^a-zA-Z0-9\s\++\_\-]", x)[-1]
)

# Files without any extension end up empty/blank; label them "NoExt"
total_commits['file_ext'] = total_commits['file_ext'].replace(r'^\s*$', 'NoExt', regex=True)

# All distinct file extensions present in the data
unique_extensions = total_commits['file_ext'].unique()

# Print for debugging: how many distinct extensions there are
if DEBUG >= 1:
    print(len(unique_extensions))

# Remove every previously saved model (*.sav) before training new ones
sav_folder = '/home/kc/Projects/data_files/sav_files/'
for stale_model in glob.glob(sav_folder + '*.sav'):
    os.remove(stale_model)

# For every file extension: prepare the data frame, create clusters, train the
# xgBoost model and save it.
# We should change this to only those extensions supported by lizard/pydriller
for file_ext in unique_extensions:

    # Default number of clusters
    k = 5

    # Rows belonging to this file type extension
    file_ext_frame = prepare_frame(total_commits, file_ext)

    # Outlier removal is only applied when there are more than 10 data points
    if file_ext_frame.shape[0] > 10:
        file_ext_frame_non_outliers, file_ext_frame_outliers = filter_outliers(file_ext_frame)
    else:
        file_ext_frame_non_outliers = file_ext_frame

    count_row = file_ext_frame_non_outliers.shape[0]

    # Nothing to scale or cluster with fewer than two rows
    if count_row <= 1:
        continue

    scaled_file_ext_frame, scaler = scale_frame(file_ext_frame_non_outliers)

    # With fewer than 5 rows we cannot have 5 clusters, so use one cluster per row
    k = min(k, count_row)

    # Create the actual clusters from the data
    clustered_frame = create_cluster(
        scaled_file_ext_frame, file_ext, file_ext_frame_non_outliers, scaler, k)

    # Train the xgboost model for this extension
    create_boost_model(clustered_frame, file_ext, '/home/kc/Projects/data_files/sav_files/')

In [None]:
# Covariance matrix heatmap visualization as a sanity check

if DEBUG >= 4:

    import glob
    import pickle

    # plt was previously only imported inside scale_frame, so it was undefined here.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # gmm_covariances used to be a local variable of create_cluster and never existed
    # at notebook scope. Read the covariances from a model that create_cluster saved.
    gmm_model_files = sorted(glob.glob('/home/kc/Projects/data_files/sav_files/*_gmm_pickle.sav'))

    if not gmm_model_files:
        print('No saved GMM model found - run the training cell first.')
    else:
        # Only unpickle files written by this notebook; pickle can execute arbitrary code.
        with open(gmm_model_files[0], 'rb') as model_file:
            gmm_model = pickle.load(model_file)
        print('Covariance heatmap for: ', gmm_model_files[0])

        gmm_covariances = gmm_model.covariances_

        plt.figure(figsize=(10,10))
        sns.set(font_scale=1.5)

        # We need as many cols as we have features
        cols = [str(i) for i in range(1, gmm_covariances.shape[-1] + 1)]

        # Visualize cluster number 2, or the last cluster when the model has fewer than three
        cluster_number = min(2, gmm_covariances.shape[0] - 1)
        hm = sns.heatmap(gmm_covariances[cluster_number,:,:],
                         cbar=True,
                         annot=True,
                         square=True,
                         fmt='.5f',
                         annot_kws={'size': 12},
                         yticklabels=cols,
                         xticklabels=cols)

        plt.title('Covariance matrix of the cluster showing correlation coefficients')
        plt.tight_layout()
        plt.show()

In [None]:
if DEBUG >= 4:
    if h2o4gpu_enabled:
        from h2o4gpu import metrics
    else:
        from sklearn import metrics

    # The names used here before (data_scaled, gmm_hash_clusters) were never defined
    # at notebook scope. scaled_file_ext_frame and clustered_frame are the values left
    # by the training loop, i.e. the last extension that was actually clustered; its
    # 'Cluster' column holds the label of each scaled row.
    silhouette = metrics.silhouette_score(scaled_file_ext_frame, clustered_frame['Cluster'])

    # An expression inside an `if` block is not echoed by the notebook, so print it.
    print('silhouette_score: ', silhouette)