In [None]:
# Debug verbosity level: an integer from 0 to 5. Default is 0, which is OFF.
# The cells below print extra diagnostics when DEBUG >= 1 or DEBUG >= 2.
# Use this cautiously - the value is not validated anywhere.

DEBUG = 0

In [None]:
# Set to True to use h2o4gpu (if you have it installed): the scaler is then
# imported from h2o4gpu instead of scikit-learn and xgboost is run on the GPU.
# Default is False.
h2o4gpu_enabled = False

In [None]:
# Read the data files and concatenate them into a single dataframe
import pandas as pd
import glob
import os

# Folder holding the mined training CSV files (one or more '*.csv' files).
training_data_files_path = r'/home/kc/Projects/data_files/Training_data_from_public_git/'

# os.path.join keeps the pattern OS independent.
# glob returns matches in arbitrary order, so sort them: this gives the same row
# order (and therefore the same seeded train/test split downstream) on every run.
all_files = sorted(glob.glob(os.path.join(training_data_files_path, "*.csv")))

# pd.concat fails with an opaque "No objects to concatenate" on an empty input;
# fail early with a message that names the folder instead.
if not all_files:
    raise FileNotFoundError("No CSV files found in: " + training_data_files_path)

# Read the files lazily and stack them into one frame with a fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
total_commits = pd.concat(df_from_each_file, ignore_index=True)

if DEBUG >=2:
    print("Total No. of rows: ", total_commits.shape)

In [None]:
# This cell is temporary. We need to remove this cell for the release, i.e. when we move this logic to getData.

# pathlib and re are imported here because this cell runs before the cell near the
# end of the notebook that imports them; without these imports a fresh
# "Restart Kernel -> Run All" fails in this cell with a NameError.
import pathlib
import re

# Create a column 'file_ext' which is the file 'type':
#   1. take the suffix of 'file_path' (e.g. '.py'; empty string if there is none),
#   2. split it on every character outside the allowed set and keep the last piece,
#      which drops the leading dot.
total_commits['file_ext'] = total_commits['file_path'].\
                                apply(lambda x: pathlib.Path(str(x)).suffix).\
                                apply(lambda x: re.split(r"[^a-zA-Z0-9\s\++\_\-]", x)[-1])

# For files without any extension, mark 'file_ext' as "NoExt"
total_commits['file_ext'] = total_commits['file_ext'].replace(r'^\s*$', 'NoExt', regex=True)

In [None]:
# Creating various features for each modification

# Note: We will add a prefix "feature" to variables to create the features for the ML model.

# From documentation: https://pydriller.readthedocs.io/en/latest/commit.html
#     dmm_unit_size (float): DMM metric value for the unit size property.
#     dmm_unit_complexity (float): DMM metric value for the unit complexity property.
#     dmm_unit_interfacing (float): DMM metric value for the unit interfacing property.

# https://pydriller.readthedocs.io/en/latest/modifications.html
#     complexity: Cyclomatic Complexity of the file
#     changed_methods: subset of _methods_ containing only the changed methods.

# Here is more about dmm: https://pydriller.readthedocs.io/en/latest/deltamaintainability.html
#     The delta-maintainability metric is the proportion of low-risk change in a commit.
#     The resulting value ranges from 0.0 (all changes are risky) to 1.0 (all changes are low risk).
#     It rewards making methods better, and penalizes making things worse.

# "Total lines changed" is an important but potentially very misleading metric of a commit.
# We will create a feature which blunts the weight of the number of changed lines.
# Total number of lines changed
total_commits['total_changed'] = total_commits['number_lines_added'] + total_commits['number_lines_removed']
total_commits['feature_total_changed'] = (total_commits['total_changed'] ** 0.7)

# Fraction of lines changed per total number of lines in the file.
# We need to account for the fact that newly added files have an existing size of '0',
#           and a divide by '0' is indeterminate.
# For those rows the file size is replaced by the number of changed lines.
# The update goes through a single .loc[rows, column] call on the frame itself: the chained
# form (frame[column].loc[rows] = ...) may write to a temporary copy and leave the frame untouched.
is_empty_file = total_commits['file_number_loc'] == 0
total_commits.loc[is_empty_file, 'file_number_loc'] = total_commits.loc[is_empty_file, 'total_changed']
total_commits['ratio_changed'] = total_commits['total_changed'] / total_commits['file_number_loc']
# We can change above logic if we use mod.change_type. Maybe in a future version.

# 'complexity' is given for the *WHOLE* file. We need to scale/weigh it for only the changed lines.
#     Let us weight it by 2 variables:
#         "ratio changed" AND
#         a feature related to 'size', i.e. we are making an assumption that larger changes are
#         more difficult to make but we need to taper this off too. The code uses
#         (total_changed ** 0.3), which is close to a cube root.
total_commits['feature_rated_complexity'] = total_commits['ratio_changed'] * total_commits['file_complexity'] * \
                                                (total_commits['total_changed'] ** 0.3)

# dmm values are given for the whole commit. We need to scale them for the individual rows.
# We should weight this by "changed_methods" but we missed mining this. We will add this later.
# When adding "changed methods", we will do something like (changed_methods ** 1.5) to reflect the
#       importance of adding and deleting methods. We can then change to use (total_changed ** 0.3)
total_commits['feature_dmm_size'] = (total_commits['total_changed'] ** 0.5) * total_commits['dmm_unit_size']
total_commits['feature_dmm_unit_complexity'] = (total_commits['total_changed'] ** 0.5) * \
                                                    total_commits['dmm_unit_complexity']
total_commits['feature_dmm_unit_interfacing'] = (total_commits['total_changed'] ** 0.5) * \
                                                    total_commits['dmm_unit_interfacing']

In [None]:
# Function to Remove outliers. 
from scipy import stats
import numpy as np

def filter_outliers(data_frame):
    """Split a frame into non-outlier rows and outlier rows using z-scores.

    A row is an outlier when, in any int64/float64 column, the absolute
    z-score of its value is 3 or more. Non-numeric columns are ignored for
    the decision but are kept in the returned frames.

    A column whose z-score cannot be computed (zero variance, or it contains
    NaN) yields NaN z-scores. Such a column carries no outlier information and
    is ignored; previously `NaN < 3` evaluated to False and flagged *every*
    row of the frame as an outlier.

    Parameters:
        data_frame: pandas DataFrame to split.

    Returns:
        (non_outliers, outliers): two independent copies holding the rows that
        passed and the rows that failed the check, with their original index.
    """
    # https://pbpython.com/pandas_dtypes.html
    numeric_values = data_frame.select_dtypes(exclude=['object', 'bool'],
                                              include=['int64', 'float64']).to_numpy(dtype='float64')

    # Compute the z-scores once; silence the divide-by-zero noise of constant columns.
    with np.errstate(divide='ignore', invalid='ignore'):
        z_scores = np.abs(stats.zscore(numeric_values))

    # NaN z-scores are treated as "within limits" (see docstring).
    within_limit = (z_scores < 3) | np.isnan(z_scores)
    is_non_outlier = within_limit.all(axis=1)

    # Return copies so that callers can add columns without touching data_frame.
    data_frame_non_outliers = data_frame[is_non_outlier].copy()
    data_frame_outliers = data_frame[~is_non_outlier].copy()

    return data_frame_non_outliers, data_frame_outliers

In [None]:
# Prepare frames for each file type extension

def prepare_frame(total_commits, file_ext):
    """Build the modelling frame for a single file extension.

    Keeps the rows of `total_commits` whose 'file_ext' equals `file_ext`,
    renumbers them from 0, forces the feature columns to numeric (values that
    cannot be parsed, and missing values, become 0) and appends the text
    columns 'Author', 'hash' and 'language_supported' after the features.
    """
    feature_columns = ['feature_total_changed', 'feature_rated_complexity',
                       'feature_dmm_unit_complexity', 'feature_dmm_size',
                       'feature_dmm_unit_interfacing']
    text_columns = ['Author', 'hash', 'language_supported']

    # Rows of the requested extension, restricted to the columns the model needs.
    # The index is rebuilt so that row labels run 0..n-1 and line up below.
    matches_extension = total_commits['file_ext'] == file_ext
    selected = total_commits.loc[matches_extension, feature_columns + text_columns]
    selected = selected.reset_index(drop=True)

    # Numeric part: coerce every feature column, then replace NaNs with zeros.
    prepared = selected[feature_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Text part: attached after the numeric columns, in a fixed order.
    for column_name in text_columns:
        prepared[column_name] = selected[column_name]

    return prepared

In [None]:
# Function to scale data in the frame

def scale_frame(data_frame):
    """Min-max scale the numeric feature columns of a prepared frame.

    The text columns 'Author', 'hash' and 'language_supported' are dropped and
    a fresh MinMaxScaler is fitted on the remaining columns.

    Parameters:
        data_frame: frame containing the three text columns plus the numeric
            feature columns.

    Returns:
        (scaled_data_frame, scaler): the output of `fit_transform` (with
        scikit-learn this is a NumPy array, not a DataFrame) and the fitted
        scaler, which callers need to map values back with `inverse_transform`.
    """
    # The scaler comes from h2o4gpu when the notebook-level flag is set,
    # otherwise from scikit-learn. No plotting setup belongs here: the former
    # '%matplotlib inline' magic and the unused pandas/numpy/matplotlib imports
    # were removed so that this function has no hidden side effects.
    if h2o4gpu_enabled:
        from h2o4gpu.preprocessing import MinMaxScaler
    else:
        from sklearn.preprocessing import MinMaxScaler

    # Use minMax scaler since this does not distort
    # https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
    scaler = MinMaxScaler()
    data_frame_numeric = data_frame.drop(columns = ['Author','hash','language_supported'])
    scaled_data_frame = scaler.fit_transform(data_frame_numeric)

    return scaled_data_frame, scaler

In [None]:
def create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame, scaler, k, gmm_models_folder, 
                   centroids_folder):
    """Cluster the scaled rows of one file extension with a Gaussian mixture model.

    Parameters:
        scaled_file_ext_frame: scaled numeric features, one row per row of
            `file_ext_frame` and in the same order.
        file_ext: file extension being processed; used in the output file names.
        file_ext_frame: unscaled frame the scaled data was derived from.
        scaler: fitted scaler used to map the centroids back to original units.
        k: number of mixture components (clusters).
        gmm_models_folder: folder prefix for the pickled model; it is
            concatenated as-is, so it must end with a path separator.
        centroids_folder: folder prefix for the centroid CSV; same convention.

    Returns:
        A copy of `file_ext_frame` with an extra 'Cluster' column holding the
        predicted component of each row. The input frame is left unmodified.

    Side effects:
        Writes '<file_ext>_gmm_model_pickle.sav' into `gmm_models_folder` and
        '<file_ext>centroids.csv' into `centroids_folder`.
    """
    # We are currently using GMM from sklearn. We need to get the GPU version of GMM.
    # https://pypi.org/project/pycave/
    import pickle
    from sklearn.mixture import GaussianMixture

    # Initialize the Gaussian mixture model (fixed seed) and learn it from the data
    mix = GaussianMixture(n_components=k, random_state=42)
    mix.fit(scaled_file_ext_frame)

    # Save the fitted model; the 'with' block guarantees the file is flushed and closed
    vfilename = gmm_models_folder+file_ext+'_gmm_model_pickle.sav'
    with open(vfilename, 'wb') as model_file:
        pickle.dump(mix, model_file)

    # Predict the cluster label of every row the model was fitted on
    gmm_hash_clusters = mix.predict(scaled_file_ext_frame)

    # Attach the labels to a copy: the caller may pass a filtered slice of a larger
    # frame, and writing into it would mutate (or warn about) the caller's data.
    file_ext_frame = file_ext_frame.copy()
    file_ext_frame['Cluster'] = gmm_hash_clusters

    # The component means are in scaled space; map them back to the original units
    real_centroids = scaler.inverse_transform(mix.means_)

    # Write these to dataframe
    real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=['feature_total_changed',
                                                                     'feature_rated_complexity',
                                'feature_dmm_unit_complexity','feature_dmm_size','feature_dmm_unit_interfacing'])

    # Add a column summing all centroid coordinates: "This is the value of the individual clusters"
    real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis = 1)

    # Save centroids of the clusters to a file for audit
    centroid_file = centroids_folder+file_ext+'centroids.csv'
    real_centroids_dataFrame.to_csv(centroid_file)

    return file_ext_frame

In [None]:
def create_boost_model(file_ext_frame, file_ext, xgboost_models_folder):
    """Train an xgboost classifier that predicts the cluster of a row and save it.

    Parameters:
        file_ext_frame: clustered frame; must contain the text columns 'Author',
            'hash', 'language_supported', the numeric feature columns and the
            target column 'Cluster'.
        file_ext: file extension being processed; used in the output file name
            and in the debug output.
        xgboost_models_folder: folder prefix for the pickled model; it is
            concatenated as-is, so it must end with a path separator.

    Returns:
        None.

    Side effects:
        Writes '<file_ext>_xgboost_model.sav' into `xgboost_models_folder` and,
        when DEBUG >= 1, prints the row count and the test-set accuracy.
    """
    import pickle

    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split
    from xgboost import XGBClassifier

    # NOTE: 'from sklearn... import ...' always resolves to scikit-learn itself.
    # The former 'import h2o4gpu as sklearn' only bound a local name and never
    # redirected these imports, so that alias has been dropped.
    if h2o4gpu_enabled:
        # We assume that if h2o4gpu is enabled then, GPU is available and we can use xgboost on GPU.
        # Instantiate the xgboost model with relevant params
        # https://gist.github.com/shreyasbapat/89c6d6e09ff3f763e21ea68f98d74f84
        # https://xgboost.readthedocs.io/en/latest/gpu/index.html
        xgboost_model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    else:
        xgboost_model = XGBClassifier()

    # Remove text fields before numeric manipulations
    file_ext_frame_numeric_xg = file_ext_frame.drop(columns=['Author','hash','language_supported'])

    # Prepare the 'X' and 'Y' for the model
    X_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg.drop(columns = ['Cluster'])
    Y_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg['Cluster']

    # Split the data for 'Training' and 'Testing' datasets (fixed seed)
    X_train, X_test, y_train, y_test = \
            train_test_split(X_file_ext_frame_numeric_xg, Y_file_ext_frame_numeric_xg, random_state=7)

    # Training the xgboost classifier
    xgboost_model.fit(X_train, y_train)

    # Check accuracy. The test-set predictions are only needed for this report,
    # so they are computed only when the report is actually printed.
    if DEBUG >= 1:
        y_pred = xgboost_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(file_ext+'_size: ', file_ext_frame.shape[0])
        print(file_ext+'_accuracy: ', accuracy)

    # Save the model to a file; the 'with' block guarantees the file is flushed and closed
    filename = xgboost_models_folder+file_ext+'_xgboost_model.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(xgboost_model, model_file)


In [None]:
import os
import pathlib
import re
import glob


# Print for Debugging 
if DEBUG >=1:
    print('No. of file extensions: ', len(total_commits['file_ext'].unique()))

# Prepare a list of all unique file extensions
unique_extensions = total_commits['file_ext'].unique()

# Folder to save models and centroids
gmm_models_folder = '/home/kc/Projects/data_files/sav_files/gmm_sav/'
centroids_folder = '/home/kc/Projects/data_files/sav_files/centroids/'
xgboost_models_folder = '/home/kc/Projects/data_files/sav_files/xgboost_sav/'

# Remove all previous models 
#for folders in [gmm_models_folder, centroids_folder, xgboost_models_folder]:
#    for file in glob.glob(folders+'*'):
#        os.remove(file)

# Number of clusters. It is the same for every extension, so it is set once
# here instead of on every loop iteration.
k=5

for file_ext in unique_extensions:

    # Extract data frame for the specific file type extension
    file_ext_frame = prepare_frame(total_commits, file_ext)

    # We need a good sample size for accurate results. 
    #    We will ignore 'file_ext' if it has less than 50 rows 
    if file_ext_frame.shape[0] < 50:
        print('ignoring file type: ', file_ext, '(Not enough rows)')
        continue

    # If file_ext is not supported by 'lizard', then ignore 'file_ext' and proceed to next.
    # prepare_frame renumbers the rows from 0, so label 0 is the first row of this extension.
    if file_ext_frame.at[0,'language_supported'] == False:
        if DEBUG >= 2:
            print('Ignoring unsupported file extention: ', file_ext)
        continue

    # Remove outliers from the frame (only if you have a reasonable amount of data points) 
    file_ext_frame_non_outliers, file_ext_frame_outliers = \
                                    filter_outliers(file_ext_frame)
    if DEBUG >= 2:
        print('file_ext: ', file_ext, ', no. of non-outlier rows: ', file_ext_frame_non_outliers.shape[0])
        print('file_ext: ', file_ext, ', no. of outlier rows: ', file_ext_frame_outliers.shape[0])

    # Scale the data    
    # There is a peculiar behaviour here. For a lot of file_ext (= 'md', 'html', etc.) all data points 
    #     can show up as outliers. We should probably restrict our processing to files
    #     supported by 'lizard/pydriller'. 
    # A Gaussian mixture with k components cannot be fitted on fewer than k samples
    #     (scikit-learn raises a ValueError), so require at least k non-outlier rows
    #     rather than just 1.
    if file_ext_frame_non_outliers.shape[0] >= k:
        scaled_file_ext_frame, scaler = scale_frame(file_ext_frame_non_outliers)
        if DEBUG >= 2:
            print('rows in scaled_file_ext_frame: ', scaled_file_ext_frame.shape[0])

        # Create the actual clusters from the data.
        clustered_frame = create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame_non_outliers, scaler, 
                                            k, gmm_models_folder, centroids_folder)

        # Train xgboost model for each extension
        create_boost_model(clustered_frame, file_ext, xgboost_models_folder)
    else:
        if DEBUG >= 2:
            print('ignoring file type: ', file_ext, '(Not enough non-outlier rows)')