In [None]:
# Debug verbosity level, an integer. The default 0 switches debug output off.
# The cells in this notebook compare it with `>=` against 1, 2 and 3, so a higher
# value prints more detail. The original note allows values up to 5; no visible
# cell checks levels 4 or 5.
# Use this cautiously - the value is not validated anywhere.

DEBUG = 0

In [None]:
# Switch for using h2o4gpu when it is installed in the kernel's environment.
# Defaults to False; leave it that way unless your kernel actually provides h2o4gpu.
# NOTE(review): no visible cell reads this flag - confirm where it is consumed.
h2o4gpu_enabled = False

In [None]:
import pickle
import pandas as pd

# Where the raw commit data of the target repository lives, and which file to score
target_repo_dir = '/home/kc/Projects/data_files/target_repo_data/'
target_repo_raw_data_file = 'MyPlace.git.csv'

# Load the raw commit records into a DataFrame
target_repo_commits = pd.read_csv(f'{target_repo_dir}{target_repo_raw_data_file}')

In [None]:
# Prepare the features from raw commit data
def create_ml_frame(pred_commits,ext):
    """Build the model features for the commits that touch one file extension.

    The input frame is never modified: the matching rows are selected into a
    new, freshly indexed frame and every feature is computed from that.

    Args:
        pred_commits: DataFrame of raw commit modifications. It must provide the
            columns 'file_ext', 'number_lines_added', 'number_lines_removed',
            'file_number_loc', 'file_complexity', 'dmm_unit_size',
            'dmm_unit_complexity', 'dmm_unit_interfacing', 'hash', 'Author',
            'Committer' and 'committed_date'.
        ext: value compared for equality with the 'file_ext' column to pick rows.

    Returns:
        A tuple ``(numeric_features, all_columns)``. ``numeric_features`` holds
        the five feature columns coerced to numeric with NaNs replaced by 0.
        ``all_columns`` is a copy of it with 'Author', 'hash', 'Committer' and
        'committed_date' appended. Both frames are indexed 0..n-1.
    """
    text_columns = ['Author', 'hash', 'Committer', 'committed_date']

    # Select the rows first and give them a clean 0..n-1 index. drop=True discards
    # the old index whatever its name is, so a named input index cannot break this.
    commits = pred_commits.loc[pred_commits['file_ext'] == ext].reset_index(drop=True)

    total_changed = commits['number_lines_added'] + commits['number_lines_removed']

    # A file recorded with 0 lines of code would make the ratio divide by zero, so
    # the number of changed lines stands in for its size. Series.mask returns a new
    # Series, which avoids chained assignment into a filtered frame.
    file_loc = commits['file_number_loc'].mask(commits['file_number_loc'] == 0, total_changed)
    ratio_changed = total_changed / file_loc

    # Every dmm_* feature is weighted by the square root of the change size
    sqrt_changed = total_changed ** 0.5

    # Column order matters: callers pass this frame straight to the saved models
    features = pd.DataFrame({
        'feature_total_changed': total_changed ** 0.7,
        'feature_rated_complexity': ratio_changed * commits['file_complexity'] * (total_changed ** 0.3),
        'feature_dmm_unit_complexity': sqrt_changed * commits['dmm_unit_complexity'],
        'feature_dmm_size': sqrt_changed * commits['dmm_unit_size'],
        'feature_dmm_unit_interfacing': sqrt_changed * commits['dmm_unit_interfacing'],
    })

    # Converting the fields to numeric types, filling the NaNs with zeros
    pred_ml_commits_numeric = features.apply(pd.to_numeric,errors ='coerce').fillna(0)

    # Add the text columns back next to the numeric features
    pred_ml_commits_all_columns = pred_ml_commits_numeric.copy()
    for column in text_columns:
        pred_ml_commits_all_columns[column] = commits[column]

    return pred_ml_commits_numeric, pred_ml_commits_all_columns

In [None]:
import os
import pathlib
import re
import glob
from os.path import isfile, join
from os import listdir
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from datetime import datetime

# Find the file extensions for the target repo commits.
#target_repo_commits['file_ext'] = \
#        target_repo_commits['file_new_path'].apply(lambda x:pathlib.Path(str(x)).suffix).\
#        apply(lambda x:re.split(r"[^a-zA-Z0-9\s\++\_\-]",x)[-1])
#target_repo_commits.file_ext = target_repo_commits.file_ext.replace(r'^\s*$', 'NoExt', regex=True)

# Distinct file extensions that occur in the target repo's commit data
target_repo_file_exts = target_repo_commits['file_ext'].unique()

if DEBUG >= 1:
    print(len(target_repo_file_exts))

# Min-max scaler used on the features before they are handed to the GMM
scaler = MinMaxScaler()

# Every regular file in the GMM folder is taken to be a saved model. The part of
# its name before the first underscore is the file extension the model covers.
gmm_sav_dir = '/home/kc/Projects/data_files/sav_files/gmm_sav'
gmm_model_files = []
for entry in listdir(gmm_sav_dir):
    if isfile(join(gmm_sav_dir, entry)):
        gmm_model_files.append(entry)
file_ext_models = [name.partition('_')[0] for name in gmm_model_files]

# Folder with the pickled GMM models
gmm_models_folder = '/home/kc/Projects/data_files/sav_files/gmm_sav/'

# Folder with the trained xgboost classifiers
xgboost_models_folder = '/home/kc/Projects/data_files/sav_files/xgboost_sav/'

# Folder the results are written to
predictions_folder = '/home/kc/Projects/data_files/predictions/'

# Start from a clean slate: delete every CSV left behind by an earlier run
files = glob.glob(predictions_folder+'*.csv')
for stale_csv in files:
    os.remove(stale_csv)

## Process the target repo's mods against our trained models, one file extension at a time.
## For every extension the scored rows are appended to a single CSV in predictions_folder.

# Names for the columns of the GMM means, in the order create_ml_frame returns its features.
# NOTE(review): assumes the saved GMMs were trained on this same column order - confirm.
centroid_feature_columns = ['feature_total_changed',
                            'feature_rated_complexity', 'feature_dmm_unit_complexity',
                            'feature_dmm_size','feature_dmm_unit_interfacing']

# All extensions append to the same scores file, so build its path only once
predictions_path = predictions_folder+'scores_'+target_repo_raw_data_file

for file_ext in target_repo_file_exts:

    if DEBUG >= 3:
        print('Processing file extension: ', file_ext)
        store_start = datetime.now()
        print('starting at: ', store_start)

    # Prepare the features from raw data
    target_repo_data_frame_numeric, target_repo_data_frame_all_coloumns = create_ml_frame(target_repo_commits, file_ext)

    ## Ensure that we have models for this file extension
    if file_ext in file_ext_models:
        # SECURITY: pickle.load runs arbitrary code from the file. Only load model
        # files that you created yourself or otherwise trust.
        xgboost_model_file = xgboost_models_folder+file_ext+'_xgboost_model.sav'
        with open(xgboost_model_file, 'rb') as model_handle:
            xboost_model = pickle.load(model_handle)

        # Use the xgboost model to predict the cluster of every mod
        predicted_clusters = xboost_model.predict(target_repo_data_frame_numeric)
        target_repo_data_frame_all_coloumns['predicted_cluster'] = predicted_clusters

        ## Now use the pickled GMM to get the probability of each mod belonging to its predicted cluster
        gmm_model_file = gmm_models_folder+file_ext+'_gmm_model_pickle.sav'
        with open(gmm_model_file, 'rb') as model_handle:
            mix = pickle.load(model_handle)

        # Scale the data for GMM processing.
        # NOTE(review): the scaler is re-fitted here on the target repo's rows for this
        # extension. If the GMM was trained on data scaled with a different fit, neither
        # the scaled inputs nor the inverse-transformed centroids below are on the
        # training scale - confirm this is intended.
        data_scaled = scaler.fit_transform(target_repo_data_frame_numeric)
        cluster_frame = pd.DataFrame(data_scaled)

        # Get the 'real world' value of the centroids. We need these to calculate the 'score' of each mod.
        gmm_centroids = mix.means_
        real_centroids = scaler.inverse_transform(gmm_centroids)
        real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=centroid_feature_columns)

        # Sum all feature columns per centroid (https://github.com/kcramakrishna/cg/issues/10).
        # This assigns one 'real world value' to each cluster; row i belongs to cluster label i.
        real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis = 1)
        centroid_sums = real_centroids_dataFrame['Sum_centroids'].values

        # xgboost gave the prediction; from the GMM take, for every row, the membership
        # probability of exactly that predicted cluster (row i, column predicted_clusters[i]).
        cluster_labels = np.asarray(predicted_clusters)
        member_probs = mix.predict_proba(cluster_frame)
        probability_for_labels = member_probs[np.arange(len(cluster_labels)), cluster_labels]

        # Add probability and the centroid sum of the predicted cluster to every row.
        # Plain column assignment is used because writing through Series.values is
        # not reliable (the array is read-only under pandas Copy-on-Write).
        target_repo_data_frame_all_coloumns['probablities'] = probability_for_labels
        target_repo_data_frame_all_coloumns['sum_centroid'] = centroid_sums[cluster_labels]

        # Finally calculate the score for each mod in the target repo
        target_repo_data_frame_all_coloumns['mod_score'] = target_repo_data_frame_all_coloumns['sum_centroid'] * target_repo_data_frame_all_coloumns['probablities']
    else:
        # No trained model for this extension: emit neutral values. The columns are added
        # in the same order as in the scored branch, because the CSV header is written
        # only once and has to label the rows of both branches correctly.
        target_repo_data_frame_all_coloumns['predicted_cluster'] = 'No Model found'
        target_repo_data_frame_all_coloumns['probablities'] = 0
        target_repo_data_frame_all_coloumns['sum_centroid'] = 0
        target_repo_data_frame_all_coloumns['mod_score'] = 0

    # Append these results to the predictions file; the header goes in only while the file is still empty
    with open(predictions_path, 'a') as predictions_file:
        target_repo_data_frame_all_coloumns.to_csv(predictions_file, mode='a',
                                                   header=predictions_file.tell()==0)

    if DEBUG >= 2:
        print(predictions_path)

    if DEBUG >= 3:
        store_end = datetime.now()
        print('processing complete: ', store_end)
        print('time taken: ', (store_end - store_start))