In [8]:
# Define a DEBUG flag with values from '0' to '5'. Default is '0' which is OFF. 
# Use this cautiously - we are not validating for this

DEBUG = 3

In [9]:
# Use h2o4gpu if you have it installed
# Add a flag with Default as False. Don't change this unless your kernel uses h2o4gpu.
h2o4gpu_enabled = True

In [10]:
#Read the data files and concantenate into a dataframe
import pandas as pd
import glob
import os

training_data_files_path = r'/home/kc/Projects/data_files/Training_data_from_public_git/'                     
all_files = glob.glob(os.path.join(training_data_files_path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f) for f in all_files)
total_commits = pd.concat(df_from_each_file, ignore_index=True)
                             
if DEBUG >=1:
    print(total_commits.shape)

(3199793, 21)


In [11]:
#Creating various features for each modification

# Note: We will add a prefix "feature" to variables to create the features for the ML model.

# From documentation: https://pydriller.readthedocs.io/en/latest/commit.html
#     dmm_unit_size (float): DMM metric value for the unit size property.
#     dmm_unit_complexity (float): DMM metric value for the unit complexity property.
#     dmm_unit_interfacing (float): DMM metric value for the unit interfacing property.

# https://pydriller.readthedocs.io/en/latest/modifications.html
#     complexity: Cyclomatic Complexity of the file
#     changed_methods: subset of _methods_ containing only the changed methods.

# Here is more about dmm: https://pydriller.readthedocs.io/en/latest/deltamaintainability.html
#     The delta-maintainability metric is the proportion of low-risk change in a commit. 
#     The resulting value ranges from 0.0 (all changes are risky) to 1.0 (all changes are low risk). 
#     It rewards making methods better, and penalizes making things worse.

# "total lines changed" is important but can be very misleading metric of a commit. 
# We will create a feature which blunts the weight of nloc
# Total number of lines changed
total_commits['total_changed'] = total_commits['lines_added'] + total_commits['lines_removed']
total_commits['feature_total_changed'] = (total_commits['total_changed'] ** 0.7)

# Fraction of lines changed per total numbe of lines in file
# We need to account for the fact that new files added with have existing size as '0'.
#           and divide by '0' is indeterminate
total_commits['size'].loc[total_commits['size'] == 0] = total_commits['total_changed']
total_commits['ratio_changed'] = total_commits['total_changed'] / total_commits['size']

# 'complexity' is given for the whole file. We need to rate it for only the changed lines.
#     Let us weight it by 2 variables:
#         "ratio changed" AND
#         a feature related to 'size' of file i.e. we are making an assumption that larger files are 
#         more difficult to change but we need to taper this off too. Let us go with cube root of size 
total_commits['feature_rated_complexity'] = total_commits['ratio_changed'] * total_commits['complexity'] * \
                                                (total_commits['total_changed'] ** 0.3)

# dmm values are given for the commit. We need to scale them for individual commits.
# We should weight this by "changed_methods" but we missed mining this. We will add this later.
# When adding "changed methods", We will do Something like (changed_methods ** 1.5) to reflect importance
#       of adding and deleting methods. We can then change to use (total_changed ** 0.3)
total_commits['feature_dmm_size'] = (total_commits['total_changed'] ** 0.5) * total_commits['dmm_unit_size']
total_commits['feature_dmm_unit_complexity'] = (total_commits['total_changed'] ** 0.5) * \
                                                    total_commits['dmm_unit_complexity']
total_commits['feature_dmm_unit_interfacing'] = (total_commits['total_changed'] ** 0.5) * \
                                                    total_commits['dmm_unit_interfacing']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [12]:
# Function to Remove outliers. 
from scipy import stats
import numpy as np

def filter_outliers(data_frame):
    
    # Calculate z_scores and if zscore is greater than '3', then its an outlier
    
    # Get non-Outliers: 
    data_frame_non_outliers = data_frame[(np.abs(stats.zscore(data_frame.select_dtypes(exclude='object'))) < 3).all(axis=1)]

    # Collect outliers
    data_frame_outliers = data_frame[~(np.abs(stats.zscore(data_frame.select_dtypes(exclude='object'))) < 3).all(axis=1)]
    
    return data_frame_non_outliers, data_frame_outliers

In [13]:
# Prepare frames for each file type extension

def prepare_frame(total_commits, file_ext):
    import numpy as np
    
    #Filter the mods based on file type extension
    file_ext_commits = total_commits[total_commits['file_ext']==file_ext]

    ml_commits = file_ext_commits[['hash','Author','feature_total_changed','feature_rated_complexity',
                                'feature_dmm_unit_complexity','feature_dmm_size','feature_dmm_unit_interfacing']]

    # Resetting the frame's index. It is required to retain the integrity of the frame
    ml_commits = ml_commits.reset_index().drop(columns = 'index')

    # Temporarily dropping text columns for numeric processing
    ml_commits_noText = ml_commits.drop(columns = ['Author','hash'])

    # Explicitely converting fields to numeric types and filling the NaNs with zeros
    ml_commits_numeric = ml_commits_noText.apply(pd.to_numeric,errors ='coerce').fillna(0)

    # Adding the Author column back to create a 'total' data frame
    ml_commits_all_coloumns = ml_commits_numeric.copy()
    ml_commits_all_coloumns['Author'] = ml_commits['Author']
    ml_commits_all_coloumns['hash'] = ml_commits['hash']
    
    return ml_commits_all_coloumns

In [14]:
# Function to scale data in the frame

def scale_frame(data_frame):
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    %matplotlib inline
    if (h2o4gpu_enabled == True):
        from h2o4gpu.preprocessing import MinMaxScaler
    else:
        from sklearn.preprocessing import MinMaxScaler
    
    # Use minMax scaler since this does not distort
    # https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
    scaler = MinMaxScaler()
    data_frame_numeric = data_frame.drop(columns = ['Author','hash'])
    scaled_data_frame = scaler.fit_transform(data_frame_numeric)
    
    return scaled_data_frame, scaler

In [15]:
def create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame, scaler, k, gmm_models_folder, 
                   centroids_folder):
    
    # We are currently using GMM from sklearn. We need to get the GPU version of GMM.
    # https://pypi.org/project/pycave/
    from sklearn.mixture import GaussianMixture

    # Initializing the Gaussian mixture model 
    mix = GaussianMixture(n_components=k, random_state=42)

    # Learning the Gaussian mixture model from data   
    mix.fit(scaled_file_ext_frame)

    # Saving the parameters of Gaussian mixture model in a file
    import pickle
    vfilename = gmm_models_folder+file_ext+'_gmm_model_pickle.sav'
    pickle.dump(mix, open(vfilename, 'wb'))

    # Predicting the cluster labels of the data for the Gaussian mixture model
    cluster_frame = pd.DataFrame(scaled_file_ext_frame)
    gmm_hash_clusters = mix.predict(cluster_frame)

    # Collecting the mean of the Gaussian mixture model in 'gmmcentroids'
    gmm_centroids = mix.means_
    gmm_covariances = mix.covariances_
    combinedCentroids = gmm_centroids[gmm_hash_clusters].sum(axis=1)

    # Converting the input data series into pan
    file_ext_frame['Cluster'] = gmm_hash_clusters
    real_centroids = scaler.inverse_transform(gmm_centroids)

    # Write these to dataframe
    real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=['feature_total_changed',
                                                                     'feature_rated_complexity',
                                'feature_dmm_unit_complexity','feature_dmm_size','feature_dmm_unit_interfacing'])

    # Add a cloumn for summing all centroids "This is the value of the individual clusters"
    real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis = 1)
    
    # Save centroids of the clusters to a file for audit
    centroid_file = centroids_folder+file_ext+'centroids.csv'
    real_centroids_dataFrame.to_csv(centroid_file)
    
    return file_ext_frame

In [16]:
def create_boost_model(file_ext_frame, file_ext, xgboost_models_folder):
    
    from xgboost import XGBClassifier
    
    if (h2o4gpu_enabled == True):
        import h2o4gpu as sklearn
        # We assume that if h2o4gpu is enabled then, GPU is available and we can use xgboost on GPU.
        # Instantiate the xgboost model with relevant params
        # https://gist.github.com/shreyasbapat/89c6d6e09ff3f763e21ea68f98d74f84
        # https://xgboost.readthedocs.io/en/latest/gpu/index.html
        xgboost_model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    else:
        import sklearn
        xgboost_model = XGBClassifier()

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    import pickle

    # Remove text fields before numeric manipulations
    file_ext_frame_numeric_xg = file_ext_frame.drop(columns=['Author','hash'])

    # Prepare the 'X' and 'Y' for the model
    X_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg.drop(columns = ['Cluster'])
    Y_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg['Cluster']

    # Split the data for 'Training' and 'Testing' datasets
    X_train, X_test, y_train, y_test = train_test_split(X_file_ext_frame_numeric_xg, Y_file_ext_frame_numeric_xg, random_state=7)


    # Training the xgboost classifier
    xgboost_model.fit(X_train, y_train)

    # Predicting the class labels of test data for xgboost classifier
    y_pred = xgboost_model.predict(X_test)

    # check accuracy
    if DEBUG >= 1:
        accuracy = accuracy_score(y_test, y_pred)
        print(file_ext+'_size: ', file_ext_frame.shape[0])
        print(file_ext+'_accuracy: ', accuracy)
    
    # Save the model to a file
    filename = xgboost_models_folder+file_ext+'_xgboost_model.sav'
    pickle.dump(xgboost_model, open(filename, 'wb'))
    

In [17]:
import os
import pathlib
import re
import glob


# Create a coloumn 'file_ext' which is the file 'type'
total_commits['file_ext'] = total_commits['file_path'].apply(lambda x:pathlib.Path(str(x)).suffix).apply(lambda x:re.split(r"[^a-zA-Z0-9\s\++\_\-]",x)[-1])

# For files without any extension, mark 'file_ext' as "NoExt" 
total_commits.file_ext = total_commits.file_ext.replace(r'^\s*$', 'NoExt', regex=True)

# Print for Debugging 
if DEBUG >=1:
    print(len(total_commits['file_ext'].unique()))

# Prepare a list of all unique file extensions
unique_extensions = total_commits['file_ext'].unique()

# Folder to save models and centroids
gmm_models_folder = '/home/kc/Projects/data_files/sav_files/gmm_sav/'
centroids_folder = '/home/kc/Projects/data_files/sav_files/centroids/'
xgboost_models_folder = '/home/kc/Projects/data_files/sav_files/xgboost_sav/'
        
# Remove all previous models 
for folders in [gmm_models_folder, centroids_folder, xgboost_models_folder]:
    for file in glob.glob(folders+'*'):
        os.remove(file)

# For every file extension: prepare the data frame, create cluster, train xgBoost model and save it.
# We should change this to only those extensions supported by lizard/pydriller
for file_ext in unique_extensions:
    
    # Set number of clusters
    k=5
    
    # Extract data frame for the specific file type extension
    file_ext_frame = prepare_frame(total_commits, file_ext)
    
    # We need a good sample size for accurate results. 
    #    We will ignore'file_ext' if it has less than 50 rows 
    if file_ext_frame.shape[0] < 50:
        print('ignoring file type: ', file_ext, '(Not enough rows)')
        continue
        
    # Remove outliers from the frame (only if you reasonable amount of data points) 
    file_ext_frame_non_outliers, file_ext_frame_outliers = filter_outliers(file_ext_frame)
    if DEBUG >= 2:
        print('no. of non-outlier rows: ', file_ext_frame_non_outliers.shape[0])
        print('no. of outlier rows: ', file_ext_frame_outliers.shape[0])
    
    # Scale the data    
    # There is a peculiar behaviour here. For lot of file_ext (= 'md', 'html', etc,) All data points are 
    #     showing up as outliers. This is very funny. We should probably restrict our processing to files
    #     supported by 'lizard/pydriller'. 
    # We are forced to check that we have at least 1 non_outlier.
    if file_ext_frame_non_outliers.shape[0] >= 1:
        scaled_file_ext_frame, scaler = scale_frame(file_ext_frame_non_outliers)
        if DEBUG >= 2:
            print('rows in scaled_file_ext_frame: ', scaled_file_ext_frame.shape[0])
        
        # Create the actual clusters from the data.
        clustered_frame = create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame_non_outliers, scaler, 
                                            k, gmm_models_folder, centroids_folder)

        # Train xgboost model for each extension
        create_boost_model(clustered_frame, file_ext, xgboost_models_folder)
    else:
        print('ignoring file type: ', file_ext, '(All rows are outliers)')

1817


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  76774
ignoring file type:  md (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  38928
ignoring file type:  html (All rows are outliers)
no. of non-outlier rows:  190450
no. of outlier rows:  3448
rows in scaled_file_ext_frame:  190450


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


js_size:  190450
js_accuracy:  0.9914939197278054
no. of non-outlier rows:  0
no. of outlier rows:  20416
ignoring file type:  json (All rows are outliers)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  186480
ignoring file type:  NoExt (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  4934
ignoring file type:  snap (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  22958
ignoring file type:  yml (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  2329
ignoring file type:  lock (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  7027
ignoring file type:  css (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  28335
ignoring file type:  png (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  7711
ignoring file type:  svg (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  119
ignoring file type:  map (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  2200
ignoring file type:  coffee (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  1831

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


ts_size:  21727
ts_accuracy:  0.9755154639175257
no. of non-outlier rows:  0
no. of outlier rows:  321
ignoring file type:  ico (All rows are outliers)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  60
ignoring file type:  flow (All rows are outliers)
ignoring file type:  swp (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  140255
ignoring file type:  xml (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  58
ignoring file type:  jpeg (All rows are outliers)
ignoring file type:  snapshot (Not enough rows)
no. of non-outlier rows:  21803
no. of outlier rows:  712
rows in scaled_file_ext_frame:  21803


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


cpp_size:  21803
cpp_accuracy:  0.9834892680242158
no. of non-outlier rows:  95971
no. of outlier rows:  1728
rows in scaled_file_ext_frame:  95971


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


h_size:  95971
h_accuracy:  0.9930396365606635
no. of non-outlier rows:  0
no. of outlier rows:  66
ignoring file type:  toml (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  mp4 (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  1260
ignoring file type:  zip (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  4600
ignoring file type:  gif (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  1686
ignoring file type:  jpg (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  74005
ignoring file type:  txt (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  10761
ignoring file type:  yaml (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  308
ignoring file type:  scss (All rows are outliers)
no. of non-outlier rows:  1048
no. of outlier rows:  33
rows in scaled_file_ext_frame:  1048


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


rb_size:  1048
rb_accuracy:  0.9351145038167938
no. of non-outlier rows:  653589
no. of outlier rows:  13798
rows in scaled_file_ext_frame:  653589


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


py_size:  653589
py_accuracy:  0.9958751025104347
no. of non-outlier rows:  0
no. of outlier rows:  864
ignoring file type:  gyp (All rows are outliers)


  return (a - mns) / sstd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


no. of non-outlier rows:  2619
no. of outlier rows:  144
rows in scaled_file_ext_frame:  2619
php_size:  2619
php_accuracy:  0.9938931297709923
ignoring file type:  gemspec (Not enough rows)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  28621
ignoring file type:  jsx (All rows are outliers)
ignoring file type:  mustache (Not enough rows)
ignoring file type:  diff (Not enough rows)
no. of non-outlier rows:  839082
no. of outlier rows:  28143
rows in scaled_file_ext_frame:  839082


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


java_size:  839082
java_accuracy:  0.9963293305557012
no. of non-outlier rows:  0
no. of outlier rows:  8474
ignoring file type:  gradle (All rows are outliers)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  10752
ignoring file type:  adoc (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  967
ignoring file type:  kt (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  33477
ignoring file type:  properties (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  1782
ignoring file type:  bat (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  4703
ignoring file type:  jar (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  1409
ignoring file type:  xsd (All rows are outliers)
ignoring file type:  multipart (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  328
ignoring file type:  aj (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  525
ignoring file type:  sql (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  560
ignoring file type:  factories (All rows are outliers)
ignor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


c_size:  56823
c_accuracy:  0.994227791074194
no. of non-outlier rows:  0
no. of outlier rows:  9905
ignoring file type:  tsx (All rows are outliers)


  return (a - mns) / sstd


no. of non-outlier rows:  129217
no. of outlier rows:  4665
rows in scaled_file_ext_frame:  129217


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


cc_size:  129217
cc_accuracy:  0.9894134034979105
no. of non-outlier rows:  0
no. of outlier rows:  65
ignoring file type:  index (All rows are outliers)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  63
ignoring file type:  pb (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  59
ignoring file type:  data-00000-of-00001 (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  4190
ignoring file type:  bzl (All rows are outliers)
no. of non-outlier rows:  8046
no. of outlier rows:  247
rows in scaled_file_ext_frame:  8046


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


m_size:  8046
m_accuracy:  0.9811133200795229
ignoring file type:  code-workspace (Not enough rows)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  88
ignoring file type:  xcscheme (All rows are outliers)
ignoring file type:  xcsettings (Not enough rows)
ignoring file type:  xcworkspacedata (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  55
ignoring file type:  xib (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  646
ignoring file type:  plist (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  50
ignoring file type:  pro (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  1266
ignoring file type:  pbxproj (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  120
ignoring file type:  bazel (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  90
ignoring file type:  whl (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  82
ignoring file type:  vue (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  56
ignoring 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


cs_size:  440
cs_accuracy:  0.9181818181818182
ignoring file type:  stderr (Not enough rows)
ignoring file type:  output (Not enough rows)
ignoring file type:  systemd (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  90
ignoring file type:  psm1 (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  vault (Not enough rows)
ignoring file type:  bak (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  970
ignoring file type:  rc (All rows are outliers)
ignoring file type:  bash (Not enough rows)
ignoring file type:  te (Not enough rows)
ignoring file type:  reg (Not enough rows)
ignoring file type:  pfx (Not enough rows)
ignoring file type:  iso (Not enough rows)
ignoring file type:  p12 (Not enough rows)
ignoring file type:  local (Not enough rows)
ignoring file type:  rpm (Not enough rows)
ignoring file type:  inventory (Not enough rows)
ignoring file type:  control (Not enough rows)
ignoring file type:  mof (Not enough rows)
ignoring file type:  psd1 (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  473
ignoring file type:  out (All rows are outliers)
ignoring file type:  jinja2_native_types (Not enough rows)
ignoring file type:  htpasswd (Not enough rows)
ignoring file type:  cert (Not enough rows)
no. of non-outlier rows:  0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


go_size:  2370
go_accuracy:  0.984822934232715
no. of non-outlier rows:  0
no. of outlier rows:  66
ignoring file type:  include (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  MD (Not enough rows)
ignoring file type:  dynamic (Not enough rows)
ignoring file type:  py3 (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  211
ignoring file type:  css_t (All rows are outliers)
ignoring file type:  gpg (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  57
ignoring file type:  asciidoc (All rows are outliers)
ignoring file type:  events (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  374
ignoring file type:  dat (All rows are outliers)
ignoring file type:  gram (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  274
ignoring file type:  props (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  1893
ignoring file type:  decTest (All rows are outliers)
ignoring file type:  tsv (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  92
ignoring file type:  rtf (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  600
ignoring

ignoring file type:  adb (Not enough rows)
ignoring file type:  emx (Not enough rows)
ignoring file type:  pas (Not enough rows)
ignoring file type:  586 (Not enough rows)
ignoring file type:  clp (Not enough rows)
ignoring file type:  mms (Not enough rows)
ignoring file type:  qpg (Not enough rows)
ignoring file type:  dj2 (Not enough rows)
ignoring file type:  gcc (Not enough rows)
ignoring file type:  sas (Not enough rows)
ignoring file type:  gpr (Not enough rows)
ignoring file type:  686 (Not enough rows)
ignoring file type:  pup (Not enough rows)
ignoring file type:  contrib (Not enough rows)
ignoring file type:  dir (Not enough rows)
ignoring file type:  cls (Not enough rows)
ignoring file type:  help (Not enough rows)
ignoring file type:  pm (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  165
ignoring file type:  mcp (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  2546
ignoring file type:  src (All rows are outliers)
ignoring file t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


cxx_size:  135
cxx_accuracy:  0.9117647058823529
no. of non-outlier rows:  0
no. of outlier rows:  147
ignoring file type:  f (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  swg (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  126
ignoring file type:  pxi (All rows are outliers)
no. of non-outlier rows:  710
no. of outlier rows:  11
rows in scaled_file_ext_frame:  710


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


hpp_size:  710
hpp_accuracy:  0.9775280898876404
ignoring file type:  enc (Not enough rows)
ignoring file type:  npy (Not enough rows)
ignoring file type:  npz (Not enough rows)
ignoring file type:  fig (Not enough rows)
ignoring file type:  lyx (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  54
ignoring file type:  pyf (All rows are outliers)
ignoring file type:  unused (Not enough rows)


  return (a - mns) / sstd


no. of non-outlier rows:  0
no. of outlier rows:  56
ignoring file type:  eps (All rows are outliers)
ignoring file type:  layout (Not enough rows)
ignoring file type:  nsi (Not enough rows)
ignoring file type:  fits (Not enough rows)
ignoring file type:  pkl (Not enough rows)
no. of non-outlier rows:  40937
no. of outlier rows:  1749
rows in scaled_file_ext_frame:  40937


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


scala_size:  40937
scala_accuracy:  0.9812408402540302
ignoring file type:  ConfigProvider (Not enough rows)
ignoring file type:  ServiceLoadedClass (Not enough rows)
ignoring file type:  ConnectorClientConfigOverridePolicy (Not enough rows)
ignoring file type:  ConnectRestExtension (Not enough rows)
ignoring file type:  header (Not enough rows)
ignoring file type:  ldiff (Not enough rows)
ignoring file type:  sbt (Not enough rows)
ignoring file type:  dll (Not enough rows)
ignoring file type:  kafka (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  281
ignoring file type:  clj (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  config (Not enough rows)
ignoring file type:  StyleCop (Not enough rows)
ignoring file type:  Targets (Not enough rows)
ignoring file type:  FxCop (Not enough rows)
ignoring file type:  40 (Not enough rows)
ignoring file type:  39 (Not enough rows)
ignoring file type:  38 (Not enough rows)
ignoring file type:  37 (Not enough rows)
ignoring file type:  36 (Not enough rows)
ignoring file type:  35 (Not enough rows)
ignoring file type:  34 (Not enough rows)
ignoring file type:  33 (Not enough rows)
ignoring file type:  32 (Not enough rows)
ignoring file type:  31 (Not enough rows)
ignoring file type:  30 (Not enough rows)
ignoring file type:  29 (Not enough rows)
ignoring file type:  28 (Not enough rows)
ignoring file type:  27 (Not enough rows)
ignoring file type:  26 (Not enough rows)
ignoring file type:  25 (Not enough rows)
ignoring file type:  24 (Not enough rows)
ignoring file type:  23 (Not enough rows)
ignoring file type:  22 (Not enough rows)
ignoring file ty

ignoring file type:  7-list-view-serializer-snapshot (Not enough rows)
ignoring file type:  6-long-value-with-proper-hash-code-serializer-snapshot (Not enough rows)
ignoring file type:  6-list-view-serializer-snapshot (Not enough rows)
ignoring file type:  7-context-state-serializer-data (Not enough rows)
ignoring file type:  7-transaction-state-serializer-snapshot (Not enough rows)
ignoring file type:  6-long-value-with-proper-hash-code-serializer-data (Not enough rows)
ignoring file type:  6-context-state-serializer-data (Not enough rows)
ignoring file type:  7-lockable-type-serializer-snapshot (Not enough rows)
ignoring file type:  6-java-serializer-data (Not enough rows)
ignoring file type:  6-list-view-serializer-data (Not enough rows)
ignoring file type:  7-java-serializer-snapshot (Not enough rows)
ignoring file type:  7-long-value-with-proper-hash-code-serializer-data (Not enough rows)
ignoring file type:  7-lockable-type-serializer-data (Not enough rows)
ignoring file type:  7

ignoring file type:  6-node-id-serializer-snapshot (Not enough rows)
ignoring file type:  7-event-id-serializer-snapshot (Not enough rows)
ignoring file type:  7-nullable-padded-serializer-snapshot (Not enough rows)
ignoring file type:  6-kryo-type-serializer-custom-snapshot (Not enough rows)
ignoring file type:  6-scala-try-serializer-snapshot (Not enough rows)
ignoring file type:  7-kryo-type-serializer-empty-config-data (Not enough rows)
ignoring file type:  6-shared-buffer-node-serializer-data (Not enough rows)
ignoring file type:  7-node-id-serializer-snapshot (Not enough rows)
ignoring file type:  7-shared-buffer-node-serializer-data (Not enough rows)
ignoring file type:  6-scala-try-serializer-data (Not enough rows)
ignoring file type:  7-kryo-type-serializer-empty-config-snapshot (Not enough rows)
ignoring file type:  6-dewey-number-serializer-data (Not enough rows)
ignoring file type:  6-dewey-number-serializer-snapshot (Not enough rows)
ignoring file type:  6-nullable-not-pad

ignoring file type:  7-double-primitive-array-serializer-snapshot (Not enough rows)
ignoring file type:  7-date-serializer-data (Not enough rows)
ignoring file type:  7-long-primitive-array-serializer-snapshot (Not enough rows)
ignoring file type:  6-int-primitive-array-serializer-data (Not enough rows)
ignoring file type:  6-long-serializer-data (Not enough rows)
ignoring file type:  6-big-int-serializer-snapshot (Not enough rows)
ignoring file type:  7-char-serializer-data (Not enough rows)
ignoring file type:  7-int-value-serializer-data (Not enough rows)
ignoring file type:  6-char-primitive-array-serializer-data (Not enough rows)
ignoring file type:  7-double-serializer-snapshot (Not enough rows)
ignoring file type:  6-boolean-primitive-array-serializer-data (Not enough rows)
ignoring file type:  7-int-primitive-array-serializer-data (Not enough rows)
ignoring file type:  6-boolean-primitive-array-serializer-snapshot (Not enough rows)
ignoring file type:  7-byte-serializer-snapsho

ignoring file type:  b4bcb0e9-5c9e-45dd-8963-1b163343544d (Not enough rows)
ignoring file type:  a88d5993-77bc-44ce-880b-9f2a43b59ab4 (Not enough rows)
ignoring file type:  8fec17e9-5d54-4fa9-aebb-70736fe03c82 (Not enough rows)
ignoring file type:  a70190d6-d080-43a8-b414-746b09d3a8a0 (Not enough rows)
ignoring file type:  666acf3e-935c-4621-8171-f7c897496524 (Not enough rows)
ignoring file type:  0af18f41-d8f8-4a4e-a92e-de12851be20b (Not enough rows)
ignoring file type:  1d423406-097a-4deb-bfde-d023d3477cd5 (Not enough rows)
ignoring file type:  f121b73d-ac74-4fbd-b70d-f13e51c9132c (Not enough rows)
ignoring file type:  a156884a-f090-4c3f-a271-0b63ab539c45 (Not enough rows)
ignoring file type:  6a837aa3-4736-4098-a878-fdeffe227628 (Not enough rows)
ignoring file type:  10833090-dd8c-4e36-884d-bb9758a3a8ef (Not enough rows)
ignoring file type:  9414371c-1fb9-4646-8e52-83af81a1634b (Not enough rows)
ignoring file type:  5db416c2-1714-4240-8bfb-c9380ac2b5cd (Not enough rows)
ignoring fil

ignoring file type:  base64 (Not enough rows)
ignoring file type:  joptsimple (Not enough rows)
ignoring file type:  webbit (Not enough rows)
ignoring file type:  hdrhistogram (Not enough rows)
ignoring file type:  7-empty-state-snapshot (Not enough rows)
ignoring file type:  edn (Not enough rows)
ignoring file type:  jython (Not enough rows)
ignoring file type:  jsch (Not enough rows)
ignoring file type:  xmlenc (Not enough rows)
ignoring file type:  scopt (Not enough rows)
ignoring file type:  grizzled-slf4j (Not enough rows)
ignoring file type:  6-boolean-serializer-data (Not enough rows)
ignoring file type:  6-array-type-serializer-snapshot (Not enough rows)
ignoring file type:  6-string-array-serializer-snapshot (Not enough rows)
ignoring file type:  6-string-array-serializer-data (Not enough rows)
ignoring file type:  6-byte-serializer-data (Not enough rows)
ignoring file type:  4-serializer-java-serialized (Not enough rows)
ignoring file type:  6-double-serializer-data (Not enou

ignoring file type:  swig (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  814
ignoring file type:  BUILD (All rows are outliers)
ignoring file type:  cpu-noavx-demo (Not enough rows)
ignoring file type:  gpu-noavx (Not enough rows)
ignoring file type:  gpu-demo (Not enough rows)
ignoring file type:  cpu-noavx-devel (Not enough rows)
ignoring file type:  cpu-devel (Not enough rows)
ignoring file type:  gpu-noavx-demo (Not enough rows)
ignoring file type:  gpu-devel (Not enough rows)
ignoring file type:  gpu-noavx-devel (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  69
ignoring file type:  cpu (All rows are outliers)
ignoring file type:  cpu-noavx (Not enough rows)
ignoring file type:  cpu-demo (Not enough rows)
ignoring file type:  yapf (Not enough rows)
ignoring file type:  prototxt (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  640
ignoring file type:  Dockerfile (All rows are outliers)
ignoring file type:  dockerfile (Not 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


mm_size:  5024
mm_accuracy:  0.9689490445859873
ignoring file type:  mojom (Not enough rows)
ignoring file type:  lnk (Not enough rows)
ignoring file type:  gitattributes (Not enough rows)
ignoring file type:  grdp (Not enough rows)
ignoring file type:  asar (Not enough rows)
ignoring file type:  cer (Not enough rows)
ignoring file type:  grd (Not enough rows)
ignoring file type:  arm32v7 (Not enough rows)
ignoring file type:  arm64v8 (Not enough rows)
ignoring file type:  workflow (Not enough rows)
ignoring file type:  attributes (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  705
ignoring file type:  gypi (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  chromium (Not enough rows)
ignoring file type:  circleci (Not enough rows)
ignoring file type:  arm64 (Not enough rows)
ignoring file type:  armv7 (Not enough rows)
ignoring file type:  pump (Not enough rows)
ignoring file type:  odp (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  99
ignoring file type:  PNG (All rows are outliers)
ignoring file type:  ppt (Not enough rows)
ignoring file type:  launch (Not enough rows)
ignoring file type:  exsd (Not enough rows)
ignoring file type:  JPG (Not enough rows)
ignoring file type:  php_old (Not enough rows)
ignoring file type:  xhtml (Not enough rows)
ignoring file type:  GIF (Not enough rows)
ignoring file type:  markdown (Not enough rows)
ignoring file type:  maven (Not enough rows)
ignoring file type:  kate-swp (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  178
ignoring file type:  mplstyle (All rows are outliers)
ignoring file type:  glade (Not enough rows)
ignoring file type: 

ignoring file type:  DocumentBuilderFactory (Not enough rows)
ignoring file type:  driver (Not enough rows)
ignoring file type:  DatatypeFactory (Not enough rows)
ignoring file type:  jjt (Not enough rows)
ignoring file type:  rej (Not enough rows)
ignoring file type:  incl (Not enough rows)
ignoring file type:  triggers (Not enough rows)
ignoring file type:  DynamicAccessHandler (Not enough rows)
ignoring file type:  smd (Not enough rows)
ignoring file type:  Init (Not enough rows)
ignoring file type:  nodebug (Not enough rows)
ignoring file type:  xpdl (Not enough rows)
ignoring file type:  goovy (Not enough rows)
ignoring file type:  attr (Not enough rows)
ignoring file type:  multi (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  499
ignoring file type:  podspec (All rows are outliers)
ignoring file type:  storyboard (Not enough rows)
no. of non-outlier rows:  134
no. of outlier rows:  9
rows in scaled_file_ext_frame:  134


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


swift_size:  134
swift_accuracy:  0.9411764705882353
no. of non-outlier rows:  0
no. of outlier rows:  58
ignoring file type:  xcconfig (All rows are outliers)


  return (a - mns) / sstd


ignoring file type:  gv (Not enough rows)
ignoring file type:  entitlements (Not enough rows)
ignoring file type:  input (Not enough rows)
ignoring file type:  stack (Not enough rows)
ignoring file type:  profmap (Not enough rows)
ignoring file type:  cpuprofile (Not enough rows)
ignoring file type:  so (Not enough rows)
ignoring file type:  android-base (Not enough rows)
ignoring file type:  aar (Not enough rows)
ignoring file type:  javascript (Not enough rows)
ignoring file type:  a (Not enough rows)
ignoring file type:  facebook (Not enough rows)
ignoring file type:  base (Not enough rows)
ignoring file type:  jsbundle (Not enough rows)
ignoring file type:  awk (Not enough rows)
ignoring file type:  bundle (Not enough rows)
ignoring file type:  applescript (Not enough rows)
ignoring file type:  graphql (Not enough rows)
ignoring file type:  dummy (Not enough rows)
ignoring file type:  litcoffee (Not enough rows)
ignoring file type:  wxi (Not enough rows)
ignoring file type:  arson 

ignoring file type:  seek (Not enough rows)
ignoring file type:  der (Not enough rows)
ignoring file type:  jfc (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  75
ignoring file type:  PostingsFormat (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  56
ignoring file type:  aff (All rows are outliers)
no. of non-outlier rows:  0
no. of outlier rows:  53
ignoring file type:  dic (All rows are outliers)
ignoring file type:  TODO (Not enough rows)
ignoring file type:  DocValuesFormat (Not enough rows)
ignoring file type:  nrm (Not enough rows)
ignoring file type:  mdtext (Not enough rows)
ignoring file type:  utf8 (Not enough rows)
no. of non-outlier rows:  0
no. of outlier rows:  289
ignoring file type:  alg (All rows are outliers)
ignoring file type:  brk (Not enough rows)
ignoring file type:  rbbi (Not enough rows)
ignoring file type:  flv (Not enough rows)
ignoring file type:  m4a (Not enough rows)
ignoring file type:  pages (Not enough rows)
