In [1]:
# Define a DEBUG flag with values from '0' to '5'. Default is '0' which is OFF. 
# Use this cautiously - we are not validating for this

DEBUG = 0

In [None]:
# Code to check if you have GPU and cuda working
# https://colab.research.google.com/notebooks/gpu.ipynb
#%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [2]:
# Use cuda - nvidia-smi if you have it installed
# Add a flag with Default as False. Don't change this unless you have cuda installed.
nvidia_cuda = False

In [3]:
#Read the data files and concantenate into a dataframe
import pandas as pd
import glob
import os

training_data_files_path = r'/home/kc/Projects/data_files/Training_data_from_public_git/'                     
all_files = glob.glob(os.path.join(training_data_files_path, "*.csv"))     # advisable to use os.path.join as this makes concatenation OS independent

df_from_each_file = (pd.read_csv(f) for f in all_files)
total_commits = pd.concat(df_from_each_file, ignore_index=True)
                             
if DEBUG >=2:
    print("Total No. of rows: ", total_commits.shape)

In [4]:
#Creating various features for each modification

# Note: We will add a prefix "feature" to variables to create the features for the ML model.

# From documentation: https://pydriller.readthedocs.io/en/latest/commit.html
#     dmm_unit_size (float): DMM metric value for the unit size property.
#     dmm_unit_complexity (float): DMM metric value for the unit complexity property.
#     dmm_unit_interfacing (float): DMM metric value for the unit interfacing property.

# https://pydriller.readthedocs.io/en/latest/modifications.html
#     complexity: Cyclomatic Complexity of the file
#     changed_methods: subset of _methods_ containing only the changed methods.

# Here is more about dmm: https://pydriller.readthedocs.io/en/latest/deltamaintainability.html
#     The delta-maintainability metric is the proportion of low-risk change in a commit. 
#     The resulting value ranges from 0.0 (all changes are risky) to 1.0 (all changes are low risk). 
#     It rewards making methods better, and penalizes making things worse.

# "total lines changed" is important but can be very misleading metric of a mod/commit. 
# We will create a feature which blunts the weight of nloc. From a code review perspective and
# general best practices, we can say that 3 commits of 30 lines each is better than a single commit of 90 lines
# We should probably increase weightage if the file has been added or deleted. Future versions?
total_commits['total_changed'] = total_commits['number_lines_added'] + total_commits['number_lines_removed']
total_commits['feature_total_changed'] = (total_commits['total_changed'] ** 0.7)

# Feature to account for Methods/functions added and deleted. We propose that writing
#     more methods should be exponentially weighted since it makes the code more modular 
# What happens when a function/method is added and another is deleted. This is a significant
#      change which we are currently ignoring. Some future version to accomodate this?
# We need to prevent 'gaming' by programmers who may write useless methods to increase score
total_commits['n_functions_add_del'] = abs(total_commits['number_functions_before'] - total_commits['number_functions_after'])
total_commits['feature_add_del_functions'] = (total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.5)  
# Other comments:
# 1. Why are we raising 'n_functions_add_del' to this particular exponent? Here is the thought process:
#     We wanted to make sure that the 'avg' mod (i.e. the middle cluster) ended up with features with approx. the same scale.
#     This ensures that nloc does not determine the 'value' of the cluster. The rest of the feaures of the mod should be able to
#     have a non-trivial impact on the'value' of the cluster.
# 2. We will not worry about maintainig scale for the rest of the clusters. We will let that self organise.
# 3. We used https://www.desmos.com/calculator to see the curves for the exponents and tweaked them to ensure that the 
#     'sum of centroids' for the 'avg' or 'middle' cluster was similarly affected by each of the feature centroid. 
# (Is this a hack? Yep but it sorta plays into our solution as you will see)
# Note to self: Write a paper giving a more mathematical description to the solution     


# Feature to account for methods which were edited. From experience, we postulate that
#    if the effort to change 2 methods is 'x', then the effort to change 4 methods 
#    slightly more than 2x since there will be more complexity involved. 
# Here too, the exponents have been selected keeping in mind the 'middle' cluster
total_commits['feature_changed_functions'] = (total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.5)  

# 1. dmm values are given for the commit. i.e. these features will repeat as many times as
#    no. of mods that exist in the commit. To balnce this, we will divide by no. of 
#    mods in commit which is nothing but the no. of files in the commit.
# 2. If zero methods are added/deleted, we need to account for effort for changed methods. 
#     Lets make surethat the max of either of these features is selected.
#total_commits['feature_dmm_size'] = total_commits['dmm_unit_size'] * (1 / total_commits['number_of_mod_files']) * total_commits[['feature_add_del_functions','feature_changed_functions']].max(axis=1) 
#total_commits['feature_dmm_unit_complexity'] = total_commits['dmm_unit_complexity'] * (1 / total_commits['number_of_mod_files']) * total_commits[['feature_add_del_functions','feature_changed_functions']].max(axis=1)
#total_commits['feature_dmm_unit_interfacing'] = total_commits['dmm_unit_interfacing'] * (1 / total_commits['number_of_mod_files']) * total_commits[['feature_add_del_functions','feature_changed_functions']].max(axis=1)
#total_commits['feature_dmm_size'] = total_commits['dmm_unit_size'] * (1 / total_commits['number_of_mod_files']) * (((total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.3)) + ((total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.3))) 
#total_commits['feature_dmm_unit_complexity'] = total_commits['dmm_unit_complexity'] * (1 / total_commits['number_of_mod_files']) * (((total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.3)) + ((total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.3))) 
#total_commits['feature_dmm_unit_interfacing'] = total_commits['dmm_unit_interfacing'] * (1 / total_commits['number_of_mod_files']) * (((total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.3)) + ((total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.3))) 
total_commits['feature_dmm_size'] = total_commits['dmm_unit_size'] * (((total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.3)) + ((total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.3))) 
total_commits['feature_dmm_unit_complexity'] = total_commits['dmm_unit_complexity'] * (((total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.3)) + ((total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.3))) 
total_commits['feature_dmm_unit_interfacing'] = total_commits['dmm_unit_interfacing'] * (((total_commits['n_functions_add_del'] ** 1.3) * (total_commits['total_changed'] ** 0.3)) + ((total_commits['number_functions_edited'] ** 1.1) * (total_commits['total_changed'] ** 0.3))) 

# We should add a feature to reflect number of lines of comments in the modification. 
# We should have a feature measring code churn - Code churn can be due to sub-optimal coding or due to too many change requests. Need to distinguish this. 

In [5]:
# Function to Remove outliers. 
from scipy import stats
import numpy as np

def filter_outliers(data_frame):
    
    # Calculate z_scores and if zscore is greater than '3', then its an outlier
    
    # Get non-Outliers: 
    data_frame_non_outliers = data_frame[(np.abs(stats.zscore(data_frame.select_dtypes(exclude=['object','bool']))) < 3).all(axis=1)]

    # Collect outliers
    data_frame_outliers = data_frame[~(np.abs(stats.zscore(data_frame.select_dtypes(exclude=['object','bool']))) < 3).all(axis=1)]
    
    return data_frame_non_outliers, data_frame_outliers

In [6]:
# Prepare frames for each file type extension

def prepare_frame(total_commits, file_ext):
    import numpy as np
    
    #Filter the mods based on file type extension
    file_ext_commits = total_commits[total_commits['file_ext']==file_ext]

    ml_commits = file_ext_commits[['hash','Author','feature_total_changed','feature_add_del_functions', 
                                   'feature_changed_functions', 'feature_dmm_unit_complexity','feature_dmm_size',
                                   'feature_dmm_unit_interfacing', 'language_supported']]

    # Resetting the frame's index. It is required to retain the integrity of the frame
    ml_commits = ml_commits.reset_index().drop(columns = 'index')

    # Temporarily dropping text columns for numeric processing
    ml_commits_noText = ml_commits.drop(columns = ['Author','hash','language_supported'])

    # Explicitely converting fields to numeric types and filling the NaNs with zeros
    ml_commits_numeric = ml_commits_noText.apply(pd.to_numeric,errors ='coerce').fillna(0)

    # Adding the Author column back to create a 'total' data frame
    ml_commits_all_coloumns = ml_commits_numeric.copy()
    ml_commits_all_coloumns['Author'] = ml_commits['Author']
    ml_commits_all_coloumns['hash'] = ml_commits['hash']
    ml_commits_all_coloumns['language_supported'] = ml_commits['language_supported']

    return ml_commits_all_coloumns

In [7]:
# Function to scale data in the frame

def scale_frame(data_frame):
    import pandas as pd
    import numpy as np

    from sklearn.preprocessing import MinMaxScaler
    
    # Use minMax scaler since this does not distort
    # https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
    scaler = MinMaxScaler()
    data_frame_numeric = data_frame.drop(columns = ['Author','hash'])
    scaled_data_frame = scaler.fit_transform(data_frame_numeric)
    
    return scaled_data_frame, scaler

In [8]:
def create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame, scaler, k, gmm_models_folder, 
                   centroids_folder):
    
    if nvidia_cuda == False:
        # Initializing the regular CPU based Gaussian mixture model
        from sklearn.mixture import GaussianMixture
        mix = GaussianMixture(n_components=k, random_state=42)
    else:
        return
    
    # Learning the Gaussian mixture model from data   
    mix.fit(scaled_file_ext_frame)

    # Saving the parameters of Gaussian mixture model in a file
    import pickle
    vfilename = gmm_models_folder+file_ext+'_cpu_gmm_model_pickle.sav'
    pickle.dump(mix, open(vfilename, 'wb'))

    # Predicting the cluster labels of the data for the Gaussian mixture model
    cluster_frame = pd.DataFrame(scaled_file_ext_frame)
    gmm_hash_clusters = mix.predict(cluster_frame)

    # Collecting the mean of the Gaussian mixture model in 'gmmcentroids'
    gmm_centroids = mix.means_
    gmm_covariances = mix.covariances_
    combinedCentroids = gmm_centroids[gmm_hash_clusters].sum(axis=1)

    # Converting the input data series into pan
    file_ext_frame['Cluster'] = gmm_hash_clusters
    real_centroids = scaler.inverse_transform(gmm_centroids)

    # Write these to dataframe
    real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=['feature_total_changed',
                                              'feature_add_del_functions', 'feature_changed_functions',
                                'feature_dmm_unit_complexity','feature_dmm_size','feature_dmm_unit_interfacing'])

    # Add a cloumn for summing all centroids "This is the value of the individual clusters"
    real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis = 1)
    
    # Save centroids of the clusters to a file for audit
    centroid_file = centroids_folder+file_ext+'_cpu_centroids.csv'
    real_centroids_dataFrame.to_csv(centroid_file)
    
    return file_ext_frame

In [None]:
# Now we need a GPU implementation for GMM. Not easy to find or to run. Most are old. 
# Finally found something which was working. Downloaded it and including this into the notebook.
# Note that this code uses diagonal covariance. Full covariance would have been better but we
#           can make do with diagonal for now.
# https://mg.readthedocs.io/importing-local-python-modules-from-jupyter-notebooks/sys-path-in-notebook/path-notebook.html
# https://github.com/ldeecke/gmm-torch
# 

if nvidia_cuda == True:
    import os
    import sys
    sys.path.insert(0, os.path.abspath('/home/kc/Projects/gpu_sklearn_tensor/gmm-torch/'))

In [None]:
def create_cluster_gpu(scaled_file_ext_frame, file_ext, file_ext_frame, scaler, k, gmm_models_folder, 
                   centroids_folder):
    
    import pandas as pd
    import numpy as np
    import torch
    import tensorflow as tf
    
    # Instantiate GMM and xg boost models
    if nvidia_cuda == True:
        # Initializing the GPU enabled Gaussian mixture model
        from gmm import GaussianMixture
        dimensions = scaled_file_ext_frame.shape[1] # No of 'features'. 
        mix_gpu = GaussianMixture(k, dimensions)
        torch.cuda.set_device('cuda:0')
        mix_gpu.cuda()
    else:
        return

    # Convert pandas dataframe/numpy arrray to a tensor
    scaled_file_ext_tensor_cpu = torch.tensor(list(scaled_file_ext_frame))
    scaled_file_ext_tensor = scaled_file_ext_tensor_cpu.to(device='cuda')
    
    # Running the Gaussian mixture model from data   
    mix_gpu.fit(scaled_file_ext_tensor)

    # Saving the parameters of Gaussian mixture model in a file
    import pickle
    gmm_model_file = gmm_models_folder+file_ext+'_gpu_gmm_model_pickle.sav'
    pickle.dump(mix_gpu, open(gmm_model_file, 'wb'))

    # Predicting the cluster labels of the data for the Gaussian mixture model
    gmm_clusters = mix_gpu.predict(scaled_file_ext_tensor, probs=True)
    classlabels = torch.argmax(gmm_clusters, 1)
    # Move this to CPU for further processing
    if DEBUG >= 3:
      print('classlabels ', classlabels)
    gmm_hash_clusters = classlabels.to('cpu')
    if DEBUG >= 3:
      print('gmm_hash_clusters ', gmm_hash_clusters)
    
    # Collecting the mean of the Gaussian mixture model in 'gmmcentroids'
    means_gpu = mix_gpu.mu 
    if DEBUG >= 3:
      print('means_gpu: ', means_gpu)  
    vars_gpu = mix_gpu.var 
    if DEBUG >= 3:
      print('vars_gpu: ', vars_gpu)    
    means_cpu = means_gpu.to('cpu')
    if DEBUG >= 3:
      print('means_cpu: ', means_cpu)
    vars_cpu = vars_gpu.to('cpu')
    if DEBUG >= 3:
      print('vars_cpu: ', vars_cpu)
    gmm_centroids = means_cpu.numpy()[0,:]

    # Converting the input data series into pan
    # https://stackoverflow.com/questions/57942487/how-to-convert-torch-tensor-to-pandas-dataframe
    #file_ext_frame['Cluster'] = pd.DataFrame(gmm_hash_clusters.numpy())
    file_ext_frame['Cluster'] = gmm_hash_clusters.numpy()
    #print('allClusters ', gmm_hash_clusters.numpy())
    #print('Clusters: ', file_ext_frame['Cluster'])
    # Realised that some prediction are Nan. research pointed to this:
    #     https://github.com/AlexanderFabisch/gmr/issues/5
    #     https://www.researchgate.net/post/What-is-the-way-to-get-rid-off-NaN-values-in-GMM-UBM
    #     https://github.com/AlexanderFabisch/gmr/issues/5#issuecomment-312727530
    # We should probably increase number of clusters to make sure we are being thorough.
    # Let us remove all rows where prediction is NaNs
    file_ext_frame.dropna(inplace=True)
    
    # Now get the 'real value' of centroids by inverse scaling
    real_centroids = scaler.inverse_transform(gmm_centroids)

    # Write these to dataframe
    real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=['feature_total_changed',
                                              'feature_add_del_functions', 'feature_changed_functions',
                                'feature_dmm_unit_complexity','feature_dmm_size','feature_dmm_unit_interfacing'])

    # Add a cloumn for summing all centroids "This is the value of the individual clusters"
    real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis = 1)
    
    # Save centroids of the clusters to a file for audit
    centroid_file = centroids_folder+file_ext+'_gpu_centroids.csv'
    real_centroids_dataFrame.to_csv(centroid_file)
    
    # Some memory shortage issue. Lets free it up.
    #del scaled_file_ext_tensor
    #del gmm_hash_clusters
    #del means_cpu, gmm_centroids
    #del real_centroids_dataFrame
    del mix_gpu
    torch.cuda.empty_cache()
     
    return file_ext_frame

In [9]:
def create_boost_model(file_ext_frame, file_ext, xgboost_models_folder):
    
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    import pickle

    # Instantiate GMM and xg boost models
    if nvidia_cuda == True:
        # GPU has issues which we can ignore: 
        #  https://medium.com/data-design/xgboost-gpu-performance-on-low-end-gpu-vs-high-end-cpu-a7bc5fcd425b
        # Instantiate the XgBoost model with GPU enabled
        from xgboost import XGBClassifier
        xgboost_model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
    else:
        # Instantiate the CPU XgBoost model
        from xgboost import XGBClassifier
        xgboost_model = XGBClassifier()
        # xgboost_model = XGBClassifier(tree_method='gpu_hist', gpu_id=0)
        
    # Remove text fields before numeric manipulations
    file_ext_frame_numeric_xg = file_ext_frame.drop(columns=['Author','hash'])

    # Prepare the 'X' and 'Y' for the model
    X_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg.drop(columns = ['Cluster'])
    Y_file_ext_frame_numeric_xg = file_ext_frame_numeric_xg['Cluster']

    # Split the data for 'Training' and 'Testing' datasets
    X_train, X_test, y_train, y_test = train_test_split(X_file_ext_frame_numeric_xg, Y_file_ext_frame_numeric_xg, random_state=7)


    # Training the xgboost classifier
    xgboost_model.fit(X_train, y_train)

    # Predicting the class labels of test data for xgboost classifier
    y_pred = xgboost_model.predict(X_test)

    # check accuracy
    if DEBUG >= 1:
        accuracy = accuracy_score(y_test, y_pred)
        print(file_ext+'_size: ', file_ext_frame.shape[0])
        print(file_ext+'_accuracy: ', accuracy)
    
    # Save the model to a file
    if nvidia_cuda == True:
      filename = xgboost_models_folder+file_ext+'_gpu_xgboost_model.sav'
    else:
      filename = xgboost_models_folder+file_ext+'_cpu_xgboost_model.sav'
    pickle.dump(xgboost_model, open(filename, 'wb'))
    
    #Free up memory
    #del file_ext_frame_numeric_xg
    #del X_file_ext_frame_numeric_xg
    #del X_train, X_test, y_train, y_test
    #del y_pred
    del xgboost_model

In [10]:
%%time

# https://ipython.readthedocs.io/en/stable/interactive/magics.html
# We have to use %%time but NOT %%timeit. %%timeit does multiple runs.

import os
import pathlib
import re
import glob


# Create a coloumn 'file_ext' which is the file 'type'
#total_commits['file_ext'] = total_commits['file_path'].apply(lambda x:pathlib.Path(str(x)).suffix).apply(lambda x:re.split(r"[^a-zA-Z0-9\s\++\_\-]",x)[-1])

# For files without any extension, mark 'file_ext' as "NoExt" 
#total_commits.file_ext = total_commits.file_ext.replace(r'^\s*$', 'NoExt', regex=True)


# Prepare a list of all unique file extensions
unique_extensions = total_commits['file_ext'].unique()
# Print for Debugging 
if DEBUG >=1:
    print('no. of unique_extensions: ', len(unique_extensions))


# Folder to save models and centroids
gmm_models_folder = '/home/kc/Projects/data_files/sav_files/gmm_sav/'
centroids_folder = '/home/kc/Projects/data_files/sav_files/centroids/'
xgboost_models_folder = '/home/kc/Projects/data_files/sav_files/xgboost_sav/'
        
# Remove all previous models 
#for folders in [gmm_models_folder, centroids_folder, xgboost_models_folder]:
#    for file in glob.glob(folders+'*'):
#        os.remove(file)

# Set number of clusters
k=5
    
# For every file extension: prepare the data frame, create cluster, train xgBoost model and save it.
# We should change this to only those extensions supported by lizard/pydriller
for file_ext in unique_extensions:
    
    # Extract data frame for the specific file type extension
    file_ext_frame = prepare_frame(total_commits, file_ext)
    if DEBUG >= 2:
        print('Shape of file_ext_frame:', file_ext_frame.shape, file_ext)

    # We seem to have some training data with missing data. Filter them here
    if 'language_supported' not in file_ext_frame.columns:
        if DEBUG >= 2:
          print('ignoring file type: ', file_ext, '(missing column - language_supported)')      
        continue
    
    # We need a good sample size for accurate results. 
    #    We will ignore'file_ext' if it has less than 50 rows 
    if file_ext_frame.shape[0] < 50:
        if DEBUG >= 2:
          print('ignoring file type: ', file_ext, '(Not enough rows)')
        continue
           
    # Check if we support the file_ext
    if file_ext_frame['language_supported'].values[0] == False:
        if DEBUG >= 2:
            print('Language not supported (ignoring):', file_ext)
        continue 

    # Remove outliers from the frame (only if you reasonable amount of data points) 
    file_ext_frame.drop(columns='language_supported', inplace = True)
    file_ext_frame_non_outliers, file_ext_frame_outliers = filter_outliers(file_ext_frame)
    if DEBUG >= 2:
        print('no. of non-outlier rows: ', file_ext_frame_non_outliers.shape[0])
        print('no. of outlier rows: ', file_ext_frame_outliers.shape[0])
    
    # Scale the data    
    # There is a peculiar behaviour here. For lot of file_ext (= 'md', 'html', etc,) All data points are 
    #     showing up as outliers. This is very funny. We should probably restrict our processing to files
    #     supported by 'lizard/pydriller'. 
    # We are forced to check that we have at least 1 non_outlier.
    if file_ext_frame_non_outliers.shape[0] >= 1:
        scaled_file_ext_frame, scaler = scale_frame(file_ext_frame_non_outliers)
        if DEBUG >= 2:
            print('rows in scaled_file_ext_frame: ', scaled_file_ext_frame.shape[0])
        
        # Create the actual clusters from the data.
        if nvidia_cuda == True:
            clustered_frame = create_cluster_gpu(scaled_file_ext_frame, file_ext, file_ext_frame_non_outliers, scaler, k, gmm_models_folder, centroids_folder)
        else:
            clustered_frame = create_cluster(scaled_file_ext_frame, file_ext, file_ext_frame_non_outliers, scaler, k, gmm_models_folder, centroids_folder)

        # Train xgboost model for each extension
        create_boost_model(clustered_frame, file_ext, xgboost_models_folder)
    else:
        if DEBUG >= 2:
          print('ignoring file type: ', file_ext, '(All rows are outliers)')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file_ext_frame['Cluster'] = gmm_hash_clusters


CPU times: user 38min 4s, sys: 5min 34s, total: 43min 39s
Wall time: 7min 18s
