In [None]:
#Read the data files
import pandas as pd

# Load the per-repository commit metrics. Each CSV is kept in its own frame so the
# individual repositories remain available for inspection.
tensorflow_commits = pd.read_csv('/home/kc/Projects/data_files/tensorflow.csv')
vscode_commits = pd.read_csv('/home/kc/Projects/data_files/vscode.csv')
react_commits = pd.read_csv('/home/kc/Projects/data_files/react-native.csv')

# Stack all repositories into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0.
total_commits = pd.concat(
    [tensorflow_commits, vscode_commits, react_commits],
    ignore_index=True,
)
                             

#total_commits.shape

In [None]:
# Backend switch: use h2o4gpu if you have it installed.

# Defaults to False. Only set this to True when the kernel has h2o4gpu available;
# later cells read this flag to choose between the h2o4gpu and scikit-learn imports.
h2o4gpu_enabled = False

In [None]:
# Create the derived features for each modification.

# Total number of lines changed.
total_commits['total_changed'] = total_commits['lines_added'] + total_commits['lines_removed']

# Fraction of lines changed relative to the total number of lines in the file.
# Newly added files have an existing size of 0, and dividing by 0 is indeterminate, so
# for those rows the changed-line count is used as the size.
# The write goes through a single .loc call on the frame itself: the chained form
# total_commits['size'].loc[...] = ... may write to an intermediate object and is not
# guaranteed to update total_commits (it does nothing under pandas copy-on-write).
is_new_file = total_commits['size'] == 0
total_commits.loc[is_new_file, 'size'] = total_commits.loc[is_new_file, 'total_changed']
# NOTE: rows where both size and total_changed are 0 still yield NaN here.
total_commits['ratio_changed'] = total_commits['total_changed'] / total_commits['size']

# Weigh the complexity by the quantum of change.
total_commits['rated_complexity'] = (
    total_commits['ratio_changed'] * total_commits['complexity'] * total_commits['total_changed']
)

# Weigh the dmm parameters by the total changed lines.
total_commits['total_dmm_size'] = total_commits['total_changed'] * total_commits['dmm_unit_size']
total_commits['total_dmm_unit_complexity'] = total_commits['total_changed'] * total_commits['dmm_unit_complexity']
total_commits['total_dmm_unit_interfacing'] = total_commits['total_changed'] * total_commits['dmm_unit_interfacing']

# The square root of no_of_mod_files is used to reduce the weight of this feature.
total_commits['scaled_rated_complexity'] = (
    total_commits['rated_complexity'] * (total_commits['no._of_mod_files'] ** 0.5)
)

In [None]:
# Preprocessing: the ML steps need purely numeric inputs, so the engineered features
# are separated from the identifying text columns, coerced to numbers, and rejoined.
text_columns = ['hash', 'Author']
feature_columns = ['total_changed', 'rated_complexity',
                   'total_dmm_unit_complexity', 'total_dmm_size',
                   'total_dmm_unit_interfacing', 'scaled_rated_complexity']

# Select the working columns and renumber the rows 0..n-1 so later column
# assignments align row-for-row.
ml_commits = total_commits[text_columns + feature_columns].reset_index(drop=True)

# Numeric-only view of the frame (text columns removed for now).
ml_commits_noText = ml_commits.drop(columns=['Author', 'hash'])

# Force every feature to a numeric dtype; anything unparseable becomes NaN and is
# then replaced by zero.
ml_commits_numeric = ml_commits_noText.apply(pd.to_numeric, errors='coerce').fillna(0)

# Re-attach the text columns to obtain the 'total' frame (features, then Author, then hash).
ml_commits_total = ml_commits_numeric.assign(
    Author=ml_commits['Author'],
    hash=ml_commits['hash'],
)

In [None]:
# Remove outliers.
from scipy import stats
import numpy as np

# A row counts as "normal" only when every numeric feature has |z-score| < 3.
# The mask is computed once and reused for both subsets so they are guaranteed to be
# exact complements of each other.
numeric_features = ml_commits_total.select_dtypes(exclude='object')
is_normal = (np.abs(stats.zscore(numeric_features)) < 3).all(axis=1)

# Take explicit copies: later cells add columns to ml_commits_nout, and writing to an
# un-copied slice of ml_commits_total triggers SettingWithCopyWarning.
ml_commits_nout = ml_commits_total[is_normal].copy()
ml_commits_nout.to_csv('/home/kc/junk/totalCommits_nout.csv')

# Collect outliers (everything that failed the check above).
ml_commits_out = ml_commits_total[~is_normal].copy()
ml_commits_out.to_csv('/home/kc/junk/totalCommits_out.csv')

In [None]:
# Applying scaler to regular data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Pick the MinMaxScaler implementation that matches the selected backend.
if h2o4gpu_enabled:
    from h2o4gpu.preprocessing import MinMaxScaler
else:
    from sklearn.preprocessing import MinMaxScaler

# Min-max scaling is used because it does not distort the data; see
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
scaler = MinMaxScaler()

# Scale only the numeric feature columns of the non-outlier commits.
ml_commits_nout_numeric = ml_commits_nout.drop(columns=['Author', 'hash'])
data_scaled = scaler.fit_transform(ml_commits_nout_numeric)

In [None]:
# Try to figure out the optimum number of clusters by plotting the inertia curve
# and looking for the 'elbow'.
# NOTE: this cell is intentionally disabled. Everything below is one triple-quoted
# string, so none of it executes; the following cell does the same search with
# scikit-learn's KMeans.

''' Realised that h2o4GPU has not implemented "Intertia". Hence we will have to use regular Kmeans library for intertia. 
Please ignore this cell entirely.


import h2o4gpu

SSE = []

for cluster in range(1,20):
    kmeans_ss = h2o4gpu.KMeans(n_gpus=1, n_clusters = cluster, init='k-means++', random_state = 42, backend=h2o4gpu)
    %time kmeans_ss.fit(data_scaled)
    SSE.append(kmeans_ss.inertia_)
    print(kmeans_ss.cluster_centers_)
    print(len(SSE))
    print(str(SSE))
    
    # converting the results into a dataframe and plotting them
frame = pd.DataFrame({'Cluster':range(1,20), 'SSE':SSE})
plt.figure(figsize=(12,6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
'''

In [None]:
# Try and figure out how many clusters are optimum
# Plot the inertia curve to find the 'elbow'
# You don't need to run this for every iteration. Just uncomment and run whenever you need to

# Realised that h2o4GPU has not implemented "Intertia". Hence we will have to use regular Kmeans library for intertia.
# http://docs.h2o.ai/h2o4gpu/latest-stable/h2o4gpu-py-docs/html/_modules/h2o4gpu/solvers/kmeans.html

# Set this to 'True' if you want to plot graph to find elbow
find_elbow = False

if (find_elbow == True): 
    from sklearn.cluster import KMeans

    SSE = []

    for cluster in range(1,20):
        %time kmeans_ss = KMeans(n_clusters = cluster, init='k-means++', random_state = 42)
        kmeans_ss.fit(data_scaled)
        SSE.append(kmeans_ss.inertia_)

    # converting the results into a dataframe and plotting them
    frame = pd.DataFrame({'Cluster':range(1,20), 'SSE':SSE})
    plt.figure(figsize=(12,6))
    plt.plot(frame['Cluster'], frame['SSE'], marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Inertia')

In [None]:
# Instantiate the KMeans object with the cluster count discovered from the elbow plot.
k = 5

if h2o4gpu_enabled:
    import h2o4gpu
    # NOTE(review): `backend` is given the h2o4gpu module object itself, as in the
    # original code - confirm this is what h2o4gpu.KMeans expects.
    kmeans = h2o4gpu.KMeans(n_gpus=1, n_clusters=k, init='k-means++', max_iter=20, random_state=42, backend=h2o4gpu)
else:
    from sklearn.cluster import KMeans
    # Use the imported class directly. The import above binds only `KMeans`; the name
    # `sklearn` is not defined at this point, so `sklearn.cluster.KMeans` raised a
    # NameError on a fresh kernel.
    kmeans = KMeans(n_clusters=k, init='k-means++', max_iter=20, random_state=42)

# Create the model by fitting the KMeans object on the scaled non-outlier data.
trained_model = kmeans.fit(data_scaled)

# Retrieve the centroids.
centroids = kmeans.cluster_centers_

# KMeans cluster labels change from one training run to the next, but they need to be
# preserved for runs with multiple/incremental input data sets.
# Workaround: sum all the values of each centroid and rank the clusters by that sum.
# Assumption: the bigger the values in a centroid, the higher the original feature
# values. This holds only for this data set and these features and still needs to be
# confirmed/verified.
combinedCentroids = centroids[trained_model.labels_].sum(axis=1)

# Work on an explicit copy so the new columns are never written to a slice of another
# frame (avoids SettingWithCopyWarning regardless of how ml_commits_nout was built).
ml_commits_nout = ml_commits_nout.copy()

# Add a column with the combined centroid value of each row's cluster.
ml_commits_nout['center'] = combinedCentroids

# Map each distinct combined centroid value (np.unique returns them in ascending order)
# to a stable label 0..k-1.
unique_centroids = np.unique(combinedCentroids).tolist()
cluster_labels = np.arange(k).tolist()
cluster_dict = dict(zip(unique_centroids, cluster_labels))
ml_commits_nout['fixed_cluster'] = ml_commits_nout['center'].map(cluster_dict)
#ml_commits_nout

In [None]:
# Undo the scaling on the centroids so they can be sanity-checked in the original
# feature units.
real_centroids = scaler.inverse_transform(centroids)

# Write these to a dataframe. The column names are taken from the frame that was passed
# to the scaler, so they always match the scaler's feature order instead of relying on
# a second hand-maintained list.
real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=ml_commits_nout_numeric.columns)

# Add a column holding the sum of all feature columns for each centroid.
real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis=1)

# Write it out as csv.
real_centroids_dataFrame.to_csv('/home/kc/junk/totalCommits_centroids.csv')

In [None]:
# Count the commits in each cluster.
ml_clustered = ml_commits_nout.groupby(['fixed_cluster'], as_index=True).count()

# Only the last expression of a cell is rendered automatically, so the intermediate
# views are shown explicitly with display(); as bare expressions they produced no output.
display(ml_clustered['hash'])
display(ml_commits_nout[ml_commits_nout['fixed_cluster'] == 4].head(5))

# Persist the clustered commits.
ml_commits_nout.to_csv('/home/kc/junk/total_commits_clusters_202007281715.csv')

In [None]:
# ml_commits_nout.columns

In [None]:
# Train the classifier
from xgboost import XGBClassifier
if (h2o4gpu_enabled == True):
    import h2o4gpu as sklearn
else:
    import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Keep only the numeric features plus the cluster label: the text columns and the
# helper 'center' column are not model inputs.
ml_commits_nout_numeric_xg = ml_commits_nout.drop(columns=['Author', 'hash', 'center'])

# Features (X) are everything except the label; the target (Y) is the fixed cluster id.
X_ml_commits_nout_numeric_xg = ml_commits_nout_numeric_xg.drop(columns=['fixed_cluster'])
Y_ml_commits_nout_numeric_xg = ml_commits_nout_numeric_xg['fixed_cluster']

# Hold out part of the data for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_ml_commits_nout_numeric_xg,
    Y_ml_commits_nout_numeric_xg,
    random_state=7,
)

# Build the classifier with default settings and fit it on the training split.
model = XGBClassifier()
model.fit(X_train, y_train)

# Predict the held-out rows and report the fraction classified correctly.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(accuracy)

In [None]:
# Take the metrics module from whichever backend is active.
if h2o4gpu_enabled:
    from h2o4gpu import metrics
else:
    from sklearn import metrics

# Silhouette score of the trained clustering on the scaled data (shown as cell output).
cluster_assignments = trained_model.labels_
metrics.silhouette_score(data_scaled, cluster_assignments)

In [None]:
# Pickle the XGBoost model
import pickle

filename = '/home/kc/Projects/data_files/finalized_model.sav'

# Use a context manager so the file handle is flushed and closed even if pickling
# fails; the previous bare open() call never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)