In [None]:
import pickle
import pandas as pd

# Read the source file of raw commit data into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs as-is where this exact file exists.
pred_commits = pd.read_csv('C:/Users/aveli/Downloads/elasticray.csv')

In [None]:
# Prepare the features from raw commit data

# Total churn of a commit: lines added plus lines removed.
pred_commits['total_changed'] = pred_commits['lines_added'] + pred_commits['lines_removed']

# Where the recorded size is 0, use the number of changed lines instead so the
# ratio below does not divide a non-zero churn by zero. The column is replaced
# as a whole: a chained `df['size'].loc[...] = ...` write goes through a
# temporary Series, raises SettingWithCopyWarning and has no effect under
# pandas copy-on-write.
pred_commits['size'] = pred_commits['size'].mask(pred_commits['size'] == 0,
                                                 pred_commits['total_changed'])

# Share of the file that changed, and the complexity weighted by that share.
# Rows where both size and total_changed are 0 give 0/0 = NaN here; the
# fillna(0) further down turns those into 0.
pred_commits['ratio_changed'] = pred_commits['total_changed'] / pred_commits['size']
pred_commits['rated_complexity'] = pred_commits['ratio_changed'] * pred_commits['complexity']

# DMM metrics weighted by the amount of change.
pred_commits['total_dmm_size'] = pred_commits['total_changed'] * pred_commits['dmm_unit_size']
pred_commits['total_dmm_unit_complexity'] = pred_commits['total_changed'] * pred_commits['dmm_unit_complexity']
pred_commits['total_dmm_unit_interfacing'] = pred_commits['total_changed'] * pred_commits['dmm_unit_interfacing']

# We picked the sqrt of no_of_mod_files to reduce weightage of this feature
pred_commits['scaled_rated_complexity'] = pred_commits['rated_complexity'] * (pred_commits['no._of_mod_files'] ** 0.5)


# Keep only the identifying columns and the six model features.
pred_ml_commits = pred_commits[['hash','Author','Committer','committed_date','total_changed','rated_complexity',
                               'total_dmm_unit_complexity','total_dmm_size','total_dmm_unit_interfacing', 'scaled_rated_complexity']]

# Resetting the frame's index to 0..n-1 without keeping the old index as a column.
# The identifying columns are re-attached by index below, so both frames must share it.
pred_ml_commits = pred_ml_commits.reset_index(drop=True)

# The text columns need to be dropped before converting all the fields into numeric types
pred_ml_commits_na = pred_ml_commits.drop(columns=['Author','hash','Committer','committed_date'])

# Converting the fields to numeric types; unparsable values become NaN and every NaN is filled with zero
pred_ml_commits_numeric = pred_ml_commits_na.apply(pd.to_numeric, errors='coerce').fillna(0)

# Adding the identifying columns back (after the feature columns)
pred_ml_commits_numeric['Author'] = pred_ml_commits['Author']
pred_ml_commits_numeric['hash'] = pred_ml_commits['hash']
pred_ml_commits_numeric['Committer'] = pred_ml_commits['Committer']
pred_ml_commits_numeric['committed_date'] = pred_ml_commits['committed_date']

# Numeric-only copy of the features, used as the model input
pred_ml_commits_numeric_na = pred_ml_commits_numeric.drop(columns=['Author','hash','committed_date','Committer'])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Alias, not a copy: columns added to ml_commits_nout later also show up on
# pred_ml_commits_numeric.
ml_commits_nout = pred_ml_commits_numeric

# Pick the numeric feature columns by name. Dropping only the text columns
# would also keep any result column that other cells add to this same frame
# (e.g. 'predicted_cluster'), so re-running this cell after them would pass
# extra columns to the scaler.
pred_ml_commits_numeric_all = ml_commits_nout[['total_changed', 'rated_complexity',
                                               'total_dmm_unit_complexity', 'total_dmm_size',
                                               'total_dmm_unit_interfacing', 'scaled_rated_complexity']]

# Scaling the data
# NOTE(review): the scaler is fitted on the data being predicted, so the
# min/max come from this file alone. Presumably the saved GMM was trained on
# data scaled with different bounds; confirm whether the training scaler
# should be loaded and reused instead.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(pred_ml_commits_numeric_all)

In [None]:
# Get the pickled model
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
filename = 'C:/Users/aveli/Downloads/finalized_model.sav'
# The context manager closes the file once the model is loaded.
with open(filename, 'rb') as model_file:
    xboost_model = pickle.load(model_file)

In [None]:
# Predict the cluster using the model
from xgboost import XGBClassifier

# The model is given the numeric feature frame as prepared earlier (not the
# min-max scaled array). The returned labels are then stored on the full frame
# next to the identifying columns.
# NOTE(review): presumably the pickled model was trained on unscaled features - confirm.
predicted_clusters = xboost_model.predict(pred_ml_commits_numeric_na)
pred_ml_commits_numeric['predicted_cluster'] = predicted_clusters

In [None]:
# Save the features, identifying columns and predicted cluster to CSV.
# No `index` argument is passed, so pandas also writes the row index as the first column.
pred_ml_commits_numeric.to_csv('C:/Users/aveli/Downloads/elasticray_predicted.csv')

In [None]:
# Loading the previously saved gmm model.
from sklearn.mixture import GaussianMixture

# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that come from a trusted source.
vfilename = 'C:/Users/aveli/Downloads/gmm_cluster_model.sav'
# The context manager closes the file once the model is loaded.
with open(vfilename, 'rb') as gmm_file:
    mix = pickle.load(gmm_file)

# As per our logic from https://github.com/kcramakrishna/cg/issues/10
# We need P(x)
# The mixture model works on the min-max scaled features.
cluster_frame = pd.DataFrame(data_scaled)

# Cluster label per row according to the mixture model, and the component means
# (in scaled feature space).
gmm_hash_clusters = mix.predict(cluster_frame)
gmm_centroids = mix.means_

In [None]:
# Undo the scaling on the centroids so their values can be sanity-checked
real_centroids = scaler.inverse_transform(gmm_centroids)

# One row per centroid, one column per feature, followed by two derived columns:
#   Sum_centroids           - sum over all feature columns of the row
#                             (https://github.com/kcramakrishna/cg/issues/10)
#   original_cluster_labels - the row's index value
real_centroids_dataFrame = pd.DataFrame(
    real_centroids,
    columns=['total_changed','rated_complexity',
             'total_dmm_unit_complexity','total_dmm_size','total_dmm_unit_interfacing', 'scaled_rated_complexity'],
).assign(
    Sum_centroids=lambda frame: frame.sum(axis=1),
    original_cluster_labels=lambda frame: frame.index,
)

# Persist the centroid table
real_centroids_dataFrame.to_csv('C:/Users/aveli/Downloads/totalCommits_centroids.csv')

In [None]:
# Lookup table: original cluster label -> sum of that centroid's feature values
centroid_map = dict(zip(real_centroids_dataFrame['original_cluster_labels'].values,
                        real_centroids_dataFrame['Sum_centroids'].values))

In [None]:
import numpy as np

# Zero-filled column vector, one row per predicted label, that will receive
# the probability of each predicted cluster label
probability_for_labels = np.zeros(shape=(len(predicted_clusters), 1), dtype=float)

In [None]:
# Probability of belonging to each cluster
member_probs = mix.predict_proba(cluster_frame)

# For every row, take the probability of the cluster that was predicted for
# that row and write it into the pre-allocated column vector.
row_positions = np.arange(len(predicted_clusters))
probability_for_labels[:, 0] = member_probs[row_positions, predicted_clusters]

In [None]:
# Store the predicted label and the mixture-model probability of that label on
# the commit frame. 'probablities' receives the (n, 1) array as a single column.
ml_commits_nout['Cluster'] = predicted_clusters
ml_commits_nout['probablities'] = probability_for_labels


In [None]:
# Attach to every commit the centroid sum of its cluster.
# The column is built once and assigned whole. Writing element by element
# through `ml_commits_nout['sum_centroid'].values[i] = ...` only reaches the
# frame while `.values` is a writable view of its data, and raises under
# pandas copy-on-write, where that array is read-only.
# A label missing from centroid_map still raises KeyError.
ml_commits_nout['sum_centroid'] = np.array(
    [centroid_map[label] for label in ml_commits_nout['Cluster'].values],
    dtype=float,
)

In [None]:
# Score of a commit: the centroid sum of its cluster weighted by the
# probability of that cluster
ml_commits_nout['mod_score'] = ml_commits_nout['sum_centroid'].mul(ml_commits_nout['probablities'])

In [None]:
# Every author that occurs in the data starts with a total of 0
centroid_authors = dict.fromkeys(ml_commits_nout['Author'], 0)

In [None]:
# Total mod_score per author.
# The totals are reset here so that running this cell again starts from zero
# instead of adding every score on top of the previous run's totals.
centroid_authors = dict.fromkeys(ml_commits_nout['Author'], 0)
for author, score in zip(ml_commits_nout['Author'], ml_commits_nout['mod_score']):
    centroid_authors[author] += score

In [None]:
# Display the author -> total mod_score mapping
centroid_authors