In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler

In [78]:
#import pandas as pd
tensorflow_commits = pd.read_csv('C:/Users/aveli/Downloads/tensorflow.csv')


#tensorflow_commits = pd.read_csv('/home/kc/Projects/data_files/tensorflow.csv')
vscode_commits=pd.read_csv('C:/Users/aveli/Downloads/vscode.csv')
react_commits=pd.read_csv('C:/Users/aveli/Downloads/react-native.csv')

total_commits=tensorflow_commits.append(vscode_commits, ignore_index=True)
total_commits=total_commits.append(react_commits, ignore_index=True)
                             
total_commits.shape

(709762, 21)

In [79]:
#Creating various features for each modification
# Total number of lines changed
total_commits['total_changed'] = total_commits['lines_added'] + total_commits['lines_removed']

# Fraction of lines changed per total numbe of lines in file
# We need to account for the fact that new files added with have existing size as '0' and divide by '0' is indeterminate
total_commits['size'].loc[total_commits['size'] == 0] = total_commits['total_changed']
total_commits['ratio_changed'] = total_commits['total_changed'] / total_commits['size']

# Need to weigh the complexity by quantum of change. 
total_commits['rated_complexity'] = total_commits['ratio_changed'] * total_commits['complexity'] * total_commits['total_changed']

# weighing the dmm params by the total changed lines
total_commits['total_dmm_size'] = total_commits['total_changed'] * total_commits['dmm_unit_size']
total_commits['total_dmm_unit_complexity'] = total_commits['total_changed'] * total_commits['dmm_unit_complexity']
total_commits['total_dmm_unit_interfacing'] = total_commits['total_changed'] * total_commits['dmm_unit_interfacing']

# We picked the sqrt of no_of_mod_files to reduce weightage of this feature
total_commits['scaled_rated_complexity']=total_commits['rated_complexity'] * (total_commits['no._of_mod_files'] ** 0.5)

total_commits.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


(709762, 28)

In [80]:
# Preprocessing the data. ML requires the data to be converted to numericals
#ml_commits = total_commits[['hash','Author', 'no._of_mod_files', 'dmm_unit_size',
#       'dmm_unit_complexity', 'dmm_unit_interfacing', 'complexity', 'functions', 'lines_added', 'lines_removed', 
#       'tokens', 'type']]

ml_commits = total_commits[['hash','Author','total_changed','rated_complexity',
                            'total_dmm_unit_complexity','total_dmm_size','total_dmm_unit_interfacing', 'scaled_rated_complexity']]

# Resetting the frame's index. It is required to retain the integrity of the frame
ml_commits = ml_commits.reset_index().drop(columns = 'index')

# Temporarily dropping text columns for numeric processing
ml_commits_noText = ml_commits.drop(columns = ['Author','hash'])

# Explicitely converting fields to numeric types and filling the NaNs with zeros
ml_commits_numeric = ml_commits_noText.apply(pd.to_numeric,errors ='coerce').fillna(0)

# Adding the Author column back to create a 'total' data frame
ml_commits_total = ml_commits_numeric.copy()
ml_commits_total['Author'] = ml_commits['Author']
ml_commits_total['hash'] = ml_commits['hash']

print(ml_commits_total.shape)


(709762, 8)


In [81]:
# Attempt to remove outliers. May not be required
from scipy import stats
import numpy as np
# Calculate z_scores (and if zscore is greater than '3', then its an outlier) and collect normal subset.
ml_commits_nout = ml_commits_total[(np.abs(stats.zscore(ml_commits_total.select_dtypes(exclude='object'))) < 3).all(axis=1)]
ml_commits_nout.to_csv('C:/Users/aveli/Downloads/totalCommits_nout.csv')

# Collect outliers
ml_commits_out = ml_commits_total[~(np.abs(stats.zscore(ml_commits_total.select_dtypes(exclude='object'))) < 3).all(axis=1)]
ml_commits_out.to_csv('C:/Users/aveli/Downloads/totalCommits_out.csv')



In [82]:
# Applying scaler to regular data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler

# Use minMax scaler since this does not distort
# https://towardsdatascience.com/scale-standardize-or-normalize-with-scikit-learn-6ccc7d176a02
scaler = MinMaxScaler()
ml_commits_nout_numeric = ml_commits_nout.drop(columns = ['Author','hash'])
data_scaled = scaler.fit_transform(ml_commits_nout_numeric)

In [83]:
mix = GaussianMixture(n_components=5,random_state=42)
s = mix.fit(data_scaled)
cluster_frame = pd.DataFrame(data_scaled)
gmmhash_clusters = mix.predict(cluster_frame)
gmmcentroids = mix.means_
gmmcentroids


array([[1.80774932e-03, 7.12723923e-07, 8.05812368e-04, 6.76028487e-04,
        9.01465246e-04, 2.24471370e-07],
       [5.48828399e-02, 8.66249956e-04, 4.16912417e-02, 3.01486633e-02,
        4.47182590e-02, 2.45470616e-04],
       [2.74691436e-01, 1.74056240e-02, 1.59041418e-01, 1.35382271e-01,
        1.69151256e-01, 9.09486254e-03],
       [1.25222674e-02, 4.88573676e-05, 1.11096466e-02, 8.15527423e-03,
        1.21797644e-02, 1.43515101e-05],
       [2.03943134e-02, 8.53335747e-06, 2.65581311e-05, 2.47453989e-05,
        4.09561424e-05, 1.11290198e-06]])

In [84]:
combinedCentroids = gmmcentroids[gmmhash_clusters].sum(axis=1)
# adding column with combined centroid values to the original dataframe 
ml_commits_nout['center'] = combinedCentroids
#print(combinedCentroids)
# Creating a dictionary with combined centroid values and target cluster labels
unique_centroids = np.unique(combinedCentroids).tolist()
cluster_labels = np.arange(5).tolist()
cluster_dict = dict(zip(unique_centroids,cluster_labels))
#print(g)
ml_commits_nout['fixed_cluster'] = ml_commits_nout['center'].map(cluster_dict)
# Converting the input data series into pan
ml_commits_nout['Cluster'] = gmmhash_clusters
#ml_commits_nout[ml_commits_nout['Cluster']==0]
#ml_commits_nout['sum_value'] = ml_commits_nout['scaled_rated_complexity']+ml_commits_nout['total_dmm_unit_interfacing']+ ml_commits_nout['total_dmm_size']+ml_commits_nout['total_dmm_unit_complexity']+ml_commits_nout['rated_complexity']+ml_commits_nout['ratio_changed']+ml_commits_nout['total_changed']
#ml_commits_nout[ml_commits_nout['Cluster']==3]  
ml_commits_nout

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,total_changed,rated_complexity,total_dmm_unit_complexity,total_dmm_size,total_dmm_unit_interfacing,scaled_rated_complexity,Author,hash,center,fixed_cluster,Cluster
0,14,13.177872,14.000000,7.175000,7.525000,26.355745,A. Unique TensorFlower,0ef962b1a5b4e80a7029b0b159af6817b12a04df,0.004192,0,0
1,75,573.979592,75.000000,38.437500,40.312500,1147.959184,A. Unique TensorFlower,0ef962b1a5b4e80a7029b0b159af6817b12a04df,0.044030,2,3
2,4,0.301887,4.000000,2.050000,2.150000,0.603774,A. Unique TensorFlower,0ef962b1a5b4e80a7029b0b159af6817b12a04df,0.004192,0,0
3,47,266.518875,47.000000,24.087500,25.262500,533.037750,A. Unique TensorFlower,0ef962b1a5b4e80a7029b0b159af6817b12a04df,0.044030,2,3
4,19,58.992683,18.050000,18.050000,18.050000,117.985366,George Karpenkov,ef47bbbd57cba8fcc7ae11df8c7141d6c68ba0d0,0.044030,2,3
...,...,...,...,...,...,...,...,...,...,...,...
709757,175,0.000000,133.586576,99.679797,156.212240,0.000000,Ben Alpert,a15603d8f1ecdd673d80be318293cee53eb4475d,0.172553,3,1
709758,41,246.000000,31.297426,23.353552,36.598296,4808.025790,Ben Alpert,a15603d8f1ecdd673d80be318293cee53eb4475d,0.044030,2,3
709759,157,157.000000,119.846242,89.427018,140.144696,3068.536785,Ben Alpert,a15603d8f1ecdd673d80be318293cee53eb4475d,0.172553,3,1
709760,574,33866.000000,438.163968,326.949734,512.376148,661904.883795,Ben Alpert,a15603d8f1ecdd673d80be318293cee53eb4475d,0.764767,4,2


In [85]:
# Look at the values of inverted scaling of centroids for sanity
real_centroids = scaler.inverse_transform(gmmcentroids)

# Write these to dataframe
real_centroids_dataFrame = pd.DataFrame(real_centroids, columns=['total_changed','rated_complexity',
                            'total_dmm_unit_complexity','total_dmm_size','total_dmm_unit_interfacing', 'scaled_rated_complexity'])

# Add a cloumn for summing all coloumns
real_centroids_dataFrame['Sum_centroids'] = real_centroids_dataFrame.sum(axis = 1)

#You can write it out as csv if required.
real_centroids_dataFrame.to_csv('C:/Users/aveli/Downloads/totalCommits_centroids.csv')

In [86]:
ml_commits_nout.groupby('fixed_cluster')['total_changed'].count()

fixed_cluster
0    472234
1     68990
2    110468
3     47968
4      9188
Name: total_changed, dtype: int64

In [87]:
ml_commits_nout.groupby('Cluster')['total_changed'].count()

Cluster
0    472234
1     47968
2      9188
3    110468
4     68990
Name: total_changed, dtype: int64

In [88]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.naive_bayes import GaussianNB 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
import pickle
from sklearn.metrics import r2_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

ml_commits_nout_numeric_xg = ml_commits_nout.drop(columns=['Author','hash','center','Cluster'])


# Prepare the 'X' and 'Y' for the model
X_ml_commits_nout_numeric_xg = ml_commits_nout_numeric_xg.drop(columns = ['fixed_cluster'])
Y_ml_commits_nout_numeric_xg = ml_commits_nout_numeric_xg['fixed_cluster']

# Split the data for 'Training' and 'Testing' datasets
X_train, X_test, y_train, y_test = train_test_split(X_ml_commits_nout_numeric_xg, Y_ml_commits_nout_numeric_xg, random_state=7)




In [89]:
xgboostmodel = XGBClassifier()

In [90]:
xgboostmodel.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [91]:
y_pred = xgboostmodel.predict(X_test)
 
# accuracy on X_test 
accuracy = accuracy_score(y_test, y_pred)
#sameaccuracy = xgboostmodel.score(X_test, y_test) 
print('accuracy', accuracy)
#print('sameaccuracy', sameaccuracy)

accuracy 0.998419971559488


In [92]:
X_train.columns

Index(['total_changed', 'rated_complexity', 'total_dmm_unit_complexity',
       'total_dmm_size', 'total_dmm_unit_interfacing',
       'scaled_rated_complexity'],
      dtype='object')

In [93]:
import pickle

filename = 'C:/Users/aveli/Downloads/finalized_model.sav'
pickle.dump(xgboostmodel, open(filename, 'wb'))