\# Developer: Ali Hashaam (ali.hashaam@initos.com) <br>
\# 10th January 2019 <br>

\# © 2019 initOS GmbH <br>
\# License MIT <br>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import sklearn, re
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score

In [7]:
def check_cluster_qualities(df, clustered_df, bugs_df, max_f, eps):
    """Append the DBSCAN clusters whose mean silhouette clears a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Issues of the current run; the columns 'id', 'cluster_dbscan' and
        'cluster_dbscan_silhouette' are read.
    clustered_df : pandas.DataFrame
        Issues accepted so far; its 'cluster_#' column supplies the running
        cluster number.
    bugs_df : pandas.DataFrame
        Source of the 'id', 'reporter_id', 'severity' and 'priority' columns
        copied for every accepted issue.
    max_f, eps
        Parameters of the current run; only used to build the 'cluster'
        label ('<max_f>_<eps>_<dbscan label>') and, for eps, to pick the
        acceptance threshold.

    Returns
    -------
    tuple
        (clustered_df extended with the accepted issues, True if at least
        one cluster was accepted in this call).
    """
    # The threshold depends only on eps, so it is computed once:
    # 0.7 for eps <= 0.2, 0.3 otherwise.
    decision_boundry = 0.7 if eps <= 0.2 else 0.3

    clusters_found = False
    # Continue numbering after the highest cluster number seen so far.
    count = 0 if clustered_df.empty else clustered_df['cluster_#'].max()
    if math.isnan(count):
        count = 0

    # Negative labels are skipped; sorting makes the numbering independent
    # of set iteration order.
    labels = sorted(label for label in set(df['cluster_dbscan']) if label >= 0)
    for clu in labels:
        temp_df = df[df['cluster_dbscan'] == clu]
        silhouette_mean = temp_df['cluster_dbscan_silhouette'].mean()
        if silhouette_mean > decision_boundry:
            count += 1
            clusters_found = True
            # .copy() so the column assignments below do not write into a
            # view of bugs_df (avoids SettingWithCopyWarning).
            intracluster_ids = bugs_df.loc[
                bugs_df['id'].isin(temp_df['id']),
                ['id', 'reporter_id', 'severity', 'priority'],
            ].copy()
            # The original passed `count` as an unused fourth format
            # argument; the label itself is unchanged.
            intracluster_ids['cluster'] = '{}_{}_{}'.format(max_f, eps, clu)
            intracluster_ids['cluster_#'] = count
            intracluster_ids['cluster_dbscan_silhouette_mean'] = silhouette_mean
            clustered_df = pd.concat([clustered_df, intracluster_ids], axis=0)
            # Single-argument print() behaves the same on Python 2 and 3.
            print("{} issues clustered".format(len(clustered_df)))
    return clustered_df, clusters_found

In [None]:
# Load the preprocessed bug and bug-note tables and strip square brackets
# from every free-text column.
regex_square_brackets = re.compile(r'(\[)|(\])')
bugs = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugs_less_columns.csv')
bug_notes = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugnotes.csv')
# regex=True is explicit: recent pandas defaults to regex=False and rejects
# a compiled pattern in that mode.
bug_notes['bug_note'] = bug_notes['bug_note'].str.replace(regex_square_brackets, '', regex=True)
for column in ('additional_information', 'description', 'summary'):
    bugs[column] = bugs[column].str.replace(regex_square_brackets, '', regex=True)

# Collapse all notes of a bug into one comma-separated string and attach it
# to the bug row; bugs without notes keep an empty string after fillna.
df_bug_note_table = bug_notes.groupby(['bug_id'])['bug_note'].apply(','.join).to_frame('bug_notes').reset_index()
result = pd.merge(bugs, df_bug_note_table, how='left', left_on='id', right_on='bug_id')
result.fillna('', inplace=True)
# Single text field per issue, used as the TF-IDF input.
result['textual_data'] = result['summary'] + ',' + result['description'] + ',' + result['additional_information'] + ',' + result['bug_notes']

# Initial value only; the search loop resets max_features to 500 before use.
max_features = 50
# Accumulator for all issues assigned to an accepted cluster.
clustered = pd.DataFrame(columns=['id', 'reporter_id', 'severity', 'priority', 'cluster', 'cluster_#', 'cluster_dbscan_silhouette_mean'])
# Starting DBSCAN eps for the search.
eps = 0.03
# Search strategy (implemented in the loop cell that follows):
# for each eps, try increasing TF-IDF vocabulary sizes (max_features) and
# keep re-clustering the not-yet-clustered issues while good clusters are found;
# then increase eps by 0.03 and repeat until eps exceeds 0.4.

In [104]:
# Sweep DBSCAN's eps upwards in steps of 0.03 until it exceeds 0.4. For every
# eps, walk through the TF-IDF vocabulary sizes in feature_schedule; at each
# size, keep re-clustering the issues that are still unassigned for as long
# as check_cluster_qualities accepts at least one new cluster.
feature_schedule = (500, 1000, 6000)
eps_iterations = True
while eps_iterations:
    for max_features in feature_schedule:
        clusters_found = True
        while clusters_found:
            clusters_found = False
            # Only issues that have not been assigned to a cluster yet.
            remaining = ~result['id'].isin(clustered['id'])
            bug_bugnotes_df = result.loc[remaining, ['id', 'textual_data']].copy()
            if bug_bugnotes_df.empty:
                # Everything is clustered; the vectorizer cannot be fitted
                # on an empty corpus.
                break
            vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, max_features=max_features, stop_words='english')
            X = vectorizer.fit_transform(bug_bugnotes_df['textual_data'])
            # Single-argument print() behaves the same on Python 2 and 3.
            print("Iteration for the combination max_features: {} and eps: {} started ...".format(max_features, eps))
            dbscan = DBSCAN(eps=eps, metric='cosine').fit_predict(X)
            bug_bugnotes_df['cluster_dbscan'] = dbscan
            # Silhouette values need more than one distinct label.
            if len(set(bug_bugnotes_df['cluster_dbscan'])) > 1:
                # NOTE(review): silhouette_samples is called without a metric
                # while DBSCAN uses metric='cosine' -- confirm this is intended.
                sample_silhouette_values = silhouette_samples(X, dbscan)
                bug_bugnotes_df['cluster_dbscan_silhouette'] = sample_silhouette_values
                # Search for good clusters
                clustered, clusters_found = check_cluster_qualities(bug_bugnotes_df, clustered, bugs, max_features, eps)
    # Rounding keeps eps (which ends up in the cluster labels) free of
    # accumulated floating-point drift.
    eps = round(eps + 0.03, 2)
    eps_iterations = eps <= 0.4


In [26]:
# Write every clustered issue to disk (no index column).
clustered.to_csv('../datasets/clusters_for_outsourcing', index=False)

# One row per (cluster number, mean silhouette) pair with its issue count.
cluster_keys = ['cluster_#', 'cluster_dbscan_silhouette_mean']
issues_per_cluster = clustered.groupby(cluster_keys)['cluster_#'].count()
sources = issues_per_cluster.to_frame('issues').reset_index()

In [None]:
# Results when the threshold for the silhouette coefficient is set to 0.5, 0.7 and 0.3