In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import sklearn, re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
regex_square_brackets = re.compile(r'(\[)|(\])')

In [3]:
bugs = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugs_less_columns.csv')
bug_notes = pd.read_csv('../datasets/lexical_semantic_preprocessed_mantis_bugnotes.csv')

In [4]:
bug_notes['bug_note'] = bug_notes['bug_note'].str.replace(regex_square_brackets, '')
bugs['additional_information'] = bugs['additional_information'].str.replace(regex_square_brackets, '')
bugs['description'] = bugs['description'].str.replace(regex_square_brackets, '')
bugs['summary'] = bugs['summary'].str.replace(regex_square_brackets, '')

In [5]:
df_bug_note_table = bug_notes.groupby(['bug_id'])['bug_note'].apply(','.join).to_frame('bug_notes').reset_index()
result = pd.merge(bugs, df_bug_note_table, how='left', left_on='id', right_on='bug_id')
result.fillna('', inplace=True)
result['textual_data'] = result['summary'] + ',' + result['description'] + ',' + result['additional_information'] + ',' + result['bug_notes']

In [6]:
max_features = 50
clustered = pd.DataFrame(columns=['id', 'reporter_id', 'severity', 'priority', 'cluster', 'cluster_#', 'cluster_dbscan_silhouette_mean'])
eps = 0.03
# decrease features by 500 in each iteration
# for each selection of max_features try to find good clusters
# apply recursion to increase eps by 0.01 at each step

In [7]:
def check_cluster_qualities(df, clustered_df, bugs_df, max_f, eps):
    clusters_found = False
    count = clustered_df['cluster_#'].max()
    if not count:
        
    for clu in set(df['cluster_dbscan']):
        if clu >= 0:
            temp_df = df[df['cluster_dbscan']==clu]
            if eps <= 0.2:
                decision_boundry = 0.5
            else:
                decision_boundry = 0.4
            if temp_df['cluster_dbscan_silhouette'].mean() > decision_boundry:
                count += 1
                clusters_found= True
                intracluster_ids = bugs_df[bugs_df['id'].isin(temp_df['id'])][['id', 'reporter_id', 'severity', 'priority']]
                intracluster_ids['cluster'] = '{}_{}_{}'.format(max_f, eps, clu, count)
                intracluster_ids['cluster_#'] = count
                intracluster_ids['cluster_dbscan_silhouette_mean'] = temp_df['cluster_dbscan_silhouette'].mean()
                clustered_df = pd.concat([clustered_df, intracluster_ids], axis=0)
                print "{} issues clustered".format(len(clustered_df))
    return clustered_df, clusters_found

In [8]:
eps_iterations = True
while(eps_iterations):
    max_features = 50
    while(True):
        bug_bugnotes_df = result[['id', 'textual_data']].copy()
        bug_bugnotes_df = bug_bugnotes_df[~(bug_bugnotes_df['id'].isin(clustered['id']))]
        vectorizer = TfidfVectorizer(min_df=2, max_df=0.9, max_features=max_features, stop_words='english')
        X = vectorizer.fit_transform(bug_bugnotes_df['textual_data'])
        print "Iteration for the combination max_features: {} and eps: {} started ...".format(max_features, eps)
        dbscan = DBSCAN(eps=eps, metric='cosine').fit_predict(X)
        bug_bugnotes_df['cluster_dbscan'] = dbscan
        clusters_found = False
        if len(set(bug_bugnotes_df['cluster_dbscan'])) > 1:
            sample_silhouette_values = silhouette_samples(X, dbscan)
            bug_bugnotes_df['cluster_dbscan_silhouette'] = sample_silhouette_values
            ### Search for good clusters
            clustered, clusters_found = check_cluster_qualities(bug_bugnotes_df, clustered, bugs, max_features, eps)
            #if clusters_found:
             #   max_features = 50
              #  eps = 0.03
        if not clusters_found:
            if max_features <= 100:
                max_features += 20
            elif max_features <= 1000:
                max_features += 100
            #elif max_features <= 2000:
            #    max_features += 500
            #elif max_features <= 10000:
            #    max_features += 2000
            elif max_features <= 20000:
                max_features += 5000
            else:
                break
    eps += 0.03
    if eps > 0.4:
        eps_iterations = False

Iteration for the combination max_features: 50 and eps: 0.03 started ...
75 issues clustered
82 issues clustered
88 issues clustered
107 issues clustered
113 issues clustered
121 issues clustered
143 issues clustered
159 issues clustered
165 issues clustered
183 issues clustered
219 issues clustered
236 issues clustered
253 issues clustered
264 issues clustered
276 issues clustered
292 issues clustered
297 issues clustered
315 issues clustered
323 issues clustered
328 issues clustered
335 issues clustered
349 issues clustered
370 issues clustered
375 issues clustered
387 issues clustered
392 issues clustered
404 issues clustered
413 issues clustered
442 issues clustered
449 issues clustered
456 issues clustered
469 issues clustered
476 issues clustered
500 issues clustered
510 issues clustered
517 issues clustered
525 issues clustered
531 issues clustered
541 issues clustered
546 issues clustered
551 issues clustered
556 issues clustered
570 issues clustered
575 issues clustered
Iterat

Iteration for the combination max_features: 70 and eps: 0.12 started ...
1189 issues clustered
Iteration for the combination max_features: 70 and eps: 0.12 started ...
Iteration for the combination max_features: 90 and eps: 0.12 started ...
1199 issues clustered
1204 issues clustered
1213 issues clustered
1219 issues clustered
1224 issues clustered
1229 issues clustered
Iteration for the combination max_features: 90 and eps: 0.12 started ...
1234 issues clustered
Iteration for the combination max_features: 90 and eps: 0.12 started ...
Iteration for the combination max_features: 110 and eps: 0.12 started ...
1241 issues clustered
1246 issues clustered
1251 issues clustered
Iteration for the combination max_features: 110 and eps: 0.12 started ...
Iteration for the combination max_features: 210 and eps: 0.12 started ...
1256 issues clustered
1261 issues clustered
Iteration for the combination max_features: 210 and eps: 0.12 started ...
Iteration for the combination max_features: 310 and e

Iteration for the combination max_features: 90 and eps: 0.21 started ...
1650 issues clustered
1655 issues clustered
1671 issues clustered
1688 issues clustered
1696 issues clustered
1699 issues clustered
1704 issues clustered
1719 issues clustered
1729 issues clustered
1735 issues clustered
1746 issues clustered
1752 issues clustered
1757 issues clustered
1762 issues clustered
Iteration for the combination max_features: 90 and eps: 0.21 started ...
1826 issues clustered
1830 issues clustered
1836 issues clustered
1840 issues clustered
1845 issues clustered
1850 issues clustered
1854 issues clustered
Iteration for the combination max_features: 90 and eps: 0.21 started ...
1859 issues clustered
1864 issues clustered
Iteration for the combination max_features: 90 and eps: 0.21 started ...
Iteration for the combination max_features: 110 and eps: 0.21 started ...
1877 issues clustered
1889 issues clustered
1897 issues clustered
1902 issues clustered
1906 issues clustered
1912 issues cluste

2891 issues clustered
2895 issues clustered
2897 issues clustered
2906 issues clustered
Iteration for the combination max_features: 50 and eps: 0.27 started ...
Iteration for the combination max_features: 70 and eps: 0.27 started ...
2913 issues clustered
2920 issues clustered
2926 issues clustered
2931 issues clustered
2934 issues clustered
2939 issues clustered
Iteration for the combination max_features: 70 and eps: 0.27 started ...
Iteration for the combination max_features: 90 and eps: 0.27 started ...
2945 issues clustered
2958 issues clustered
2963 issues clustered
2967 issues clustered
2974 issues clustered
2977 issues clustered
Iteration for the combination max_features: 90 and eps: 0.27 started ...
2982 issues clustered
2987 issues clustered
2992 issues clustered
Iteration for the combination max_features: 90 and eps: 0.27 started ...
Iteration for the combination max_features: 110 and eps: 0.27 started ...
3008 issues clustered
3014 issues clustered
3022 issues clustered
3026

3567 issues clustered
3571 issues clustered
3576 issues clustered
Iteration for the combination max_features: 110 and eps: 0.33 started ...
Iteration for the combination max_features: 210 and eps: 0.33 started ...
3581 issues clustered
3587 issues clustered
3591 issues clustered
3598 issues clustered
3603 issues clustered
3609 issues clustered
3614 issues clustered
Iteration for the combination max_features: 210 and eps: 0.33 started ...
3624 issues clustered
Iteration for the combination max_features: 210 and eps: 0.33 started ...
Iteration for the combination max_features: 310 and eps: 0.33 started ...
3629 issues clustered
3634 issues clustered
3639 issues clustered
3642 issues clustered
3647 issues clustered
Iteration for the combination max_features: 310 and eps: 0.33 started ...
3651 issues clustered
3654 issues clustered
Iteration for the combination max_features: 310 and eps: 0.33 started ...
3661 issues clustered
Iteration for the combination max_features: 310 and eps: 0.33 st

In [9]:
clustered

Unnamed: 0,id,reporter_id,severity,priority,cluster,cluster_#,cluster_dbscan_silhouette_mean
14,9792,82,50,30,50_0.03_0,,0.657621
26,9815,39,50,40,50_0.03_0,,0.657621
151,13425,39,50,30,50_0.03_0,,0.657621
152,13426,39,50,30,50_0.03_0,,0.657621
338,15005,39,50,30,50_0.03_0,,0.657621
806,6227,4,50,30,50_0.03_0,,0.657621
860,7474,39,50,30,50_0.03_0,,0.657621
963,9452,50,50,40,50_0.03_0,,0.657621
1025,10266,40,50,30,50_0.03_0,,0.657621
1247,11493,4,50,40,50_0.03_0,,0.657621


In [11]:
clustered[clustered['cluster_dbscan_silhouette_mean']>=0.35]

Unnamed: 0,id,reporter_id,severity,priority,cluster,cluster_#,cluster_dbscan_silhouette_mean
14,9792,82,50,30,50_0.03_0,,0.657621
26,9815,39,50,40,50_0.03_0,,0.657621
151,13425,39,50,30,50_0.03_0,,0.657621
152,13426,39,50,30,50_0.03_0,,0.657621
338,15005,39,50,30,50_0.03_0,,0.657621
806,6227,4,50,30,50_0.03_0,,0.657621
860,7474,39,50,30,50_0.03_0,,0.657621
963,9452,50,50,40,50_0.03_0,,0.657621
1025,10266,40,50,30,50_0.03_0,,0.657621
1247,11493,4,50,40,50_0.03_0,,0.657621


In [16]:
import math
if mathclustered['cluster_#'].max().isnull():
    print 123

AttributeError: 'float' object has no attribute 'isnull'