In [None]:
!pip install pandas
!pip install pickle5
!pip install matplotlib
!pip install scattertext
!pip install sentence_transformers
!nvidia-smi
!pip install transformers

In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
import pandas as pd
import pickle5 as pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import string
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Load the dataset
path = "inser path here"
df = pickle.load(open(path, 'rb'))

In [None]:
# Display Issue_Type distribution 
df.Issue_Type.value_counts()

In [None]:
# Load the Filtered Excel file.
filterd_df = pd.read_excel('Filtered.xlsx', index_col=0)

# Locate the filterd entires in the main dataframe.
filterd_indexes_df = df.loc[df.index.isin(filterd_df.index.tolist())]

df['Issue_Type_Verified'] = None
df.loc[df.index.isin(filterd_df.index.tolist()), ['Issue_Type_Verified']] = filterd_df['Issue_Type_Verified']

In [None]:
import ast

# Remove qutation mark of Issue_Type.
filterd_df['Issue_Type'] =  filterd_df[['Issue_Type']].applymap(lambda x: x.replace('"', ''))

# Remove qutation mark of Issue_Type_Manual.
filterd_df['Issue_Type_Manual'] =  filterd_df[['Issue_Type_Manual']].applymap(lambda x: x.replace('"', ''))

# Convert Issue_Type_Manual to dict.
filterd_df['Issue_Type_Manual'] =  filterd_df[['Issue_Type_Manual']].applymap(lambda x: ast.literal_eval(x))

In [None]:
# Print Issue_Type "Bug" That were verified as "Improvement".
filterd_df.loc[(filterd_df['Issue_Type_Verified'] == '"improvement"') & (filterd_df['Issue_Type'] == 'Bug')][['Issue_Type', 'Issue_Type_Verified']]

Unnamed: 0,Issue_Type,Issue_Type_Verified
12,Bug,"""improvement"""
70,Bug,"""improvement"""
83,Bug,"""improvement"""
96,Bug,"""improvement"""
98,Bug,"""improvement"""
...,...,...
109018,Bug,"""improvement"""
148997,Bug,"""improvement"""
149006,Bug,"""improvement"""
149028,Bug,"""improvement"""


In [None]:
# Modifiy Issue_Type "Bug" to "Improvemnet" according to Issue_Type_Verified.
df.loc[(df['Issue_Type'] == 'Bug') & (df['Issue_Type_Verified'] == '"improvement"'), ['Issue_Type']] = 'Improvement'

In [None]:
# Calculate the percentages of Issue_Type "New Feature" accrose all projects.
project_list = df.loc[df['Issue_Type'] == '"Bug"'].groupby('Project').count().index.tolist()
newfeature_projects = df.loc[df['Issue_Type'] == '"New Feature"'].groupby('Project').count().index.tolist()

no_newFeature_projects = [project for project in project_list if not project in newfeature_projects]
no_newFeature_projects
project_newFeature_perc = df.loc[df['Issue_Type'] == '"New Feature"'].groupby('Project').count().Issue_Type.values
project_newFeature_perc = np.insert(project_newFeature_perc, project_list.index('"cayenne"'), 0)
project_newFeature_perc = np.insert(project_newFeature_perc, project_list.index('"derby"'), 0)
project_newFeature_perc

array([505, 149, 138, 168,  79,   0,   6,  30,  35,  73,  44,  42,  19,
         1,  17, 117,   6,  37, 214, 161,  16,  39,  21,  42,  25,  48,
       191,   0,  10,  98, 103, 144, 101, 115, 333,  20, 112,  64,  35,
        69,  43, 252,  65,  41, 264, 133, 182,  70, 261,  72, 443,  81,
       412, 263, 177, 164,  82,  63, 134, 203, 183, 341, 119, 176, 141,
        11, 319,  80, 344, 125,  51, 203,  24,  58,   8, 239, 161])

In [None]:
# Calculate the percentages of Issue_Type "Bug" and "Improvement" accrose all projects.
total_bug_count = df.loc[df['Issue_Type'] == '"Bug"'].count()[0]
total_Improvement_count = df.loc[df['Issue_Type'] == '"Improvement"'].count()[0]
total_NewFeature_count = df.loc[df['Issue_Type'] == '"New Feature"'].count()[0]
project_list = df.groupby('Project').count().index.tolist()
project_bug_perc = df.loc[df['Issue_Type'] == '"Bug"'].groupby('Project').count().Issue_Type.values / total_bug_count * 100
project_Improvement_perc = df.loc[df['Issue_Type'] == '"Improvement"'].groupby('Project').count().Issue_Type.values / total_Improvement_count * 100
project_NewFeature_perc = project_newFeature_perc / total_NewFeature_count * 100
label_percentage = pd.DataFrame()
label_percentage['Project'] = project_list
label_percentage['Bug'] = project_bug_perc
label_percentage['Improvement'] = project_Improvement_perc
label_percentage['New Feature'] = project_NewFeature_perc
label_percentage

Unnamed: 0,Project,Bug,Improvement,New Feature
0,"""activemq""",5.771656,3.820730,5.363781
1,"""ant-ivy""",1.102705,1.160762,1.582581
2,"""archiva""",1.338234,1.076150,1.465746
3,"""bigtop""",2.043632,1.737176,1.784387
4,"""calcite""",2.082887,1.057641,0.839087
...,...,...,...,...
72,"""wss4j""",0.397307,0.520888,0.254912
73,"""xerces2-j""",1.755763,0.370175,0.616038
74,"""xmlgraphics-batik""",1.426260,0.251190,0.084971
75,"""zeppelin""",2.145933,2.617663,2.538502


In [None]:
# Display top 10 project with respect to each lable count per project.
top_project_list = project_stats[['Issue_Type']].sort_values(by='Issue_Type', ascending=False).head(10)
top_project_list = top_project_list.index.tolist()
top_project_stats = df[df['Project'].isin(top_project_list)].groupby(['Project', 'Issue_Type']).count()
top_project_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Title & Description
Project,Issue_Type,Unnamed: 2_level_1
"""activemq""","""Bug""",4852
"""activemq""","""Improvement""",1445
"""activemq""","""New Feature""",505
"""activemq""","""None""",95
"""activemq""","""Sub-task""",68
...,...,...
"""struts""","""New Feature""",344
"""struts""","""None""",57
"""struts""","""Sub-task""",83
"""struts""","""Task""",312


In [None]:
# Top 10 project with highies entry count.
project_stats[['Issue_Type']].sort_values(by='Issue_Type', ascending=False).head(10).value_counts

<bound method DataFrame.value_counts of               Issue_Type
Project                 
"activemq"          7324
"derby"             7027
"kafka"             7007
"maven"             6635
"nifi"              5813
"phoenix"           5603
"pig"               5491
"struts"            4931
"jackrabbit"        4617
"pdfbox"            4420>

In [None]:
# Issue_Type_verified stats.
df['Issue_Type_Verified'].value_counts()

In [None]:
# Concat Descriptions and titles
df['Issue'] = df['Issue_Title'] + df['Issue_Description']

In [None]:
# Rmove irrelevant types
df.loc[df.Issue_Type == '"Bug"', "Issue_Type"] = 'Bug'
df.loc[df.Issue_Type == '"Improvement"', "Issue_Type"] = 'Improvement'
df.loc[df.Issue_Type == '"Dependency upgrade"', "Issue_Type"] = 'New Feature'
df.loc[df.Issue_Type == '"New Feature"', "Issue_Type"] = 'New Feature'
types_to_keep = ['Bug', 'Improvement', 'New Feature']
df = df[df['Issue_Type'].isin(types_to_keep)]
df.Issue_Type.value_counts()

Bug            84066
Improvement    37820
New Feature     9522
Name: Issue_Type, dtype: int64

In [None]:
# Print stop words
print(stopwords.words("english"))
print("\nNumber of English stopwords in corpus: " + str(len(stopwords.words("english"))))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
# Create the TF-IDF vectorizer
descriptions = df.Issue.values
vectorizer = TfidfVectorizer(
                              lowercase = True,
                              max_features = 100,
                              max_df = 0.8, # words that occur more than 80% will be ignored.
                              min_df = 5, # if a word doesn't occure at lease 5 times it will be ignored.
                              ngram_range = (1,3), # look for words that occur in one gram to 3 grams.
                              stop_words = "english" #removes English stopwords, thus prevents their interference with the TF-IDF calculations
                            )

vectors = vectorizer.fit_transform(descriptions)

feature_names = vectorizer.get_feature_names()

dense = vectors.todense()
denselist= dense.tolist()

all_keywords = []

for description in denselist:
  x = 0
  keywords = []
  for word in description:
    if word > 0:
      keywords.append(feature_names[x])
    x += 1
  all_keywords.append(keywords)

print(all_keywords[2])

In [None]:
# Extract each TF-IDF word for each discription and assign a type to it.
types_list = df.Issue_Type.values
key_words = []
key_words_lables = []

labled_keywords = []
for i, keywords_list in enumerate(all_keywords):
  for word in keywords_list:
    key_words.append(word)
    key_words_lables.append(types_list[i])
    word_dict = {word:types_list[i]}
    labled_keywords.append(word_dict)

df_labled_keywords = pd.DataFrame()
df_labled_keywords["Key_Word"] = key_words
df_labled_keywords["type"] = key_words_lables
df_labled_keywords.sample(100)

Unnamed: 0,Key_Word,type
624572,use,"""Bug"""
544238,worker,"""Bug"""
214838,using,"""New Feature"""
147377,java,"""Bug"""
346734,java lang,"""Bug"""
...,...,...
208218,job,"""Bug"""
328871,thread,"""Bug"""
139008,org apache,"""Bug"""
289413,error,"""Bug"""


In [None]:
print('Original: ', descriptions[2].split())
print('TF-IDF: ', all_keywords[2])

Original:  ['"Number', 'Base', 'Conversion""I', 'think', 'a', 'maths', 'package', 'without', 'a', 'base', 'conversion', 'utility', 'is', 'quite', 'incomplete.', 'Would', 'request', 'you', 'to', 'include', 'this', 'feature', 'in', 'the', 'package', 'for', 'the', 'next', 'release.', 'From', 'a', "user's", 'perspective', 'I', 'would', 'like', 'to', 'have', 'a', 'library', 'that', 'goes', 'beyond', 'the', 'usual', 'binary,octal,hexadecimal', 'conversions.', 'Would', 'really', 'be', 'helpful', 'if', 'the', 'library', 'is', 'very', 'generic', 'so', 'as', 'to', 'support', 'convert(Base', 'from,Base', 'to)', 'calls.', 'Pls', 'let', 'me', 'know', 'what', 'you', 'feel', 'about', 'this.Currently', 'i', 'am', 'on', 'the', 'lookout', 'for', 'a', 'base', 'conversion', 'class', 'etc', 'but', "haven't", 'managed', 'to', 'hit', 'upon', 'anything', 'that', 'serves', 'my', 'purpose.', '"']
TF-IDF:  ['class', 'support', 'user']


In [None]:
# Remove stop words, punctuation, digits, tab spaces, and double spaces.
def remove_stops(text, stops):
  words = text.split()
  final= []
  for word in words:
    if word not in stops:
      final.append(word)
  final = " ".join(final)
  final = final.translate(str.maketrans("", "", string.punctuation))
  final = "".join([i for i in final if not i.isdigit()])
  while "\t" in final:
    final = final.replace("\t", " ")
  while "  " in final:
    final = final.replace("  ", " ")
  return (final)

In [None]:
def clean_doc(docs):
  # stop_words = list(stopwords.words("english")) + list(tfidf_list) # This is commented after deciding not to remove TF-IDF words.
  final = []
  for doc in docs:
    clean_doc = remove_stops(doc, stop_words)
    final.append(clean_doc)
  return (final)

In [None]:
# show new list of stop words.
# Clean the data
# tfidf_list =  df_labled_keywords["Key_Word"].unique() # This is commented after deciding not to remove TF-IDF words.
issues = df.Issue.values
stop_words = list(stopwords.words("english")) #+ list(tfidf_list)
print(stop_words)
issues = clean_doc(issues)
df['Issue'] = issues

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
from transformers import BertTokenizer
#Load the BERT tokenizer
print('Loading BERT tokneizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Loading BERT tokneizer...


In [None]:
# Tokenize all of the titles+description and map the tokens to thier word IDs.
input_ids = []
attention_masks= []
df_preprocessed_ds = pickle.load(open('/content/df_preprocessed_dataset_2.pkl', 'rb'))
descriptions = df_preprocessed_ds.Cleaned_Data.values
# For every title...
for entry in descriptions:

    encoded_dict = tokenizer.encode_plus(
        entry,                         
        add_special_tokens = True,                             
        pad_to_max_length = True,
        return_attention_mask = True,
        return_tensors = 'pt'          
    )
    
    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the list into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

In [None]:
input_ids_list = []
for input_id in input_ids:
  input_ids_list.append(input_id[0].tolist())

In [None]:
# Calculate the length of each tokenized entry.
desc_stats_df = pd.DataFrame()

# desc_list = desc_df.Issue_Description.values
desc_list = input_ids_list
total_desc = len(desc_list)
desc_stats_df['Total_Descriptions'] = [total_desc]

max_length = 0
min_length = len(desc_list[0])
total_desc_length = 0
avg_length = 0
median = 0
desc_lengths_list = []
gt_512 = 0
gt_1000 = 0
gt_3000 = 0
gt_5000 = 0
gt_10000 = 0
gt_50000 = 0
gt_100000 = 0

for desc in desc_list:

  desc_length = len(desc)

  desc_lengths_list.append(desc_length)

  max_length = max(max_length, desc_length)

  if desc_length != 0:
    min_length = min(min_length, desc_length)

  total_desc_length += desc_length

  if desc_length > 512:
    gt_512 += 1

  if desc_length > 1000:
    gt_1000 += 1

  if desc_length > 3000:
    gt_3000 += 1

  if desc_length > 5000:
    gt_5000 += 1

  if desc_length > 10000:
    gt_10000 += 1

  if desc_length > 50000:
    gt_50000 += 1

  if desc_length > 100000:
    gt_100000 += 1
  


avg_length = total_desc_length /  total_desc

desc_lengths_list = np.array(desc_lengths_list)
sorted_desc_lengths_list = np.sort(desc_lengths_list)
middle = float(len(sorted_desc_lengths_list)/2)

if middle % 2 == 0:
  median =  sorted_desc_lengths_list[int(middle - .5)]
  
else:
  median = sorted_desc_lengths_list[int(middle)], sorted_desc_lengths_list[int(middle-1)]

desc_stats_df['Max_Length'] = [max_length]
desc_stats_df['Min_Length'] = [min_length]
desc_stats_df['Average_Length'] = [round(avg_length, 2)]
desc_stats_df['Median_Length'] = [median]
desc_stats_df['gt_512'] = [gt_512] 
desc_stats_df['gt_1000'] = [gt_1000]
desc_stats_df['gt_3000'] = [gt_3000]
desc_stats_df['gt_5000'] = [gt_5000]
desc_stats_df['gt_10k'] = [gt_10000]
desc_stats_df['gt_50k'] = [gt_50000]
desc_stats_df['gt_100k'] = [gt_100000]

output_str = f"""Total Descriptions: {total_desc}
Max Length: {max_length}
Min Length: {min_length}
avg length: {round(avg_length, 2)}
Median length(s): {median}
>512 : {gt_512}
>1000 : {gt_1000}
>3000 : {gt_3000}
>5000 : {gt_5000}
>10000 : {gt_10000}
>50000 : {gt_50000}
>100000 : {gt_100000}"""

print(output_str)
desc_stats_df.to_excel('/content/tokenized_descriptions_stats.xlsx')

Total Descriptions: 42514
Max Length: 68945
Min Length: 3
avg length: 77.09
Median length(s): (46, 46)
>512 : 407
>1000 : 152
>3000 : 24
>5000 : 12
>10000 : 2
>50000 : 1
>100000 : 0


In [None]:
# Create a new featrue called Clean_Issue in the dataframe
df["Clean_Issue"] = descriptions
entries = df.Cleaned_Data.values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Load encoder model.
# About the model used: https://huggingface.co/microsoft/mpnet-base
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)
model.cuda()

In [None]:
# Encode (vectorize) all issues.
issues = df.Issue.values
sentence_vecs = model.encode(issues)
sentence_vecs

In [None]:
# Find cosine similaries.
def find_similar_issues(data):
  similar_issues = []
  for i, dec in enumerate(data):
    try:
      if i < len(data) - 1:
        dec_similarities = cosine_similarity( [sentence_vecs[i]], sentence_vecs[i+1:] )

        for j, similar_dec_rate in enumerate(dec_similarities[0]):
          if similar_dec_rate >= 0.6: #try 0.7
            similar_issues.append({"similar_issue_indexes": (i, i+j+1), "similarity_rate": similar_dec_rate})
            
            if len(similar_issues) % 5000 == 0 and not len(similar_issues) == 0:
              print(f'Number of processed entries: {i}' )
              print(f'Number of similar entries: {len(similar_issues)}')
         
    except: 
      print(f'Could not resolve entry at : {i}')

  return similar_issues

In [None]:
similar_issues = find_similar_issues(issues)
# Save similar_issues.
with open ("/content/similar_issues_01.pkl", "wb") as f:
  pickle.dump(similar_issues, f)


In [None]:
# Display stats of similar issues.
lt_70 = 0
lt_80 = 0
lt_90 = 0
gt_90 = 0

for entry in similar_issues:

  similarity_rate = entry['similarity_rate']

  if similarity_rate < 0.7:
    lt_70 += 1
  elif similarity_rate < 0.8:
    lt_80 += 1
  elif similarity_rate < 0.9:
    lt_90 += 1
  else:
    gt_90 += 1

print(f'<70: {lt_70}')
print(f'<80:  {lt_80}')
print(f'<90:  {lt_90}')
print(f'>=90: {gt_90}')
print('over all: ', lt_80+lt_90+gt_90+lt_70)


<70: 4319716
<80:  417860
<90:  20328
>=90: 3559
over all:  4761463


In [None]:
# Extract similart issues that are 70% similar.
similar_issues_indexes = []

for entry in similar_issues:
  index = entry['similar_issue_indexes'][1]
  if entry['similarity_rate'] >= 0.7 and not index in similar_issues_indexes:
    similar_issues_indexes.append(index)

df.iloc[similar_issues_indexes,[1,2]]

Unnamed: 0,Issue_Type,Issue
1071,Bug,QRDecomposition detect matrix singularityQRDec...
4,Bug,math Function Fraction mathfractionFractionFor...
770,Bug,Fractiondouble int constructor strange behavio...
1385,Bug,MathParseException parsing fractions AndroidIm...
62859,Bug,lang FractiontoProperString returns Test case ...
...,...,...
156289,Bug,Bean cache ignores qualifier model defined Ann...
156275,Bug,OWB ignores producer methods custom annotatedT...
156273,Bug,WebContextsService fire BeforeDestroyedRequest...
156304,Improvement,destroying Session fire BeforeDestroyedSession...


In [None]:
# Delete similar bugs.
df_smiliarities = df.iloc[similar_issues_indexes]
bugs_indexes_toBe_deleted = df_smiliarities.loc[df_smiliarities['Issue_Type'] == 'Bug']
bugs_indexes_toBe_deleted = list(bugs_indexes_toBe_deleted.index.values)
final_drop_list = bugs_indexes_toBe_deleted
df = df.drop(final_drop_list)
df.Issue_Type.value_counts()