# TAG CLASSIFICATION

In the first version of tag classification we will try to predict the tag labels of issues based only on their description. More precisely, we will use the word embeddings that have already been created to compute the arithmetic (numeric) representation of each description, and based on that representation we will try to predict the type of the issue.

Our first attempt is based on logistic regression. Logistic regression is used for binary classification, but with the one-vs-rest method we can train one logistic regression model for each label. A better version might use multinomial logistic regression.

Moreover, for the arithmetic representation we will first use the average of the word embeddings, and at a later stage we may try to improve the formula by using a weighted average based on the TF-IDF method.

## PRE PROCESSING

First, load the word embeddings, the vocabulary and, for every issue, the corresponding tags and description.

In [1]:
import os
import json
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [2]:
def load_issues(dir_path,tag_labels,descriptions,names_list):
    """Collect the tags, description and name of every tagged issue in ``dir_path``.

    Every entry of ``dir_path`` is opened and parsed as a JSON document that
    iterates over issue objects with the keys ``'name'``, ``'tags'`` and
    ``'description'``. Issues whose tag list is empty are skipped; for all
    other issues one element is appended to each of the three output lists,
    so the lists stay aligned index by index.

    Parameters
    ----------
    dir_path : str
        Directory holding the JSON files.
    tag_labels : list
        Output; receives one list of whitespace-stripped tag strings per issue.
    descriptions : list
        Output; receives the raw ``'description'`` value of each issue.
    names_list : list
        Output; receives the ``'name'`` value of each issue.

    Returns
    -------
    None
        The results are delivered through the three list arguments.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any cross-validation split built on it) reproducible.
    for fname in sorted(os.listdir(dir_path)):
        # JSON text is UTF-8 by specification, so do not rely on the locale default.
        with open(os.path.join(dir_path, fname), encoding="utf-8") as json_file:
            data = json.load(json_file)

        for issue in data:
            tags = [tag.strip() for tag in issue['tags']]

            # every issue without any label is dropped
            if not tags:
                continue

            tag_labels.append(tags)
            descriptions.append(issue['description'])
            names_list.append(issue['name'])

In [3]:
def transform_desc(descriptions,word2id):
    """Convert raw issue descriptions into lists of vocabulary ids.

    For every description the elements are joined with single spaces, each
    punctuation character is replaced by a space, and the text is split on
    whitespace. Tokens of length one and English stop words are discarded;
    the remaining tokens are lower-cased and looked up in ``word2id``.

    Parameters
    ----------
    descriptions : iterable
        One entry per issue. NOTE(review): assumed to be an iterable of line
        strings - a plain ``str`` would be joined character by character and
        every token would then be dropped by the length filter; confirm
        against the JSON schema.
    word2id : dict
        Maps a lower-cased word to its integer id.

    Returns
    -------
    list of list of int
        One id list per description, in input order. Words missing from
        ``word2id`` are encoded as ``-2``.
    """
    # id stored for out-of-vocabulary tokens
    unknown_id = -2

    #define stop words
    all_stopwords = set(stopwords.words('english'))

    #define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    arithmetic_descriptions = list()

    for desc in descriptions:
        # one sentence per description, punctuation blanked out, then tokenised
        tokens = ' '.join(desc).translate(translator).split()

        arithmetic_descriptions.append([
            word2id.get(token.lower(), unknown_id)
            for token in tokens
            if len(token) > 1 and token.lower() not in all_stopwords
        ])

    # The former `del descriptions` was removed: it only unbound the local
    # name and never released the caller's list.
    return arithmetic_descriptions

In [4]:
# To begin with, we compute the description embeddings simply by averaging the embeddings of all the words in
# the description

def compute_descriptions_embeddings(arithmetic_descriptions, word_embeddings_matrix):
    """Average the word embeddings of every description.

    Parameters
    ----------
    arithmetic_descriptions : sequence of sequence of int
        One list of word ids per description. The id ``-2`` is skipped.
    word_embeddings_matrix : 2-D array-like
        Row ``i`` is the embedding of word id ``i``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(arithmetic_descriptions), embedding_dim)``.
        Row ``k`` is the mean embedding of the ids of description ``k`` that
        are not ``-2``; it stays all zeros when no such id exists.

    Notes
    -----
    The number of descriptions is printed, as a progress/debug aid.
    """
    # Use the matrix that was passed in. The previous body read the global
    # `word_embedding_matrix` (different spelling) and ignored this parameter.
    matrix = np.asarray(word_embeddings_matrix)

    embedding_dim    = np.shape(matrix)[1]
    num_descriptions = len(arithmetic_descriptions)
    print(num_descriptions)

    descriptions_embeddings = np.zeros((num_descriptions,embedding_dim))

    for row,desc in enumerate(arithmetic_descriptions):
        known_ids = [word_id for word_id in desc if word_id != -2]

        # leave the zero vector in place when nothing is left to average
        if known_ids:
            descriptions_embeddings[row] = np.sum(matrix[known_ids], axis=0) / len(known_ids)

    return descriptions_embeddings
    

In [5]:
#load word embeddings (one row of floats per line of the text file)
word_embedding_matrix = np.loadtxt('word_embeddings_good.txt', dtype=np.float64)

#load vocabulary: every line holds "word,id"
word2id = dict()
id2word = dict()

# the context manager closes the file; the handle used to be left open
with open("vocabulary_good.txt","r") as file:
    for line in file:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        # split on the LAST comma only, so a word that itself contains ','
        # does not shift the id into the wrong field
        word, word_id = line.rsplit(',', 1)
        word_id = int(word_id)  # int() ignores the trailing newline
        word2id[word] = word_id
        id2word[word_id] = word

#load tags and descriptions
dir_path     = '/home/kostas/Documents/thesis/data_1'
tag_labels   = list()
descriptions = list()
names_list   = list()

# fills the three lists in lockstep: index i of each list describes the same issue
load_issues(dir_path,tag_labels,descriptions,names_list)

# some prints for debugging purposes
#print(len(tag_labels))
#print(len(descriptions))
#for i in range(0,len(tag_labels)):
#    print("tags:",tag_labels[i])
#    print("description:",descriptions[i])
#    print("\n")

# words -> vocabulary ids -> one averaged embedding vector per description
arithmetic_descriptions = transform_desc(descriptions,word2id)
descriptions_embeddings = compute_descriptions_embeddings(arithmetic_descriptions,word_embedding_matrix)

5128


In [6]:
# Labels for which a one-vs-rest classifier is trained; also the column order of df_tags.
tags = ['Bug','Google Play or Beta feedback','Feedback required','Feature Request','Unverified','Frontend Design']
# derived from the list so that adding or removing a label cannot get out of sync
no_tags = len(tags)

# Multi-hot target matrix: np_tags[i][j] is 1 when issue i carries the label tags[j], else 0.
np_tags = np.zeros((len(arithmetic_descriptions),no_tags))

for row,issue_tags in enumerate(tag_labels):
    for col,value in enumerate(tags):
        if value in issue_tags:
            np_tags[row][col] = 1

df_tags = pd.DataFrame(np_tags, columns = tags)


In [7]:
# Baseline: train an independent logistic regression classifier for every label
# column of df_tags (one vs rest) and report its mean accuracy over a stratified
# 10-fold cross-validation. This replaces six copy-pasted stanzas that differed
# only in the label name; the printed lines are unchanged.
# NOTE(review): accuracy is dominated by the majority class for rare labels,
# so compare each score with the share of negatives for that label.
for label in df_tags.columns:
    label_series = df_tags[label]
    model        = LogisticRegression(solver='lbfgs')
    skf          = StratifiedKFold(n_splits = 10)
    scores       = cross_val_score(model,descriptions_embeddings,label_series,scoring='accuracy',cv = skf)
    print("%s classifier,mean accuracy %.3f"%(label,np.mean(scores)))

# Trailing expression kept so that the notebook still echoes the number of folds
# as the cell result; the call does not modify anything.
skf.get_n_splits(descriptions_embeddings,label_series)

#for train_index, test_index in skf.split(descriptions_embeddings,label_series):
#    X_train, X_test = descriptions_embeddings[train_index],descriptions_embeddings[test_index],
#    y_train, y_test = label_series[train_index], label_series[test_index]

Bug classifier,mean accuracy 0.734
Google Play or Beta feedback classifier,mean accuracy 0.959
Feedback required classifier,mean accuracy 0.908
Feature Request classifier,mean accuracy 0.814
Unverified classifier,mean accuracy 0.880
Frontend Design classifier,mean accuracy 0.881


10

In [9]:
def my_classifier(tags,df_tags,descriptions_embeddings,cl_label,n_splits,positive_boost=1.3):
    """Cross-validate a class-weighted logistic regression for one label.

    A single ``LogisticRegression`` is refitted on every fold of a
    ``StratifiedKFold`` split. The confusion matrix of each fold is printed
    and accumulated; at the end the summed matrix, its accuracy and the
    geometric mean of true-negative rate and true-positive rate
    ("custom metric") are printed.

    Parameters
    ----------
    tags : list
        Unused; kept so that existing calls keep working.
    df_tags : pandas.DataFrame
        One 0/1 column per label, one row per issue.
    descriptions_embeddings : numpy.ndarray
        Feature matrix whose rows are aligned with the rows of ``df_tags``.
    cl_label : str
        Column of ``df_tags`` to predict.
    n_splits : int
        Number of stratified folds.
    positive_boost : float, optional
        Extra factor applied to the weight of the positive class
        (default 1.3, the value that used to be hard-coded).

    Returns
    -------
    (list, list)
        ``wrong_indexes`` - row positions of the misclassified test samples,
        and ``wrong_labels`` - for each of them the full row of ``df_tags``
        as a list.
    """
    target_label = df_tags[cl_label]
    n_samples    = target_label.shape[0]
    counter_1    = np.sum(target_label)

    # Inverse-frequency style weights computed on the whole data set: the
    # negative class is weighted by the share of positives and vice versa.
    weight_0 = counter_1/n_samples
    weight_1 = positive_boost*(1 - weight_0)
    print(counter_1,n_samples)
    w = {0:weight_0,1:weight_1}

    skf             = StratifiedKFold(n_splits)
    model           = LogisticRegression(solver='lbfgs',class_weight = w)
    total_confusion = np.zeros((2,2))
    wrong_labels    = []
    wrong_indexes   = []

    for train_index, test_index in skf.split(descriptions_embeddings,target_label):
        X_train,X_test = descriptions_embeddings[train_index], descriptions_embeddings[test_index]
        # the split yields positions, so index by position rather than by index label
        y_train,y_test = target_label.iloc[train_index], target_label.iloc[test_index]

        #fit model
        model.fit(X_train,y_train)
        predictions = model.predict(X_test)

        # Fix the label set so the matrix is always 2x2. Without it a fold
        # that contains a single class yields a 1x1 matrix that would be
        # silently broadcast into every cell of the running total.
        fold_confusion = confusion_matrix(y_test,predictions,labels=[0.0,1.0])
        print(fold_confusion)
        total_confusion = total_confusion + fold_confusion

        #for wrong classifications find true labels
        for position in np.flatnonzero(predictions != y_test.values):
            row = test_index[position]
            wrong_indexes.append(row)
            wrong_labels.append(df_tags.iloc[row].tolist())

    # rows are true classes, columns are predicted classes
    (tn, fp), (fn, tp) = total_confusion

    print(total_confusion)
    print("accuracy = TP+TN/(TP+TN+FP+FN)",(tn+tp)/np.sum(total_confusion))
    print("custom metric",
          np.sqrt((tn/(tn+fp))*
                  (tp/(tp+fn))))

    return wrong_indexes,wrong_labels

In [10]:
# Run the class-weighted classifier for the "Google Play or Beta feedback" label
# (the earlier comment said "bugs", which did not match the call). The two
# result lists are created by the call itself, so no pre-initialisation is needed.
wrong_indexes,wrong_labels = my_classifier(tags,df_tags,descriptions_embeddings,"Google Play or Beta feedback",10)

#print all issues which were classified wrong in n-stratified folds
print(tags)
for wrong_index,true_labels in zip(wrong_indexes,wrong_labels):
    print("issue name",names_list[wrong_index])
    print("correct labels:",true_labels)

221.0 5128
[[370 121]
 [  9  13]]
[[357 134]
 [  7  15]]
[[365 126]
 [  6  16]]
[[372 119]
 [  3  19]]
[[372 119]
 [ 10  12]]
[[383 108]
 [  3  19]]
[[369 122]
 [  1  21]]
[[371 119]
 [  5  18]]
[[360 130]
 [  4  18]]
[[361 129]
 [  3  19]]
[[3680. 1227.]
 [  51.  170.]]
accuracy = TP+TN/(TP+TN+FP+FN) 0.750780031201248
custom metric 0.7595287265414397
['Bug', 'Google Play or Beta feedback', 'Feedback required', 'Feature Request', 'Unverified', 'Frontend Design']
issue name Master to Beta?
correct labels: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
issue name Some date formats still not recognized
correct labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name Next bugfix release
correct labels: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
issue name date format parsing might be broken
correct labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name Next bugfix release
correct labels: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
issue name "Make list unique" menu makes cgeo crash in nearest cache list
correct labels: [1.0, 0.0, 0.0, 0.0, 0

correct labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name ANR on first startup and when opening About c:geo
correct labels: [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]
issue name Cannot log cache [NIGHTLY]
correct labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name Update mapsforge library
correct labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name Light skin goes black when scrolling
correct labels: [1.0, 0.0, 0.0, 0.0, 0.0, 1.0]
issue name Next release
correct labels: [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
issue name Use API 17 for building and as target
correct labels: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name TB list not visible in white theme
correct labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name c:geo prevents phone from sleeping,  causing excessive battery drain.
correct labels: [1.0, 0.0, 0.0, 0.0, 1.0, 0.0]
issue name calendar plugin does not work
correct labels: [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
issue name Crash when switching between horizotal / vertical orientation
correct labels: [0.0, 0.0, 0.0, 0

In [41]:
# Sanity check: vocabulary size versus the dimensions of the embedding matrix.
# NOTE(review): the output recorded for this cell shows one more vocabulary
# entry than matrix rows - confirm that every word id is a valid row index.
print(len(word2id), type(word_embedding_matrix), len(word_embedding_matrix), sep="\n")
num_rows,num_col = word_embedding_matrix.shape
print(num_rows, num_col, sep="\n")

3892
<class 'numpy.ndarray'>
3891
3891
25


In [42]:
# Count how many issues carry each tag; keys appear in order of first occurrence.
tag_dict = {}
for labels in tag_labels:
    for label in labels:
        tag_dict[label] = tag_dict.get(label, 0) + 1

In [43]:
# One "tag , count" line per tag.
for key, count in tag_dict.items():
    print(key, ",", count)

Feedback required , 469
Map: Mapsforge , 73
Bug , 2517
Prio - High , 450
Regression , 172
Feature Request , 971
Google Play or Beta feedback , 221
Duplicate , 329
Translation , 77
Unverified , 613
Prio - Low , 496
Website , 80
Connector OC , 127
Frontend Design , 608
Prio - Medium , 200
Won't Fix , 184
Assigned-non-collab , 87
WP Calc , 28
Refactoring , 172
Add-on: Contacts , 8
Live Map , 142
Send2cgeo , 38
CI server / Build tools , 140
Connector EC , 17
Field test , 33
Good first issue , 4
Connector GC , 20
Map: GMapsV2 , 34
Do not merge / WIP , 2
Connector GK , 47
Performance , 10
Regression SDK30/SAF , 40
Connector SU , 13
Regression SDK26 , 39
Hacktoberfest , 11
Wiki , 4
Connector LC , 11
Connector ZZ , 1
Legacy , 2


In [None]:
# issues with no tags will not be used, because I don't know which class they belong to