# TAG CLASSIFICATION

In the first version of tag classification we will try to predict the tag labels of issues based only on their description. More precisely, we will use the word embeddings that have already been created to compute an arithmetic (numeric) representation of each description, and then use that representation to predict the type of issue.

Our first attempt is based on logistic regression. Logistic regression is used for binary classification, but with the one-vs-rest method we can train one logistic regression model for each label. A better version might use multinomial logistic regression.

Moreover, for the arithmetic representation we will first use the average of the word embeddings, and at a later stage we may try to improve the formula by using a weighted average based on the TF-IDF method.

## PRE PROCESSING

First, load the word embeddings, the vocabulary, and, for every issue, the corresponding tags and description.

In [79]:
import os
import json
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [80]:
def load_issues(dir_path,tag_labels,descriptions):
    """Collect the tags and description of every tagged issue under ``dir_path``.

    Every entry of ``dir_path`` is opened and parsed as a JSON file holding a
    list of issue objects. For each issue with at least one tag, the
    whitespace-stripped tag list is appended to ``tag_labels`` and the issue's
    ``'description'`` value to ``descriptions``, so both lists stay
    index-aligned. Issues without tags are skipped.

    Args:
        dir_path: Directory containing the JSON issue files.
        tag_labels: List extended in place with one list of tag strings per
            kept issue.
        descriptions: List extended in place with the matching descriptions.

    Returns:
        None; results are delivered through the two list arguments.
    """
    for fname in os.listdir(dir_path):
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(os.path.join(dir_path, fname), encoding="utf-8") as json_file:
            data = json.load(json_file)

        for issue in data:
            # Only 'tags' and 'description' are needed; other keys are optional.
            tags = [tag.strip() for tag in issue['tags']]
            if tags:
                tag_labels.append(tags)
                descriptions.append(issue['description'])
    

In [81]:
def transform_desc(descriptions,word2id):
    """Convert raw issue descriptions into lists of vocabulary ids.

    Each description is flattened to a single string, punctuation is replaced
    by spaces and the text is split on whitespace. Tokens that are one
    character long or that are English stop words (NLTK list) are dropped.
    The remaining lower-cased tokens are looked up in ``word2id``; tokens
    missing from the vocabulary are encoded as ``-2``.

    The input list is not modified.

    Args:
        descriptions: Iterable of descriptions. Each one is either a list of
            text lines or a single string.
        word2id: Mapping from lower-cased word to its integer id.

    Returns:
        A list with one list of integer ids per description, in input order.
    """
    # define stop words
    all_stopwords = set(stopwords.words('english'))

    # define translator to translate punctuation to white space
    translator = str.maketrans(string.punctuation, ' '*len(string.punctuation))

    arithmetic_descriptions = []

    for desc in descriptions:
        # A description is normally a list of lines. A bare string must not be
        # joined: ' '.join(str) would separate every character and the
        # length filter below would then discard the whole text.
        sentence = desc if isinstance(desc, str) else ' '.join(desc)

        tokens = (
            w.lower()
            for w in sentence.translate(translator).split()
            if len(w) > 1
        )

        # -2 marks a word that is not in the vocabulary
        arithmetic_descriptions.append(
            [word2id.get(token, -2) for token in tokens if token not in all_stopwords]
        )

    return arithmetic_descriptions

In [82]:
# To begin with, we compute each description's embedding by simply averaging the embeddings of all the words in
# the description

def compute_descriptions_embeddings(arithmetic_descriptions, word_embeddings_matrix):
    """Average the word embeddings of each description.

    Args:
        arithmetic_descriptions: Sequence of descriptions, each a sequence of
            integer row indices into ``word_embeddings_matrix``. The value
            ``-2`` marks an out-of-vocabulary word and is ignored.
        word_embeddings_matrix: 2-D array-like whose rows are word embeddings.

    Returns:
        A float array of shape ``(len(arithmetic_descriptions), embedding_dim)``.
        Row ``i`` is the mean embedding of the known words of description
        ``i``, or all zeros when the description has no known word.
    """
    # Use the matrix passed in by the caller, not a module-level global.
    embeddings = np.asarray(word_embeddings_matrix)
    embedding_dim = np.shape(embeddings)[1]

    descriptions_embeddings = np.zeros((len(arithmetic_descriptions), embedding_dim))

    for row, desc in enumerate(arithmetic_descriptions):
        known_ids = [word_id for word_id in desc if word_id != -2]

        # Leave the zero vector in place when nothing can be averaged.
        if known_ids:
            descriptions_embeddings[row] = embeddings[known_ids].sum(axis=0) / len(known_ids)

    return descriptions_embeddings
    

In [83]:
#load word embeddings
word_embedding_matrix = np.loadtxt('word_embeddings.txt', dtype=np.float64)

#load vocabulary: each line is parsed as "word,id"
word2id = dict()
id2word = dict()

# read through a context manager so the file handle is closed right after reading
with open("vocabulary.txt","r") as file:
    lines = file.readlines()

for line in lines:
    # tolerate blank lines (e.g. a trailing newline at the end of the file)
    if not line.strip():
        continue
    values = line.split(',')
    # int() ignores the trailing newline, so no manual stripping is needed
    word_id = int(values[1])
    word2id[values[0]] = word_id
    id2word[word_id] = values[0]
    
#load tags and descriptions
dir_path     = '/home/kostas/Documents/thesis/data_1'
tag_labels   = list()
descriptions = list()

# fills tag_labels and descriptions in place
load_issues(dir_path,tag_labels,descriptions)

# some prints for debugging purposes
#print(len(tag_labels))
#print(len(descriptions))
#for i in range(0,len(tag_labels)):
#    print("tags:",tag_labels[i])
#    print("description:",descriptions[i])
#    print("\n")

# words -> vocabulary ids -> one averaged embedding per description
arithmetic_descriptions = transform_desc(descriptions,word2id)
descriptions_embeddings = compute_descriptions_embeddings(arithmetic_descriptions,word_embedding_matrix)

In [84]:
# Labels for which a classifier is trained; also the column order of the indicator matrix.
tags = ['Bug','Google Play or Beta feedback','Feedback required','Feature Request','Unverified','Frontend Design']
# derived from the list so the two can never get out of sync
no_tags = len(tags)
# one row per description, one 0/1 column per label
np_tags = np.zeros((len(arithmetic_descriptions),no_tags))

for row, issue_tags in enumerate(tag_labels):
    for col, tag in enumerate(tags):
        if tag in issue_tags:
            np_tags[row, col] = 1

df_tags = pd.DataFrame(np_tags, columns = tags)

In [85]:
# Train one binary logistic regression model per label column of df_tags
# (one-vs-rest) and report its mean accuracy over a stratified 10-fold
# cross-validation.
# NOTE(review): for rare labels accuracy can be dominated by the majority
# class - consider also reporting precision/recall or F1.
for tag_name in df_tags.columns:
    bug_series = df_tags[tag_name]  # 0/1 target for the current label
    model      = LogisticRegression(solver='lbfgs')
    skf        = StratifiedKFold(n_splits = 10)
    scores     = cross_val_score(model,descriptions_embeddings,bug_series,scoring='accuracy',cv = skf)
    print("%s classifier,mean accuracy %.3f"%(tag_name,np.mean(scores)))

# last expression of the cell: the notebook displays the number of folds
skf.get_n_splits(descriptions_embeddings,bug_series)

#for train_index, test_index in skf.split(descriptions_embeddings,bug_series):
#    X_train, X_test = descriptions_embeddings[train_index],descriptions_embeddings[test_index],
#    y_train, y_test = bug_series[train_index], bug_series[test_index]

Bug classifier,mean accuracy 0.722
Google Play or Beta feedback classifier,mean accuracy 0.961
Feedback required classifier,mean accuracy 0.909
Feature Request classifier,mean accuracy 0.814
Unverified classifier,mean accuracy 0.883
Frontend Design classifier,mean accuracy 0.880


10

In [86]:
# Sanity check: vocabulary size, then the matrix type and its row count.
# NOTE(review): the recorded output shows 3843 vocabulary entries but 3842
# embedding rows - confirm every word id maps to a valid row.
for summary in (len(word2id), type(word_embedding_matrix), len(word_embedding_matrix)):
    print(summary)

# rows = number of embedded words, columns = embedding dimension
num_rows,num_col = word_embedding_matrix.shape
print(num_rows, num_col, sep="\n")

3843
<class 'numpy.ndarray'>
3842
3842
100


In [87]:
# Count how many tagged issues carry each tag label.
tag_dict = {}
for issue_tags in tag_labels:
    for tag in issue_tags:
        tag_dict[tag] = tag_dict.get(tag, 0) + 1

In [38]:
# Print one "tag , count" line per label, in first-seen order.
for tag, occurrences in tag_dict.items():
    print(tag,",",occurrences)

                 Bug  , 2481
                 Map: Mapsforge  , 73
                 Unverified  , 591
                 Feature Request  , 942
                 Feedback required  , 461
                 CI server / Build tools  , 135
                 Google Play or Beta feedback  , 215
                 Connector OC  , 127
                 Connector GK  , 46
                 Prio - High  , 447
                 Duplicate  , 328
                 Frontend Design  , 605
                 Won't Fix  , 184
                 Prio - Low  , 490
                 Prio - Medium  , 199
                 Assigned-non-collab  , 86
                 Refactoring  , 172
                 Send2cgeo  , 38
                 Website  , 80
                 Connector GC  , 20
                 Regression  , 168
                 Performance  , 10
                 Translation  , 77
                 Hacktoberfest  , 11
                 Map: GMapsV2  , 34
                 Add-on: Contacts  , 8
                 Field test  

In [None]:
# Issues with no tags are not used, because we do not know which class they belong to.
