In [None]:
# default_exp traceability.unsupervised.w2v

# Neural Unsupervised Approaches for SE Traceability [approach]

> This module is dedicated to evaluating word2vec/doc2vec or any other neural unsupervised approach on traceability datasets. Consider copying the entire notebook for each new and separate empirical evaluation. 
>
> Author: @danaderp April 2020

In [None]:
#TODO
# http://www.ashukumar27.io/similarity_functions/
# https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html
# https://towardsdatascience.com/importance-of-distance-metrics-in-machine-learning-modelling-e51395ffe60d
# https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html

In [None]:
# export
# Imports
import numpy as np
import gensim
import pandas as pd
from itertools import product 
from random import sample 
import functools 
import os

In [None]:
#export
from datetime import datetime
import seaborn as sns

In [None]:
#export
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
#export
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
from prg import prg
from pandas.plotting import scatter_matrix
from pandas.plotting import lag_plot
import math as m
import random as r
import collections
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#export
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim import corpora

In [None]:
#https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cosine.html
from scipy.spatial import distance
from scipy.stats import pearsonr

In [None]:
#export
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
#hide
#!pip install pyemd
#!pip install pyprg
#!pip install seaborn

# Artifacts Similarity with BasicSequenceVectorization

We test different similarities based on [blog](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html) and [blog2](https://www.kdnuggets.com/2019/01/comparison-text-distance-metrics.html)

In [None]:
from enum import Enum, unique, auto

In [None]:
@unique
class VectorizationType(Enum):
    '''Identifies the vectorization approach a parameter set refers to
    (stored under the "vectorizationType" key of the params dictionaries).'''
    # Values are spelled out; they equal what auto() assigned (1, 2, 3 in declaration order).
    word2vec = 1
    doc2vec = 2
    vsm2vec = 3

In [None]:
@unique
class LinkType(Enum):
    '''Identifies the kind of link a parameter set refers to
    (stored under the "linkType" key of the params dictionaries).'''
    # Values are spelled out; they equal what auto() assigned (1, 2 in declaration order).
    req2tc = 1
    req2src = 2

In [None]:
@unique
class DistanceMetric(Enum):
    '''Names of the distance metrics; members are used as dispatcher keys
    and as column labels of the computed links.'''
    # Values are spelled out; they equal what auto() assigned (1..5 in declaration order).
    WMD = 1
    COS = 2
    SCM = 3
    EUC = 4
    MAN = 5

In [None]:
@unique
class SimilarityMetric(Enum):
    '''Names of the similarity metrics; members are used as column labels
    of the computed links (Pearson is also used as a dispatcher key).'''
    # Values are spelled out; they equal what auto() assigned (1..6 in declaration order).
    WMD_sim = 1
    COS_sim = 2
    SCM_sim = 3
    EUC_sim = 4
    MAN_sim = 5
    Pearson = 6

In [None]:
def default_params():
    '''Return a fresh parameter dictionary for the word2vec evaluation of the libest system.

    A new dictionary is built on every call, so callers may modify the result freely.'''
    params = {}
    # What is being evaluated
    params["vectorizationType"] = VectorizationType.word2vec
    params["linkType"] = LinkType.req2tc
    params["system"] = 'libest'
    # Where the inputs live
    params["path_to_trained_model"] = 'test_data/models/word2vec_libest.model'
    params["source_path"] = 'test_data/[libest-pre-req].csv'
    params["target_path"] = 'test_data/[libest-pre-tc].csv'
    params["system_path"] = 'test_data/[libest-pre-all].csv'
    # Where results are written and how the link columns are named
    params["saving_path"] = 'test_data/'
    params["names"] = ['Source','Target','Linked?']
    return params

### Defining BasicSequenceVectorization

In [None]:
parameters = default_params()
parameters

{'vectorizationType': <VectorizationType.word2vec: 1>,
 'linkType': <LinkType.req2tc: 1>,
 'system': 'libest',
 'path_to_trained_model': 'test_data/models/word2vec_libest.model',
 'source_path': 'test_data/[libest-pre-req].csv',
 'target_path': 'test_data/[libest-pre-tc].csv',
 'system_path': 'test_data/[libest-pre-all].csv',
 'saving_path': 'test_data/',
 'names': ['Source', 'Target', 'Linked?']}

In [None]:
#tst
# Materialize every enum; the reprs asserted below pin member order, names and values.
w = [member for member in VectorizationType]
x = [member for member in DistanceMetric]
y = [member for member in SimilarityMetric]
z = [member for member in LinkType]
assert str(w) == '[<VectorizationType.word2vec: 1>, <VectorizationType.doc2vec: 2>, <VectorizationType.vsm2vec: 3>]'
assert str(x) == '[<DistanceMetric.WMD: 1>, <DistanceMetric.COS: 2>, <DistanceMetric.SCM: 3>, <DistanceMetric.EUC: 4>, <DistanceMetric.MAN: 5>]'
assert str(y) == '[<SimilarityMetric.WMD_sim: 1>, <SimilarityMetric.COS_sim: 2>, <SimilarityMetric.SCM_sim: 3>, <SimilarityMetric.EUC_sim: 4>, <SimilarityMetric.MAN_sim: 5>, <SimilarityMetric.Pearson: 6>]'
assert str(z) == '[<LinkType.req2tc: 1>, <LinkType.req2src: 2>]'

In [None]:
#export
class BasicSequenceVectorization():
    '''Base class for sequence vectorization; concrete approaches inherit from it.

    Subclasses are expected to provide `distance(metric_list, link)`, which
    `computeDistanceMetric` calls once per link.'''
    def __init__(self, params):
        '''Load the source, target and whole-system corpora and build the gensim dictionary.
        @params dictionary with the keys 'source_path', 'target_path' and 'system_path' (read here);
        'names', 'saving_path', 'system', 'vectorizationType' and 'linkType' are read by other methods'''
                
        self.df_source = pd.read_csv(params['source_path'], names=['ids', 'text'], header=None, sep=' ')
        self.df_target = pd.read_csv(params['target_path'], names=['ids', 'text'], header=None, sep=' ')
        self.df_all_system = pd.read_csv(params['system_path'], names=['ids', 'text'], 
                                         header=0, index_col=0, sep=',')
        self.params = params
        self.df_nonground_link = None #Filled by ComputeDistanceArtifacts
        self.df_ground_link = None #Filled by MatchWithGroundTruth
        
        self.documents = [doc.split() for doc in self.df_all_system['text'].values] #Preparing Corpus
        self.dictionary = corpora.Dictionary( self.documents ) #Preparing Dictionary
        
        
        #Column labels produced by each metric, in the order the metric functions return their values
        #This can be extended for future metrics <---------------------
        self.dict_labels = {
            DistanceMetric.COS:[DistanceMetric.COS, SimilarityMetric.COS_sim],
            SimilarityMetric.Pearson:[SimilarityMetric.Pearson],
            DistanceMetric.EUC:[DistanceMetric.EUC, SimilarityMetric.EUC_sim],
            DistanceMetric.WMD:[DistanceMetric.WMD, SimilarityMetric.WMD_sim],
            DistanceMetric.SCM:[DistanceMetric.SCM, SimilarityMetric.SCM_sim],
            DistanceMetric.MAN:[DistanceMetric.MAN, SimilarityMetric.MAN_sim]
        }

        
    def ground_truth_processing(self, path_to_ground_truth):
        '''Optional step when the corpus has a ground truth.
        Each non-blank line is read as `<source> <target> <target> ...` and expanded into
        (source, target) tuples. Blank lines are skipped and an empty file yields [].
        @path_to_ground_truth path of the ground-truth text file
        Raises AssertionError when the file contains the same link more than once'''
        ground_links = []
        with open(path_to_ground_truth,'r') as ground_truth: #the file is closed even if parsing fails
            for line in ground_truth:
                tokens = line.split()
                if not tokens: #blank line
                    continue
                source, targets = tokens[0], tokens[1:]
                ground_links.extend( (source, target) for target in targets )
        assert len(ground_links) ==  len(set(ground_links)) #To Verify Redundancies in the file
        return ground_links
    
    def samplingLinks(self, sampling = False, samples = 10):
        '''Build candidate links as the cartesian product of source and target file names.
        @sampling when True only a random subset of the product is returned
        @samples size of that random subset (ignored when sampling is False)'''
        source = [os.path.basename(elem) for elem in self.df_source['ids'].values ] 
        target = [os.path.basename(elem) for elem in self.df_target['ids'].values ]

        links = list( product( source , target ) )
        if sampling:
            links = sample( links, samples )

        return links
    
    def cos_scipy(self, vector_v, vector_w):
        '''Returns [cosine distance, 1 - cosine distance]'''
        cos =  distance.cosine( vector_v, vector_w )
        return [cos, 1.-cos]
    
    def euclidean_scipy(self, vector_v, vector_w):
        '''Returns [euclidean distance, 1/(1+distance)]'''
        dst = distance.euclidean(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity
    
    def manhattan_scipy(self, vector_v, vector_w):
        '''Returns [cityblock distance, 1/(1+distance)]'''
        dst = distance.cityblock(vector_v,vector_w)
        return [dst, 1./(1.+dst)] #Computing the inverse for similarity
    
    def pearson_abs_scipy(self, vector_v, vector_w):
        '''We are not sure that pearson correlation works well on doc2vec inference vectors.
        Returns a one-element list with the absolute Pearson correlation of the two vectors'''
        #Correlate the arguments themselves (not unrelated module-level variables)
        corr, _ = pearsonr(vector_v, vector_w)
        return [abs(corr)] #Absolute value of the correlation
    

    def computeDistanceMetric(self, links, metric_list):
        '''Metric List Iteration.
        @links iterable of (source, target) pairs
        @metric_list metrics to compute; every entry must be a key of self.dict_labels
        Returns (rows, labels) where each row is [source, target, value, ...] and labels
        names the value columns in the same order'''
        
        #tracking of the labels (flattened, one group of labels per metric)
        metric_labels = [ label for metric in metric_list for label in self.dict_labels[metric] ]
        #Return the link with metrics; self.distance is provided by the subclass
        distSim = [[link[0], link[1]] + self.distance( metric_list, link ) for link in links]
        
        return distSim, metric_labels
    
    def ComputeDistanceArtifacts(self, metric_list, sampling = False , samples = 10):
        '''Activates Distance and Similarity Computations and stores them in self.df_nonground_link
        @metric_list metrics to compute; must be non-empty (a "compute all metrics" shortcut
        for [] is not implemented)
        @sampling is False by the default
        @samples is the number of samples (or links) to be generated'''
        links_ = self.samplingLinks( sampling, samples )
        
        docs, metric_labels = self.computeDistanceMetric( metric_list=metric_list, links=links_) #checkpoints
        self.df_nonground_link = pd.DataFrame(docs, columns =[self.params['names'][0], self.params['names'][1]]+ metric_labels) #Transforming into a Pandas
        logging.info("Non-groundtruth links computed")
    
    
    def SaveLinks(self, grtruth=False, sep=' ', mode='a'):
        '''Write the computed links to a timestamped csv file under params['saving_path'].
        @grtruth when True saves self.df_ground_link, otherwise self.df_nonground_link
        @sep column separator of the csv
        @mode file mode handed to DataFrame.to_csv'''
        timestamp = datetime.timestamp(datetime.now())
        path_to_link = self.params['saving_path'] + '['+ self.params['system'] + '-' + str(self.params['vectorizationType']) + '-' + str(self.params['linkType']) + '-' + str(grtruth) + '-{}].csv'.format(timestamp)
        
        df_links = self.df_ground_link if grtruth else self.df_nonground_link
        df_links.to_csv(path_to_link, header=True, index=True, sep=sep, mode=mode)
        
        logging.info('Saving in...' + path_to_link)
    
    def findDistInDF(self, g_tuple):
        '''Return the index values of the rows in self.df_ground_link matching a ground-truth link.
        @g_tuple (source, target) file names; the text before the first '.' of each name is
        searched as a pattern inside the source/target columns (the source pattern gets a '-' appended)'''
        source_col, target_col = self.params['names'][0], self.params['names'][1]
        source_key = g_tuple[0][:g_tuple[0].find('.')] + '-'
        #NOTE(review): the target key has no delimiter, so it also matches longer names that
        #contain it -- confirm this is intended
        target_key = g_tuple[1][:g_tuple[1].find('.')]
        dist = self.df_ground_link[self.df_ground_link[source_col].str.contains( source_key ) 
                     & self.df_ground_link[target_col].str.contains( target_key ) ]        
        return dist.index.values
        
    def MatchWithGroundTruth(self, path_to_ground_truth ):
        '''Copy the non-groundtruth links into self.df_ground_link and add the label column
        params['names'][2]: 1 for rows matched by a ground-truth link, 0 otherwise.
        @path_to_ground_truth path of the ground-truth text file'''
        self.df_ground_link = self.df_nonground_link.copy()
        
        matchGT = [ self.findDistInDF( g ) for g in self.ground_truth_processing(path_to_ground_truth)]
        #An empty ground truth matches nothing; duplicates are dropped because several
        #ground-truth links may hit the same row and the update below needs a unique index
        matchGT = np.unique(np.concatenate(matchGT)) if matchGT else np.array([], dtype=int)
        
        self.df_ground_link[self.params['names'][2]] = 0
        new_column = pd.Series(np.full([len(matchGT)], 1 ), name=self.params['names'][2], index = matchGT)
        self.df_ground_link.update(new_column)
        logging.info("Groundtruth links computed")

### Testing BasicSequenceVectorization

In [None]:
general2vec =  BasicSequenceVectorization(params = parameters)

2020-11-01 13:17:34,072 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-11-01 13:17:34,140 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)


# Artifacts Similarity with Word2Vec

In [None]:
#export
class Word2VecSeqVect(BasicSequenceVectorization):       
    '''Sequence vectorization backed by a trained gensim Word2Vec model.'''
    
    def __init__(self, params):
        '''Load the corpora (see the base class) and the model stored at
        params['path_to_trained_model'], then prepare the term similarity matrix.'''
        super().__init__(params)
        self.new_model = gensim.models.Word2Vec.load( params['path_to_trained_model'] )
        self.new_model.init_sims(replace=True)  # Normalizes the vectors in the word2vec class.
        #Computes cosine similarities between word embeddings and retrieves the closest 
        #word embeddings by cosine similarity for a given word embedding.
        self.similarity_index = WordEmbeddingSimilarityIndex(self.new_model.wv)
        #Build a term similarity matrix and compute the Soft Cosine Measure.
        self.similarity_matrix = SparseTermSimilarityMatrix(self.similarity_index, self.dictionary)
        
        #NOTE(review): `distance` hands token lists to every entry below, while the COS and
        #Pearson helpers forward their arguments to scipy vector functions -- confirm they are usable here
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.WMD: self.wmd_gensim,
            DistanceMetric.SCM: self.scm_gensim
        }
    
    def wmd_gensim(self, sentence_a, sentence_b ):
        '''Returns [word mover's distance, associated similarity] for two token lists'''
        wmd = self.new_model.wv.wmdistance(sentence_a, sentence_b)
        return [wmd, self.wmd_similarity(wmd)]
    
    def wmd_similarity(self, dist):
        '''Maps a distance to 1/(1+distance)'''
        return 1./( 1.+float( dist ) ) #Associated Similarity
    
    def scm_gensim(self, sentence_a, sentence_b ):
        '''Compute SoftCosine Similarity of Gensim.
        Returns [1 - similarity, similarity] for two token lists'''
        #Convert the sentences into bag-of-words vectors.
        sentence_1 = self.dictionary.doc2bow(sentence_a)
        sentence_2 = self.dictionary.doc2bow(sentence_b)
        
        #Return the inner product(s) between real vectors / corpora vec1 and vec2 expressed in a non-orthogonal normalized basis,
        #where the dot product between the basis vectors is given by the sparse term similarity matrix.
        scm_similarity = self.similarity_matrix.inner_product(sentence_1, sentence_2, normalized=True)
        return [1-scm_similarity, scm_similarity]
    
    def distance(self, metric_list,link):
        '''Iterate on the metrics.
        @metric_list metrics to compute; every entry must be a key of self.dict_distance_dispatcher
        @link (source id, target id) pair; the first artifact whose 'ids' contains each id is used
        Returns the flat list of values produced by the metrics, in metric_list order'''
        #Computation of sentences can be moved directly to wmd_gensim method if we cannot generalize it for 
        #the remaining metrics
        #regex=False: ids are file names, so '.' (or any other regex metacharacter) must be matched literally
        source_rows = self.df_source[self.df_source['ids'].str.contains(link[0], regex=False)]
        target_rows = self.df_target[self.df_target['ids'].str.contains(link[1], regex=False)]
        sentence_a = source_rows['text'].values[0].split()
        sentence_b = target_rows['text'].values[0].split()
        
        dist = [ self.dict_distance_dispatcher[metric](sentence_a,sentence_b) for metric in metric_list]
        logging.info("Computed distances or similarities "+ str(link) + str(dist))    
        return [ value for metric_values in dist for value in metric_values ] #Always return a list


In [None]:
#export
def LoadLinks(timestamp, params, grtruth=False, sep=' ' ):
    '''Read back a link computation that was saved at a given timestamp.
    @timestamp is the version of the model for a given system
    @params dictionary providing 'saving_path', 'system', 'vectorizationType' and 'linkType'
    @grtruth whether the saved file holds ground-truth links (part of the file name)
    @sep column separator used when the file was written
    Returns the links as a pandas DataFrame indexed by the first csv column'''
    
    #File name pattern: [<system>-<vectorizationType>-<linkType>-<grtruth>-<timestamp>].csv
    name_parts = [ params['system'], str(params['vectorizationType']), str(params['linkType']), str(grtruth) ]
    path = params['saving_path'] + '[' + '-'.join(name_parts) + '-{}].csv'.format(timestamp)
    
    logging.info("Loading computed links from... "+ path)

    links = pd.read_csv(path, header=0, index_col=0, sep=sep)
    return links

### Testing Word2Vec SequenceVectorization

In [None]:
#hide
#tst
# Scratch fixtures used to try out functools.reduce-based flattening.
metric_list = ['a', 'b']
A = [[1, 3, 4], [4, 5], [1, 8, 9, 7]]
B = tuple(tuple(row) for row in A)
functools.reduce(lambda left, right: left + right, B)  # concatenates the tuples; result is discarded
dist_sim_T = [([12, 13], ['metric1', 'metric2']) for _ in range(2)]
# The lambda indexes both operands with [1], so this form only works for exactly two pairs.
separated_merged_list_a = functools.reduce(lambda first, second: first[1] + second[1], dist_sim_T)
assert separated_merged_list_a == ['metric1', 'metric2', 'metric1', 'metric2']

['metric1', 'metric2', 'metric1', 'metric2']

In [None]:
#[step 1]Creating the Vectorization Class
word2vec = Word2VecSeqVect( params = parameters )

2020-11-01 13:20:03,524 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-11-01 13:20:03,598 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)
2020-11-01 13:20:03,599 : INFO : loading Word2Vec object from test_data/models/word2vec_libest.model


FileNotFoundError: [Errno 2] No such file or directory: 'test_data/models/word2vec_libest.model'

In [None]:
word2vec.df_source['ids'][0]

NameError: name 'word2vec' is not defined

In [None]:
idss = word2vec.df_source['ids'][0] #Selecting an ID
idss = word2vec.df_source['ids'] == idss #Search for an specific ID
list(word2vec.df_source[idss]['text'])[0].split() #Retrieving text and splitting

NameError: name 'word2vec' is not defined

In [None]:
word2vec.df_source.head()

NameError: name 'word2vec' is not defined

In [None]:
word2vec.df_target.head()

NameError: name 'word2vec' is not defined

In [None]:
links = word2vec.samplingLinks(sampling=True, samples = 2)
links

NameError: name 'word2vec' is not defined

In [None]:
print( len(links), word2vec.df_source.shape, word2vec.df_target.shape )

NameError: name 'links' is not defined

In [None]:
#[optional] computeDistanceMetric Testing [WARNING!] Time Consuming!!
computeDistanceMetric = word2vec.computeDistanceMetric(links, metric_list = [DistanceMetric.WMD,DistanceMetric.SCM])
computeDistanceMetric

2020-10-18 18:53:19,395 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 18:53:19,398 : INFO : built Dictionary(397 unique tokens: ['abl', 'absent', 'altern', 'applic', 'associ']...) from 2 documents (total 1145 corpus positions)
2020-10-18 18:53:23,494 : INFO : Computed distances or similarities ('RQ25-pre.txt', 'us896.c')[[0.47075769695671116, 0.6799216499558003], [0.7143128514289856, 0.28568715]]
2020-10-18 18:53:23,497 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 18:53:23,501 : INFO : built Dictionary(536 unique tokens: ['abl', 'accept', 'administr', 'authent', 'author']...) from 2 documents (total 2598 corpus positions)
2020-10-18 18:53:25,366 : INFO : Computed distances or similarities ('RQ35-pre.txt', 'us893.c')[[0.42247504232645433, 0.7030000318068867], [0.5571423470973969, 0.44285765]]


([['RQ25-pre.txt',
   'us896.c',
   0.47075769695671116,
   0.6799216499558003,
   0.7143128514289856,
   0.28568715],
  ['RQ35-pre.txt',
   'us893.c',
   0.42247504232645433,
   0.7030000318068867,
   0.5571423470973969,
   0.44285765]],
 [<DistanceMetric.WMD: 1>,
  <SimilarityMetric.WMD_sim: 1>,
  <DistanceMetric.SCM: 3>,
  <SimilarityMetric.SCM_sim: 3>])

In [None]:
#[step 2]NonGroundTruth Computation
word2vec.ComputeDistanceArtifacts( sampling=True, samples = 50, metric_list = [DistanceMetric.WMD,DistanceMetric.SCM] )
word2vec.df_nonground_link.head()

2020-10-18 19:12:52,630 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:12:52,635 : INFO : built Dictionary(516 unique tokens: ['attribut', 'client', 'csr', 'csrattr', 'desir']...) from 2 documents (total 2484 corpus positions)
2020-10-18 19:12:53,192 : INFO : Computed distances or similarities ('RQ48-pre.txt', 'us893.c')[[0.3546829269802515, 0.7381801158660191], [0.35216033458709717, 0.64783967]]
2020-10-18 19:12:53,202 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:12:53,206 : INFO : built Dictionary(529 unique tokens: ['authent', 'author', 'base', 'cacert', 'certif']...) from 2 documents (total 2535 corpus positions)
2020-10-18 19:12:54,593 : INFO : Computed distances or similarities ('RQ33-pre.txt', 'us893.c')[[0.38586590012929317, 0.7215705357255026], [0.5318330824375153, 0.46816692]]
2020-10-18 19:12:54,597 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:12:54,603 : INFO : built Dictionary(111

2020-10-18 19:13:36,844 : INFO : Computed distances or similarities ('RQ45-pre.txt', 'us3496.c')[[0.3832968648173863, 0.7229106242006942], [0.5843002200126648, 0.41569978]]
2020-10-18 19:13:36,848 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:13:36,853 : INFO : built Dictionary(1145 unique tokens: ['accept', 'appli', 'authent', 'author', 'bootstrap']...) from 2 documents (total 2998 corpus positions)
2020-10-18 19:13:43,508 : INFO : Computed distances or similarities ('RQ26-pre.txt', 'us1883.c')[[0.36241007482871196, 0.7339933977849689], [0.4114319086074829, 0.5885681]]
2020-10-18 19:13:43,513 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:13:43,516 : INFO : built Dictionary(432 unique tokens: ['accept', 'access', 'also', 'altern', 'assur']...) from 2 documents (total 2762 corpus positions)
2020-10-18 19:13:45,445 : INFO : Computed distances or similarities ('RQ20-pre.txt', 'us898.c')[[0.5216272201063454, 0.6571911876879482],

2020-10-18 19:17:47,993 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:17:47,997 : INFO : built Dictionary(549 unique tokens: ['addit', 'administr', 'advis', 'alreadi', 'applic']...) from 2 documents (total 2584 corpus positions)
2020-10-18 19:17:51,126 : INFO : Computed distances or similarities ('RQ16-pre.txt', 'us893.c')[[0.31895590030182386, 0.7581754627058908], [0.43098413944244385, 0.56901586]]
2020-10-18 19:17:51,136 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:17:51,153 : INFO : built Dictionary(748 unique tokens: ['0ja7', '0pf', '1122', '200', '2lmcnf']...) from 2 documents (total 2674 corpus positions)
2020-10-18 19:19:09,813 : INFO : Computed distances or similarities ('RQ57-pre.txt', 'us901.c')[[0.3431319529988065, 0.7445284863987511], [0.45869094133377075, 0.54130906]]
2020-10-18 19:19:09,821 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:19:09,825 : INFO : built Dictionary(429 uniq

Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim
0,RQ48-pre.txt,us893.c,0.354683,0.73818,0.35216,0.64784
1,RQ33-pre.txt,us893.c,0.385866,0.721571,0.531833,0.468167
2,RQ48-pre.txt,us2174.c,0.344219,0.743927,0.384508,0.615492
3,RQ4-pre.txt,us896.c,0.509115,0.66264,0.756483,0.243517
4,RQ52-pre.txt,us897.c,0.452641,0.688401,0.761394,0.238606


In [None]:
#[step 3]Saving Non-GroundTruth Links
word2vec.SaveLinks()

2020-10-18 19:23:20,297 : INFO : Saving in...test_data/[libest-VectorizationType.word2vec-LinkType.req2tc-False-1603063400.288883].csv


In [None]:
#Loading Non-GroundTruth Links (change the timestamp with the assigned in the previous step)
df_nonglinks = LoadLinks(timestamp=1603063400.288883, params=parameters)
df_nonglinks.head()

2020-10-18 19:23:33,487 : INFO : Loading computed links from... test_data/[libest-VectorizationType.word2vec-LinkType.req2tc-False-1603063400.288883].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim
0,RQ48-pre.txt,us893.c,0.354683,0.73818,0.35216,0.64784
1,RQ33-pre.txt,us893.c,0.385866,0.721571,0.531833,0.468167
2,RQ48-pre.txt,us2174.c,0.344219,0.743927,0.384508,0.615492
3,RQ4-pre.txt,us896.c,0.509115,0.66264,0.756483,0.243517
4,RQ52-pre.txt,us897.c,0.452641,0.688401,0.761394,0.238606


In [None]:
#[step 4]GroundTruthMatching Testing
path_to_ground_truth = 'test_data/[libest-ground-req-to-tc].txt'
word2vec.MatchWithGroundTruth(path_to_ground_truth)
word2vec.df_ground_link

2020-10-18 19:25:25,721 : INFO : NumExpr defaulting to 4 threads.
2020-10-18 19:25:25,744 : INFO : Groundtruth links computed


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,Linked?
0,RQ48-pre.txt,us893.c,0.354683,0.73818,0.35216,0.64784,1.0
1,RQ33-pre.txt,us893.c,0.385866,0.721571,0.531833,0.468167,1.0
2,RQ48-pre.txt,us2174.c,0.344219,0.743927,0.384508,0.615492,0.0
3,RQ4-pre.txt,us896.c,0.509115,0.66264,0.756483,0.243517,0.0
4,RQ52-pre.txt,us897.c,0.452641,0.688401,0.761394,0.238606,0.0
5,RQ18-pre.txt,us896.c,0.536422,0.650863,0.73483,0.26517,0.0
6,RQ36-pre.txt,us898.c,0.448945,0.690157,0.645753,0.354247,1.0
7,RQ15-pre.txt,us1005.c,0.390783,0.719019,0.482108,0.517892,0.0
8,RQ37-pre.txt,us893.c,0.344751,0.743632,0.514701,0.485299,1.0
9,RQ27-pre.txt,us3612.c,0.473896,0.678474,0.698716,0.301284,0.0


In [None]:
#[optional]GroundTruth Direct Processing
ground_links = word2vec.ground_truth_processing(path_to_ground_truth)
ground_links[141] # A tuple

('RQ33.txt', 'us894.c')

In [None]:
#Inspecting Source
ground_links[141][0][:ground_links[141][0].find('.')] + '-'

'RQ33-'

In [None]:
#Inspecting Target
ground_links[141][1][:ground_links[141][1].find('.')]

'us894'

In [None]:
#[step 5]Saving GroundTruth Links
word2vec.SaveLinks(grtruth = True)

2020-10-18 19:25:33,828 : INFO : Saving in...test_data/[libest-VectorizationType.word2vec-LinkType.req2tc-True-1603063533.824472].csv


In [None]:
#Loading GroundTruth Links (replace the timestamp with the one assigned in the previous step)
df_glinks = LoadLinks(timestamp=1603063533.824472, params=parameters,grtruth = True)
df_glinks.head()

2020-10-18 19:26:09,535 : INFO : Loading computed links from... test_data/[libest-VectorizationType.word2vec-LinkType.req2tc-True-1603063533.824472].csv


Unnamed: 0,Source,Target,DistanceMetric.WMD,SimilarityMetric.WMD_sim,DistanceMetric.SCM,SimilarityMetric.SCM_sim,Linked?
0,RQ48-pre.txt,us893.c,0.354683,0.73818,0.35216,0.64784,1.0
1,RQ33-pre.txt,us893.c,0.385866,0.721571,0.531833,0.468167,1.0
2,RQ48-pre.txt,us2174.c,0.344219,0.743927,0.384508,0.615492,0.0
3,RQ4-pre.txt,us896.c,0.509115,0.66264,0.756483,0.243517,0.0
4,RQ52-pre.txt,us897.c,0.452641,0.688401,0.761394,0.238606,0.0


# Software Traceability with Artifacts Representation 
We are employing two techniques for analyzing software artifacts without groundtruth:
- Prototypes and Criticisms for Paragraph Vectors 
- Information Theory for Software Traceability (Shared Information and Mutual Information)

# Approach Evaluation and Interpretation (word2vec)
Classification/evaluation metrics for highly imbalanced data [(see Forum)](https://stats.stackexchange.com/questions/222558/classification-evaluation-metrics-for-highly-imbalanced-data).

In [None]:
#export
class VectorEvaluation:
    '''Common base for the evaluations and interpretations (statistical analysis) of the approaches.'''

    def __init__(self, sequenceVectorization):
        '''@sequenceVectorization the vectorization object under evaluation; kept as self.seqVect'''
        self.seqVect = sequenceVectorization

In [None]:
#export
class SupervisedVectorEvaluation(VectorEvaluation):
    '''Evaluation of one computed similarity against the ground-truth links.

    The IPython magic `%matplotlib inline` is intentionally not used inside the methods:
    this cell is exported, and a magic inside a method body is not valid Python outside IPython.'''
    def __init__(self, sequenceVectorization, similarity):
        '''@sequenceVectorization vectorization object whose df_ground_link is already computed
        @similarity column of df_ground_link used as the score (e.g. a SimilarityMetric member)'''
        super().__init__(sequenceVectorization)
        #The label column is named by the parameters (it is created under that name by MatchWithGroundTruth)
        linked_column = sequenceVectorization.params['names'][2]
        self.y_test = sequenceVectorization.df_ground_link[linked_column].values
        self.y_score = sequenceVectorization.df_ground_link[similarity].values
        self.label = str(sequenceVectorization.params['vectorizationType'])+'-'+str(similarity)
    
    def Compute_precision_recall_gain(self):
        '''One might choose PRG if there is little interest in identifying false negatives.
        Plots the precision-recall-gain curve and logs its area'''
        prg_curve = prg.create_prg_curve(self.y_test, self.y_score)
        auprg = prg.calc_auprg(prg_curve)
        prg.plot_prg(prg_curve)
        logging.info('auprg:  %.3f' %  auprg)
        logging.info("compute_precision_recall_gain Complete")
    
    def Compute_avg_precision(self):
        '''Generated precision-recall curve.
        Plots it against the no-skill baseline and logs average precision and the curve AUC'''
        # calculate the no skill line as the proportion of the positive class
        no_skill = len(self.y_test[self.y_test==1]) / len(self.y_test)
        plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill') #reference curve
        precision, recall, _ = precision_recall_curve(self.y_test, self.y_score) #compute precision-recall curve
        plt.plot(recall, precision, marker='.', label = self.label) #plot model curve
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.legend() #show the legend
        plt.show() #show the plot

        average_precision = average_precision_score(self.y_test, self.y_score)
        auc_score = auc(recall, precision)
        logging.info('Average precision-recall score: {0:0.2f}'.format(average_precision))
        logging.info('Precision-Recall AUC: %.3f' % auc_score)
    
    def Compute_roc_curve(self):
        '''Plots the ROC curve against the no-skill diagonal and logs the ROC AUC'''
        plt.plot([0, 1], [0, 1], linestyle='--', label='No Skill') #reference curve
        fpr, tpr, _ = roc_curve(self.y_test, self.y_score) #compute roc curve
        plt.plot(fpr, tpr, marker='.', label=self.label) #plot model curve
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend() #show the legend
        plt.show() #show the plot

        roc_auc = roc_auc_score(self.y_test, self.y_score)
        logging.info('ROC AUC %.3f' % roc_auc)

## SupervisedVectorEvaluation test

In [None]:
supevisedEval = SupervisedVectorEvaluation(word2vec, similarity=SimilarityMetric.SCM_sim)

In [None]:
supevisedEval.y_test

array([1., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1.])

In [None]:
y = supevisedEval.y_score

NameError: name 'supevisedEval' is not defined

In [None]:
#tst
for i in y:
    assert(i <= 1.0 and i >= 0.0)

SyntaxError: unexpected EOF while parsing (<ipython-input-43-b14ac58c049b>, line 2)

## Confusion Matrix

In [None]:
SIMILARITY_THRESHOLD = 0.8 #Hardcoded threshold: scores at or below it are classified as not linked (0)
y_score_threshold = [0 if elem <= SIMILARITY_THRESHOLD else 1 for elem in supevisedEval.y_score]

In [None]:
#TODO a Variation threshold analysis
tn, fp, fn, tp = confusion_matrix(supevisedEval.y_test, y_score_threshold).ravel()

In [None]:
(tn, fp, fn, tp)

(31, 0, 19, 0)

# Artifacts Similarity with Doc2Vec

Try to reproduce the empirical evaluation presented in this paper: [link](https://arxiv.org/pdf/1507.07998.pdf). Pay attention to:
- Accuracy vs. Dimensionality (we can replace accuracy for false positive rate or true positive rate)
- Visualize paragraph vectors using t-sne
- Computing Cosine Distance and Similarity. More about similarity [link](https://www.kdnuggets.com/2017/08/comparing-distance-measurements-python-scipy.html)

In [None]:
def doc2vec_params():
    '''Return a fresh parameter dictionary for the doc2vec evaluation of the libest system.

    A new dictionary is built on every call, so callers may modify the result freely.'''
    params = {}
    # What is being evaluated
    params["vectorizationType"] = VectorizationType.doc2vec
    params["linkType"] = LinkType.req2tc
    params["system"] = 'libest'
    # Where the inputs live
    params["path_to_trained_model"] = 'test_data/models/doc2vec_libest.model'
    params["source_path"] = 'test_data/[libest-pre-req].csv'
    params["target_path"] = 'test_data/[libest-pre-tc].csv'
    params["system_path"] = 'test_data/[libest-pre-all].csv'
    # Where results are written and how the link columns are named
    params["saving_path"] = 'test_data/'
    params["names"] = ['Source','Target','Linked?']
    return params

In [None]:
# The name `doc2vec_params` is rebound from the factory function to the dictionary it returns.
# The guard keeps the cell re-runnable: on a second run the name already holds the dictionary,
# and calling it again would raise "'dict' object is not callable".
if callable(doc2vec_params):
    doc2vec_params = doc2vec_params()
doc2vec_params

{'vectorizationType': <VectorizationType.doc2vec: 2>,
 'linkType': <LinkType.req2tc: 1>,
 'system': 'libest',
 'path_to_trained_model': 'test_data/models/doc2vec_libest.model',
 'source_path': 'test_data/[libest-pre-req].csv',
 'target_path': 'test_data/[libest-pre-tc].csv',
 'system_path': 'test_data/[libest-pre-all].csv',
 'saving_path': 'test_data/',
 'names': ['Source', 'Target', 'Linked?']}

In [None]:
#Export
class Doc2VecSeqVect(BasicSequenceVectorization):
    '''Sequence vectorization backed by a pre-trained gensim Doc2Vec model.

    Typical usage: build the object, call `InferDoc2Vec` to attach an inferred
    vector to every source/target artifact, then compute distances between
    linked artifacts through `computeDistanceMetric` / `distance`.
    '''

    def __init__(self, params):
        '''Load the trained model and register the supported metrics.

        params: configuration dict; this class reads
            `params['path_to_trained_model']`, the whole dict is also handed
            to the parent constructor.
        '''
        super().__init__(params)
        self.new_model = gensim.models.Doc2Vec.load( params['path_to_trained_model'] )
        # Precompute the L2-normalized vectors; replace=True asks gensim to keep
        # only the normalized ones.
        self.new_model.init_sims(replace=True)
        # Filled by InferDoc2Vec(); they stay None until inference has run.
        self.df_inferred_src = None
        self.df_inferred_trg = None

        # Metric identifier -> bound method that computes it. The methods are
        # not defined in this class (presumably inherited from the parent).
        self.dict_distance_dispatcher = {
            DistanceMetric.COS: self.cos_scipy,
            SimilarityMetric.Pearson: self.pearson_abs_scipy,
            DistanceMetric.EUC: self.euclidean_scipy,
            DistanceMetric.MAN: self.manhattan_scipy
        }

    def distance(self, metric_list, link):
        '''Compute every metric of `metric_list` for one (source, target) link.

        metric_list: non-empty list of keys of `dict_distance_dispatcher`
            (the final reduce has no initial value, so an empty list raises
            TypeError).
        link: pair whose first/second element is a piece of text contained in
            the `ids` of the source/target artifact.

        Returns the per-metric results concatenated with `+`.
        InferDoc2Vec() must have been called before.
        '''
        # regex=False: the link elements are plain artifact names, so they must
        # be matched literally (as a regex, the '.' of 'us903.c' would match
        # any character and names with '(' or '[' would fail to compile).
        src_mask = self.df_inferred_src['ids'].str.contains(link[0], regex=False)
        trg_mask = self.df_inferred_trg['ids'].str.contains(link[1], regex=False)
        inferred_source = list(self.df_inferred_src[src_mask]['inf-doc2vec'])
        inferred_target = list(self.df_inferred_trg[trg_mask]['inf-doc2vec'])

        dist = [ self.dict_distance_dispatcher[metric](inferred_source, inferred_target) for metric in metric_list]
        logging.info("Computed distances or similarities "+ str(link) + str(dist))
        return functools.reduce(lambda a,b : a+b, dist) #Always return a list

    def computeDistanceMetric(self, links, metric_list):
        '''Compute the requested metrics for every link.

        links: iterable of (source, target) pairs.
        metric_list: non-empty list of metric identifiers, see `distance`.

        Returns a tuple `(rows, labels)`: each row is
        `[source, target] + <metric values>`, and `labels` is the `+`
        concatenation of `self.dict_labels[metric]` for the requested metrics
        (`dict_labels` is not defined in this class, presumably inherited).
        '''
        metric_labels = [ self.dict_labels[metric] for metric in metric_list] #tracking of the labels
        # One flat row per link: the two link ends followed by the metric values.
        distSim = [[link[0], link[1]] + self.distance( metric_list, link ) for link in links]

        return distSim, functools.reduce(lambda a,b : a+b, metric_labels)

    def _infer_corpus(self, df_corpus, steps):
        '''Return a copy of `df_corpus` with an 'inf-doc2vec' column holding the
        vector inferred from the whitespace-split 'text' of every row.'''
        df_inferred = df_corpus.copy()
        df_inferred['inf-doc2vec'] = [self.new_model.infer_vector(artifact.split(),steps=steps) for artifact in df_inferred['text'].values]
        return df_inferred

    def InferDoc2Vec(self, steps=200):
        '''Activate inference on the source and target corpora.

        steps: forwarded to `infer_vector` for every artifact.

        Stores the results in `df_inferred_src` / `df_inferred_trg`, which are
        copies of `df_source` / `df_target` (attributes not set in this class,
        presumably by the parent constructor) extended with 'inf-doc2vec'.
        '''
        self.df_inferred_src = self._infer_corpus(self.df_source, steps)
        self.df_inferred_trg = self._infer_corpus(self.df_target, steps)

        logging.info("Infer Doc2Vec on Source and Target Complete")
    

### Testing Doc2Vec SequenceVectorization

In [None]:
# Build the vectorizer; the constructor loads the trained model found at
# params['path_to_trained_model'].
doc2vec = Doc2VecSeqVect(params = doc2vec_params)

2020-10-18 19:27:09,741 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-18 19:27:09,836 : INFO : built Dictionary(6957 unique tokens: ['");', '"../../', '("\\', '();', ')))']...) from 87 documents (total 88944 corpus positions)
2020-10-18 19:27:09,838 : INFO : loading Doc2Vec object from test_data/models/doc2vec_libest.model
2020-10-18 19:27:09,868 : INFO : loading vocabulary recursively from test_data/models/doc2vec_libest.model.vocabulary.* with mmap=None
2020-10-18 19:27:09,869 : INFO : loading trainables recursively from test_data/models/doc2vec_libest.model.trainables.* with mmap=None
2020-10-18 19:27:09,874 : INFO : loading wv recursively from test_data/models/doc2vec_libest.model.wv.* with mmap=None
2020-10-18 19:27:09,883 : INFO : loading docvecs recursively from test_data/models/doc2vec_libest.model.docvecs.* with mmap=None
2020-10-18 19:27:09,886 : INFO : loaded test_data/models/doc2vec_libest.model
2020-10-18 19:27:09,897 : INFO : precomputing L2-norms

In [None]:
#[step 1] Apply Doc2Vec inference: attaches an 'inf-doc2vec' vector to every
# source and target artifact (df_inferred_src / df_inferred_trg).
doc2vec.InferDoc2Vec(steps=200)

2020-10-18 19:27:23,263 : INFO : Infer Doc2Vec on Source and Target Complete


In [None]:
# Peek at the inferred vectors of the source artifacts.
doc2vec.df_inferred_src.head(2)

Unnamed: 0,ids,text,inf-doc2vec
0,test_data/LibEST_semeru_format/requirements/RQ...,requir http uri control est server must suppor...,"[-2.8317213, -2.4188962, -6.478969, 3.0023212,..."
1,test_data/LibEST_semeru_format/requirements/RQ...,requir server side key generat respons request...,"[-3.5909107, -2.1294444, -6.424426, 1.1216303,..."


In [None]:
# Peek at the inferred vectors of the target artifacts.
doc2vec.df_inferred_trg.head(2)

Unnamed: 0,ids,text,inf-doc2vec
0,test_data/LibEST_semeru_format/test/us903.c,unit test user stori server simpl enrol august...,"[-1.1435783, -0.6797689, -3.8540845, -6.446298..."
1,test_data/LibEST_semeru_format/test/us3496.c,unit test uri path segment extens support marc...,"[-7.902094, -3.171574, -2.0337508, 0.39059234,..."


In [None]:
#[step 2]NonGroundTruth Computation
doc2vec.ComputeDistanceArtifacts( sampling=True, samples = 50, metric_list = [DistanceMetric.EUC] )
doc2vec.df_nonground_link.head()

2020-10-18 19:27:35,627 : INFO : Computed distances or similarities ('RQ8-pre.txt', 'us3612.c')[[27.23440933227539, 0.035417776523374175]]
2020-10-18 19:27:35,642 : INFO : Computed distances or similarities ('RQ40-pre.txt', 'us896.c')[[22.97349739074707, 0.041712729006572274]]
2020-10-18 19:27:35,647 : INFO : Computed distances or similarities ('RQ13-pre.txt', 'us899.c')[[27.776687622070312, 0.0347503511569222]]
2020-10-18 19:27:35,653 : INFO : Computed distances or similarities ('RQ5-pre.txt', 'us3496.c')[[33.48546600341797, 0.028997723269880902]]
2020-10-18 19:27:35,661 : INFO : Computed distances or similarities ('RQ53-pre.txt', 'us2174.c')[[45.23784255981445, 0.02162730665269199]]
2020-10-18 19:27:35,669 : INFO : Computed distances or similarities ('RQ8-pre.txt', 'us4020.c')[[29.1542911529541, 0.033162775902362204]]
2020-10-18 19:27:35,673 : INFO : Computed distances or similarities ('RQ24-pre.txt', 'us897.c')[[24.291915893554688, 0.03953832537671995]]
2020-10-18 19:27:35,678 : INF

Unnamed: 0,Source,Target,DistanceMetric.EUC,SimilarityMetric.EUC_sim
0,RQ8-pre.txt,us3612.c,27.234409,0.035418
1,RQ40-pre.txt,us896.c,22.973497,0.041713
2,RQ13-pre.txt,us899.c,27.776688,0.03475
3,RQ5-pre.txt,us3496.c,33.485466,0.028998
4,RQ53-pre.txt,us2174.c,45.237843,0.021627


In [None]:
#[step 3] Save the non-ground-truth links; the log line reports the generated
# file name, whose trailing number is the timestamp needed by LoadLinks.
doc2vec.SaveLinks()

2020-10-18 19:27:38,667 : INFO : Saving in...test_data/[libest-VectorizationType.doc2vec-LinkType.req2tc-False-1603063658.662815].csv


In [None]:
# Load previously saved non-ground-truth links. The timestamp is hard-coded:
# replace it with the one logged by SaveLinks() in the previous step.
df_nonglinks_doc2vec = LoadLinks(timestamp=1603056377.906573, params=doc2vec_params)
df_nonglinks_doc2vec.head()

2020-10-18 19:27:39,742 : INFO : Loading computed links from... test_data/[libest-VectorizationType.doc2vec-LinkType.req2tc-False-1603056377.906573].csv


Unnamed: 0,Source,Target,DistanceMetric.EUC,SimilarityMetric.EUC_sim
0,RQ5-pre.txt,us3512.c,32.463028,0.029884
1,RQ21-pre.txt,us4020.c,28.574036,0.033813
2,RQ15-pre.txt,us1005.c,32.506367,0.029845
3,RQ32-pre.txt,us897.c,24.160978,0.039744
4,RQ34-pre.txt,us898.c,29.95886,0.032301


In [None]:
#[step 4] Ground-truth matching: the resulting df_ground_link carries a
# 'Linked?' column next to the computed metrics.
path_to_ground_truth = 'test_data/[libest-ground-req-to-tc].txt'
doc2vec.MatchWithGroundTruth(path_to_ground_truth)
doc2vec.df_ground_link

2020-10-18 19:27:58,664 : INFO : Groundtruth links computed


Unnamed: 0,Source,Target,DistanceMetric.EUC,SimilarityMetric.EUC_sim,Linked?
0,RQ8-pre.txt,us3612.c,27.234409,0.035418,1.0
1,RQ40-pre.txt,us896.c,22.973497,0.041713,0.0
2,RQ13-pre.txt,us899.c,27.776688,0.03475,0.0
3,RQ5-pre.txt,us3496.c,33.485466,0.028998,0.0
4,RQ53-pre.txt,us2174.c,45.237843,0.021627,0.0
5,RQ8-pre.txt,us4020.c,29.154291,0.033163,1.0
6,RQ24-pre.txt,us897.c,24.291916,0.039538,0.0
7,RQ4-pre.txt,us1159.c,33.715771,0.028805,0.0
8,RQ31-pre.txt,us896.c,21.696489,0.04406,0.0
9,RQ20-pre.txt,us1883.c,38.487007,0.025325,0.0


In [None]:
#[step 5] Save the ground-truth links (grtruth=True); the log line reports the
# generated file name and its timestamp.
doc2vec.SaveLinks(grtruth = True)

2020-10-18 19:28:00,249 : INFO : Saving in...test_data/[libest-VectorizationType.doc2vec-LinkType.req2tc-True-1603063680.24686].csv


In [None]:
# Load the saved ground-truth links (grtruth=True). The timestamp is hard-coded:
# replace it with the one logged by SaveLinks(grtruth=True) in the previous step.
df_glinks_doc2vec = LoadLinks(timestamp=1603063680.24686, params=doc2vec_params, grtruth = True)
df_glinks_doc2vec.head()

2020-10-18 19:28:08,950 : INFO : Loading computed links from... test_data/[libest-VectorizationType.doc2vec-LinkType.req2tc-True-1603063680.24686].csv


Unnamed: 0,Source,Target,DistanceMetric.EUC,SimilarityMetric.EUC_sim,Linked?
0,RQ8-pre.txt,us3612.c,27.234409,0.035418,1.0
1,RQ40-pre.txt,us896.c,22.973497,0.041713,0.0
2,RQ13-pre.txt,us899.c,27.776688,0.03475,0.0
3,RQ5-pre.txt,us3496.c,33.485466,0.028998,0.0
4,RQ53-pre.txt,us2174.c,45.237843,0.021627,0.0


# Approach Evaluation and Interpretation (doc2vec)

In [None]:
# Evaluate the doc2vec links, using the Euclidean-based similarity as the score.
supervisedEvalDoc2vec = SupervisedVectorEvaluation(doc2vec, similarity=SimilarityMetric.EUC_sim)

In [None]:
# Labels used by the evaluation (presumably the 'Linked?' ground truth -- confirm).
supervisedEvalDoc2vec.y_test

array([1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 1.])

In [None]:
# Keep the doc2vec similarity scores under a short name; the next cell range-checks them.
y = supervisedEvalDoc2vec.y_score

array([0.03541778, 0.04171273, 0.03475035, 0.02899772, 0.02162731,
       0.03316278, 0.03953833, 0.02880535, 0.04405968, 0.02532479,
       0.02793206, 0.0287591 , 0.02225518, 0.02830499, 0.0342694 ,
       0.04598991, 0.03725973, 0.0240362 , 0.02651379, 0.03617325,
       0.02766799, 0.02998513, 0.02414327, 0.04459385, 0.03041159,
       0.02937196, 0.02391515, 0.02935402, 0.03652414, 0.02726818,
       0.04557955, 0.03119887, 0.03199925, 0.02793213, 0.03477711,
       0.04541905, 0.03039143, 0.02428295, 0.02988223, 0.02558915,
       0.03378266, 0.03583102, 0.03587346, 0.02868129, 0.03004593,
       0.02516732, 0.03438064, 0.03814636, 0.03191514, 0.03391339])

In [None]:
#tst
# The doc2vec similarity scores must all fall inside [0, 1].
assert all(0.0 <= score <= 1.0 for score in y)

SyntaxError: unexpected EOF while parsing (<ipython-input-44-39b32821c5a4>, line 1)

## Combining Doc2vec and Word2vec
Please check this post for further details: [link](https://stats.stackexchange.com/questions/217614/intepreting-doc2vec-cosine-similarity-between-doc-vectors-and-word-vectors)

In [None]:
! nbdev_build_docs #<-------- [Activate when stable]