### Generation of 2D Scatterplot

In [16]:
from sklearn.datasets import fetch_20newsgroups

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

from scipy.spatial.distance import pdist

import spacy
from pprint import pprint
import pandas as pd
import numpy as np

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use'])

In [2]:
newsgroups_train = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [3]:
print(newsgroups_train.data[0])



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [5]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

In [6]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc])
    return texts_out

In [7]:
data = newsgroups_train.data
data_words = list(sent_to_words(data))

# Remove Stop Words
print("Start removing stop words")
data_words_nostops = remove_stopwords(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# conda install -c conda-forge spacy-model-en_core_web_sm
print("Installing spacy")
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
print("Start lemmatizing words")
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

Start removing stop words
Installing spacy
Start lemmatizing words


In [8]:
print(data_lemmatized[0])

['sure', 'basher', 'pen', 'fan', 'pretty', 'confused', 'lack', 'kind', 'post', 'recent', 'pen', 'massacre', 'devil', 'actually', 'bite', 'puzzled', 'bit', 'relieve', 'however', 'go', 'put', 'end', 'non', 'pittsburgher', 'relief', 'bit', 'praise', 'pen', 'man', 'kill', 'devil', 'bad', 'thought', 'jagr', 'show', 'much', 'well', 'regular', 'season', 'stat', 'also', 'lot', 'fo', 'fun', 'watch', 'playoff', 'bowman', 'let', 'jagr', 'lot', 'fun', 'next', 'couple', 'game', 'since', 'pen', 'go', 'beat', 'pulp', 'jersey', 'anyway', 'disappoint', 'see', 'islander', 'lose', 'final', 'regular', 'season', 'game', 'pen', 'rule']


In [9]:
data_lemmatized_min_length = []

for sublist in data_lemmatized:
    # Use a list comprehension to filter out strings with less than two characters
    sublist = [word for word in sublist if len(word) > 3]
    data_lemmatized_min_length.append(sublist)

In [10]:
Y = newsgroups_train.target.tolist()

In [11]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [12]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized_min_length)

# Create Corpus
texts = data_lemmatized_min_length

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View 
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 2), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 2), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1)]]


In [13]:
from scipy.sparse import dok_matrix

# Define function to convert Gensim corpus to a sparse pandas DataFrame
def corpus_to_sparse_dataframe(corpus):
    word_freq = dok_matrix((len(corpus), len(id2word)), dtype=int)

    for i, doc in enumerate(corpus):
        for word_id, freq in doc:
            word_freq[i, word_id] = freq

    dataframe = pd.DataFrame.sparse.from_spmatrix(word_freq)
    dataframe.columns = [id2word[word_id] for word_id in range(len(id2word))]
    return dataframe

In [14]:
VSM = corpus_to_sparse_dataframe(corpus)

In [15]:
from gensim.models import TfidfModel
from gensim.models import LsiModel

model = TfidfModel(corpus)  # fit model
tfidf_corpus = model[corpus]

K = 20
tfidf_lsi_model = LsiModel(tfidf_corpus, id2word=id2word, num_topics=K)
tfidf_lsi_model.print_topics(num_topics=K, num_words=10)

[(0,
  '0.153*"would" + 0.128*"know" + 0.119*"people" + 0.117*"like" + 0.114*"think" + 0.106*"make" + 0.102*"well" + 0.101*"good" + 0.099*"window" + 0.098*"drive"'),
 (1,
  '0.254*"window" + 0.230*"drive" + 0.218*"card" + 0.203*"file" + 0.165*"thank" + 0.141*"driver" + 0.139*"disk" + -0.135*"people" + 0.132*"scsi" + 0.109*"program"'),
 (2,
  '0.505*"drive" + -0.292*"window" + -0.279*"file" + 0.271*"scsi" + 0.236*"game" + 0.140*"controller" + -0.131*"program" + 0.128*"disk" + 0.118*"hard" + 0.114*"card"'),
 (3,
  '0.549*"game" + -0.271*"drive" + 0.241*"team" + 0.167*"play" + 0.159*"player" + -0.158*"scsi" + 0.126*"hockey" + 0.116*"window" + 0.115*"baseball" + 0.109*"season"'),
 (4,
  '0.346*"file" + 0.325*"window" + 0.276*"drive" + -0.231*"thank" + -0.221*"please" + -0.193*"mail" + -0.176*"chip" + -0.176*"card" + 0.143*"disk" + 0.140*"scsi"'),
 (5,
  '0.457*"card" + -0.268*"drive" + -0.260*"thank" + -0.236*"please" + 0.216*"driver" + 0.179*"window" + 0.178*"video" + -0.171*"mail" + 0.16

In [32]:
rows = []
for doc in tfidf_corpus:
    doc_top = []
    for t in tfidf_lsi_model[doc]:
        doc_top.append(t[1])
    rows.append(doc_top)

document_topic_matrix = pd.DataFrame(rows)
document_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.096438,-0.064345,0.081752,0.187791,0.037621,-0.006134,0.007457,0.057292,-0.000687,0.014441,0.003018,0.001535,0.011821,-0.005472,0.007129,-0.004340,0.008583,0.028722,0.016516,-0.021231
1,0.121678,0.160559,0.019054,0.029774,-0.163131,0.153800,0.162392,0.086754,0.050644,-0.036742,-0.112626,-0.005560,-0.088801,0.015986,-0.030117,0.060828,-0.065504,0.079354,-0.003550,0.047949
2,0.114348,-0.088036,-0.008067,-0.033921,0.019773,0.029750,-0.024900,0.051668,0.268112,-0.011121,0.014658,0.014457,0.019403,0.043303,0.136160,-0.001862,-0.068640,-0.010558,-0.041912,-0.035772
3,0.099667,0.127528,0.147981,-0.095943,0.058095,0.016658,-0.025935,0.129511,-0.024849,0.042916,-0.037102,-0.096026,-0.203185,0.067233,0.072508,-0.063512,0.133542,0.040367,-0.111224,-0.175234
4,0.115398,0.119539,0.104279,-0.072874,0.050134,-0.046047,-0.020154,0.020553,-0.007696,0.042768,-0.003114,-0.006774,0.023941,-0.006057,-0.021021,0.006679,-0.054390,-0.002607,0.057195,0.030980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18841,0.111611,-0.040149,-0.002239,-0.007048,0.008484,0.007556,-0.009869,-0.029927,-0.010241,-0.021003,-0.027877,-0.046461,0.001771,-0.032864,0.006692,0.007275,0.000921,-0.010309,0.008605,-0.010627
18842,0.041121,0.016629,-0.014593,0.004803,0.011255,0.023225,0.002360,-0.025793,-0.004557,-0.032545,-0.000295,-0.029458,0.028135,0.022948,0.011118,0.030526,0.031692,-0.015422,0.001622,0.006162
18843,0.059817,0.022083,0.036148,-0.018361,-0.019430,0.039212,-0.039218,-0.022285,-0.021465,0.012009,-0.001599,0.014291,0.041423,-0.009175,0.044883,-0.026335,-0.010880,0.001946,-0.017933,-0.007067
18844,0.069566,-0.028832,-0.013698,-0.009136,0.014438,-0.002855,-0.007736,-0.002505,-0.028657,-0.026990,0.014333,-0.062280,-0.023114,0.026373,-0.033613,-0.013621,-0.043408,0.003408,0.023973,0.014174


In [37]:
num_topics = 20
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [38]:
rows = []
for doc in corpus:
    doc_top = []
    for t in lda_model.get_document_topics(doc, minimum_probability = 0):
        doc_top.append(t[1])
    rows.append(doc_top)

In [39]:
document_topic_matrix = pd.DataFrame(rows)
#document_topic_matrix_sourcecode["identifier"] = df_sourcecode.iloc[:,0].tolist()
document_topic_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.001273,0.003925,0.027564,0.003541,0.002531,0.070713,0.001256,0.002608,0.016298,0.016849,0.005205,0.018300,0.014934,0.002469,0.003555,0.534880,0.004282,0.232538,0.034169,0.003112
1,0.001799,0.005543,0.025120,0.005002,0.003574,0.051309,0.001773,0.003684,0.020009,0.257890,0.007352,0.025289,0.021113,0.003487,0.005021,0.048155,0.006048,0.184649,0.318788,0.004396
2,0.000760,0.011509,0.213770,0.002209,0.001510,0.120583,0.000749,0.001556,0.129346,0.022957,0.003502,0.035050,0.017883,0.001473,0.002121,0.065833,0.011707,0.324563,0.031060,0.001857
3,0.001104,0.016716,0.002096,0.003069,0.002193,0.055757,0.001088,0.002260,0.025545,0.013851,0.004512,0.029638,0.013067,0.002140,0.003081,0.047890,0.003711,0.165635,0.603948,0.002698
4,0.001505,0.004637,0.002857,0.004184,0.166455,0.053241,0.001484,0.003081,0.104342,0.064048,0.006151,0.060891,0.017756,0.002917,0.004200,0.040514,0.005059,0.260168,0.192832,0.003678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18841,0.000872,0.023726,0.001658,0.065548,0.022764,0.091994,0.000859,0.001785,0.012103,0.010722,0.014081,0.035512,0.031157,0.012189,0.002433,0.075341,0.002930,0.559376,0.032818,0.002130
18842,0.002667,0.040403,0.005065,0.007417,0.005300,0.240163,0.002630,0.005462,0.030797,0.059234,0.010903,0.037362,0.031280,0.005171,0.007446,0.093540,0.008968,0.293085,0.106587,0.006519
18843,0.002068,0.066188,0.003927,0.005751,0.004110,0.129584,0.002039,0.004235,0.022934,0.024913,0.008454,0.053652,0.024253,0.004009,0.005773,0.125619,0.006953,0.299732,0.125863,0.079942
18844,0.002235,0.006890,0.004245,0.006216,0.004442,0.137721,0.002204,0.004578,0.026082,0.027076,0.009138,0.031313,0.080185,0.004334,0.033223,0.113506,0.034463,0.396995,0.069689,0.005464


In [56]:
document_topic_matrix_short = document_topic_matrix
Y_short = Y

In [None]:
# Dimensionality reduction 1: tSNE

import time
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform, jensenshannon, cosine

#n_sne = 7000

time_start = time.time()
#tsne = TSNE(n_iter=300)
tsne = TSNE(n_components=2, n_iter = 250, perplexity=30, learning_rate = 250, metric =jensenshannon)
tsne_results = tsne.fit_transform(document_topic_matrix_short.values)

print ('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

import matplotlib.pyplot as plt
%matplotlib inline
# Create the figure
fig = plt.figure( figsize=(8,8) )
ax = fig.add_subplot(1, 1, 1, title='TSNE' )
# Create the scatter
ax.scatter(
    x=tsne_results[:,0], 
    y=tsne_results[:,1], 
    c=Y_short, 
    cmap=plt.cm.get_cmap('Paired'), 
    alpha=0.4)
plt.show()

  return np.sqrt(js / 2.0)


### Local Metrics

In [None]:
from scipy.spatial.distance import cdist

def pairwise_distance_matrix(point, distance_function="euclidean"):
	"""
	Compute the pairwise distance matrix of the point list
	You can use any distance function from scipy.spatial.distance.cdist or specify a callable function
	INPUT:
		ndarray: point: list of points
		str or callable: distance_function: distance function to use
	OUTPUT:
		ndarry: pairwise distance matrix 
	"""
	if callable(distance_function):
		distance_matrix = cdist(point, point, distance_function)
	else:
		distance_matrix = cdist(point, point, distance_function)
	return distance_matrix

In [None]:
from .pairwise_dist import pairwise_distance_matrix
import numpy as np
import faiss
from sklearn.neighbors import KDTree

def knn_with_ranking(points, k, distance_function='euclidean'):
    """
    Compute the k-nearest neighbors of the points along with the 
    rankings of other points based on the distance to each point.
    If the distance matrix is not provided, it is computed in O(n^2) time.
    INPUT:
    	ndarray: points: list of points
        int: k: number of nearest neighbors to compute
    	ndarray: distance_matrix: pairwise distance matrix (Optional)
    OUTPUT:
    	ndarray: knn_indices: k-nearest neighbors of each point 
    	ndarray: ranking: ranking of other points based on the distance to each point
    """
    distance_matrix = pairwise_distance_matrix(points, distance_function)
    
    knn_indices = np.empty((points.shape[0], k), dtype=np.int32)
    ranking = np.empty((points.shape[0], points.shape[0]), dtype=np.int32)
      
    for i in range(points.shape[0]):
        distance_to_i = distance_matrix[i]
        sorted_indices = np.argsort(distance_to_i)
        knn_indices[i] = sorted_indices[1:k+1]
        ranking[i] = np.argsort(sorted_indices)
      
    return knn_indices, ranking
  

def knn(points, k, distance_function="euclidean"):
    """
    Compute the k-nearest neighbors of the points
    If the distance function is euclidean, the computation relies on faiss-cpu.
    Otherwise, the computation is done based on scikit-learn KD Tree algorithm
    You can use any distance function supported by scikit-learn KD Tree or specify a callable function
    INPUT:
    	ndarray: points: list of points
    	int: k: number of nearest neighbors to compute
    	str or callable: distance_function: distance function to use
    OUTPUT:
    	ndarray: knn_indices: k-nearest neighbors of each point 
    """
    	
    ## make c-contiguous
    points = np.ascontiguousarray(points, dtype=np.float32)
    
    if distance_function == "euclidean":
        index = faiss.IndexFlatL2(points.shape[1])
        index.add(points)
        knn_indices = index.search(points, k+1)[1][:, 1:]
    else:
        tree = KDTree(points, metric=distance_function)
        knn_indices = tree.query(points, k=k+1, return_distance=False)[:, 1:]
    	
    return knn_indices

In [None]:
def measure(orig, emb, k=20, return_local=False):
	"""
	Compute the trustworthiness and continuity of the embedding
	INPUT:
		ndarray: orig: original data
		ndarray: emb: embedded data
		int: k: number of nearest neighbors to consider
		tuple: knn_ranking_info: precomputed k-nearest neighbors and rankings of the original and embedded data (Optional)
	OUTPUT:
		dict: trustworthiness and continuity
	"""

	orig_knn_indices, orig_ranking = knn_with_ranking(orig, k, distance_function='cosine')
	emb_knn_indices,  emb_ranking  = knn_with_ranking(emb, k)

	if return_local:
		trust, local_trust = tnc_computation(orig_knn_indices, orig_ranking, emb_knn_indices, k, return_local)
		cont , local_cont  = tnc_computation(emb_knn_indices,  emb_ranking, orig_knn_indices, k, return_local)
		return ({
			"trustworthiness": trust,
			"continuity": cont
		}, {
			"local_trustworthiness": local_trust,
			"local_continuity": local_cont
		})
	else:
		trust = tnc_computation(orig_knn_indices, orig_ranking, emb_knn_indices, k, return_local)
		cont  = tnc_computation(emb_knn_indices,  emb_ranking, orig_knn_indices, k, return_local)
		return {
			"trustworthiness": trust,
			"continuity": cont
		}

def tnc_computation(base_knn_indices, base_ranking, target_knn_indices, k, return_local=False):
	"""
	Core computation of trustworthiness and continuity
	"""
	local_distortion_list = []
	points_num = base_knn_indices.shape[0]

	for i in range(points_num):
		missings = np.setdiff1d(target_knn_indices[i], base_knn_indices[i])
		local_distortion = 0.0 
		for missing in missings:
			local_distortion += base_ranking[i, missing] - k
		local_distortion_list.append(local_distortion)
	local_distortion_list = np.array(local_distortion_list)
	local_distortion_list = 1 - local_distortion_list * (2 / (k * (2 * points_num - 3 * k - 1)))

	average_distortion = np.mean(local_distortion_list)

	if return_local:
		return average_distortion, local_distortion_list
	else:
		return average_distortion