In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

import re
from pprint import pprint
from collections import Counter

%load_ext autoreload
%autoreload 2
# %matplotlib inline

In [2]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train',  remove=('headers', 'footers'))

In [3]:
target_names = ['comp.graphics', 'rec.sport.baseball', 'talk.politics.mideast', 'rec.autos', 'sci.med']
target_nums = [i for i in range(20) if newsgroups_train.target_names[i] in target_names]

masks = [newsgroups_train.target == i for i in target_nums]
mask = np.array([any(tup) for tup in zip(*masks)])

data = np.array(newsgroups_train.data)[mask]
targets = np.array(newsgroups_train.target)[mask]

print(f'Counts\ndata shape: {data.shape}')
# print(f'targets shape: {targets.shape}')

for name, count in zip(target_names, [np.sum(m) for m in masks]):
    print(f'{name}: {count}')

Counts
data shape: (2933,)
comp.graphics: 584
rec.sport.baseball: 594
talk.politics.mideast: 597
rec.autos: 594
sci.med: 564


In [27]:
import csv

def save_labels(name:string):
    PATH = f'data/{name}_labels.csv'
    # first row is header
    labels = ['header'] + [newsgroups_train.target_names[i] for i in targets]
    with open(PATH, 'w') as myfile:
        wr = csv.writer(myfile,dialect='excel')
        for row in labels:
            wr.writerow([row])
    print(f'Saved in {PATH}')
    
save_labels('tfidf')

Saved in data/tfidf_labels.csv


In [6]:
data[1]

"Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:\n> > Anyone know about the Weitek P9000 graphics chip?\n> As far as the low-level stuff goes, it looks pretty nice.  It's got this\n> quadrilateral fill command that requires just the four points.\n\nDo you have Weitek's address/phone number?  I'd like to get some information\nabout this chip.\n"

In [7]:
print(data[1])

Robert J.C. Kyanko (rob@rjck.UUCP) wrote:
> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:
> > Anyone know about the Weitek P9000 graphics chip?
> As far as the low-level stuff goes, it looks pretty nice.  It's got this
> quadrilateral fill command that requires just the four points.

Do you have Weitek's address/phone number?  I'd like to get some information
about this chip.



In [8]:
text = pd.DataFrame(data)
text.to_pickle('data/source_text.pkl')

### TFIDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# vectorize
bow_vec = TfidfVectorizer(max_df=0.25, min_df=0.001, ngram_range=(1, 2),
                          sublinear_tf=True, use_idf=True)
%time doc_bow = bow_vec.fit_transform(data)

# get vocab
vocab = bow_vec.get_feature_names()
print('\nVocab size:', len(vocab))
print('Document vector shape:', doc_bow.shape)

CPU times: user 2.56 s, sys: 70.9 ms, total: 2.63 s
Wall time: 2.67 s

Vocab size: 60109
Document vector shape: (2933, 60109)


In [10]:
counts = np.count_nonzero(doc_bow.toarray(), axis=0)
percentage = 100 * counts / len(data)
print('Token'.ljust(10), '% Docs', sep='\t')
print('='*30)
for idx in np.argsort(counts)[-25:]:
    print(vocab[idx].ljust(10), f'{percentage[idx]:.2f}', sep='\t')

Token     	% Docs
these     	19.16
that the  	19.77
could     	19.98
for the   	20.08
time      	20.12
them      	20.76
people    	21.00
had       	21.41
does      	21.55
than      	21.79
com       	21.79
we        	21.85
to be     	22.37
were      	22.74
their     	22.81
also      	23.05
he        	23.18
only      	23.32
get       	23.59
other     	23.63
how       	23.66
been      	23.66
when      	23.90
think     	24.21
it is     	24.96


In [29]:
# https://scikit-learn.org/stable/modules/decomposition.html
from sklearn.decomposition import TruncatedSVD, PCA
svd = TruncatedSVD(n_components=200, n_iter=7, random_state=42)
%time doc_bow_svd = svd.fit_transform(doc_bow) 

print('explained variance:', svd.explained_variance_ratio_.sum())


PCA = PCA()

In [25]:
def save_vectors(vectors, name:string):
    PATH = f"data/{name}_input.csv"
    # create .csv header
    head_vals = np.arange(vectors.shape[1])
    header = ",".join([item for item in head_vals.astype(str)])
    np.savetxt(PATH, vectors, header=header, comments='', delimiter=',')
    print(f'Saved in {PATH}')

save_vectors(doc_bow_svd, 'tfidf')

Saved in data/tfidf_input.csv


In [12]:
# save out high d object
# TODO: TOO BIG
# head_vals = np.arange(doc_bow_svd.shape[1])
# header = ",".join([item for item in head_vals.astype(str)])

# np.savetxt("data/tfidf_input.csv", doc_bow_svd, header=header, comments='', delimiter=',')

### Doc2Vec

In [15]:
import gensim

# tokenize
def read_corpus(data, tokens_only=False):
    for i, doc in enumerate(data):
        if tokens_only:
            yield gensim.utils.simple_preprocess(doc)
        else:
            yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(doc), [i])
        
corpus = list(read_corpus(data))



In [16]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, epochs=40)
model.build_vocab(corpus)

print(f'vocab length: {len(model.wv.vocab)}')
print(f'sample tokenised doc:\n{corpus[0]}')

vocab length: 20979
sample tokenised doc:
TaggedDocument(['was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail'], [0])


In [17]:
%time model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)


CPU times: user 57.4 s, sys: 1.64 s, total: 59.1 s
Wall time: 23.7 s


In [18]:
model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', 'fires']).shape

(200,)

In [19]:
ranks = []
second_ranks = []
for doc_id in range(len(corpus)):
    inferred_vector = model.infer_vector(corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    # what rank is the trained vector in order of sim to inferred vector
    rank = [docid for docid, sim in sims].index(doc_id) 
    ranks.append(rank)
    
    second_ranks.append(sims[1])

print(f'{100* np.sum([r==0 for r in ranks]) / len(ranks):.2f}%')

# import collections
# counter = collections.Counter(ranks)
# sorted(counter.most_common(20))

76.78%


In [20]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(corpus[sims[index][0]].words)))

Document (2932): «in article qkgbuinns shelley washington edu bolson carson washington edu edward bolson writes boy this will be embarassing if it is trivial or an faq given points non coplanar how does one find the sphere that is center and radius exactly fitting those points know how to do it for circle from points but do not immediately see straightforward way to do it in have checked some geometry books graphics gems and farin but am still at loss please have mercy on me and provide the solution wouldn this require hyper sphere in space points over specifies sphere as far as can see unless that is you can prove that point exists in space that is equi distant from the points and this may not necessarily happen correct me if wrong which quite possibly am steve»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d200,n5,w5,mc2,s0.001,t3):

MOST (2657, 0.8543505668640137): «in article rb srgenprp sr hp com almanb sr hp com bob alman writes hose»

SECOND-MOST (735, 0.849534273147583): «jus

In [28]:
save_vectors(model.docvecs.vectors_docs, 'doc2vec')
save_labels('doc2vec')

Saved in data/doc2vec_input.csv
Saved in data/doc2vec_labels.csv


### BERT

In [62]:
bert_text = list(text.strip() for text in data)
bert_text = ['BLANK' if text is '' else text for text in bert_text]

In [73]:
from bert_serving.client import BertClient

# start server e.g.
# bert-serving-start -model_dir ~/models/BERT/cased_L-12_H-768_A-12/ -num_worker=4 -max_seq_len=500

bc = BertClient(ip='137.117.67.76', show_server_config=True)
%time bert_vectors = bc.encode(bert_text)

server config:
                        client	=	4d940469-5918-4f4d-8a1c-b068b8938f87
                   num_process	=	2                             
          ventilator -> worker	=	['ipc://tmpRytRC8/socket', 'ipc://tmpORskFY/socket', 'ipc://tmpZc9NHO/socket', 'ipc://tmp2RmiKE/socket', 'ipc://tmpmZaNMu/socket', 'ipc://tmpmaiiPk/socket', 'ipc://tmpfsXNRa/socket', 'ipc://tmp4t9jU0/socket']
                worker -> sink	=	ipc://tmpSpUOnf/socket        
           ventilator <-> sink	=	ipc://tmpq32oAi/socket        
           server_current_time	=	2019-04-25 10:12:00.033451    
                     statistic	=	{'num_data_request': 0, 'num_total_seq': 0, 'num_sys_request': 1, 'num_total_request': 1, 'num_total_client': 1, 'num_active_client': 0, 'avg_request_per_client': 1.0, 'min_request_per_client': 1, 'max_request_per_client': 1, 'num_min_request_per_client': 1, 'num_max_request_per_client': 1}
                    device_map	=	[]                            
         num_concurrent_sock

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


CPU times: user 101 ms, sys: 190 ms, total: 291 ms
Wall time: 39 s


In [75]:
save_vectors(bert_vectors, 'bert_250_word_mean')
save_labels('bert_250_word_mean')

Saved in data/bert_250_word_mean_input.csv
Saved in data/bert_250_word_mean_labels.csv


### Try out new tsne

In [35]:
import bhtsne

%time output = bhtsne.tsne(doc_bow_svd, dimensions=3)

CPU times: user 1min 15s, sys: 3.08 s, total: 1min 18s
Wall time: 1min 18s


In [15]:
from sklearn.manifold import TSNE

tsne = TSNE(n_components=3)
%time output_2 = tsne.fit_transform(doc_bow_svd)

CPU times: user 2min 31s, sys: 5.3 s, total: 2min 36s
Wall time: 2min 36s


In [11]:
# data = np.loadtxt("data/tfidf_input.csv", skiprows=1)
# embedding_array = bhtsne.run_bh_tsne(doc_bow_svd, initial_dims=doc_bow_svd.shape[1])

In [37]:
pca_dim_ls = [25, 100, None]
{('No PCA' if i is None else i): (200 if i is None else i) for i in pca_dim_ls}

{25: 25, 100: 100, 'No PCA': 200}

In [36]:
df = pd.DataFrame(output, columns=['x', 'y', 'z'])
df

Unnamed: 0,x,y,z
0,-1.149523,35.135636,13.449234
1,-33.802058,5.057675,15.112783
2,-12.953371,6.272250,0.888270
3,8.501414,35.272079,4.175327
4,2.045654,-12.163825,-18.475271
5,-14.902226,13.177760,-3.301088
6,0.797801,-6.008549,-0.298618
7,-0.450804,-25.294197,-13.729284
8,12.211734,4.324410,-7.236874
9,-10.851260,-18.124040,13.533896
