In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from multiprocessing import Process
from pymongo import MongoClient
import os
import string
import tqdm

DEBUG = False

class TrainPipeline:
  """Text-cleaning pipeline that turns a scraped document into token lists.

  Titles are reduced to a flat list of lower-cased, stopword-free keywords;
  paragraphs are reduced to one such token list per paragraph. Optional
  post-processors can be registered and are applied in ascending `order`.
  """

  def __init__(self):
    # Typographic marks that string.punctuation lacks. The previous literal
    # contained the invalid escape '\’'; dropping the backslash yields the same
    # translation table because string.punctuation already includes '\'.
    # NOTE(review): the en dash '–' is not covered, so it survives as a token
    # (visible in the recorded output of this notebook) — confirm if intended.
    punctuations = string.punctuation + '—”\'’'

    self.stopws = set(stopwords.words('english'))
    # Every punctuation character is mapped to a space so that it acts as a
    # token separator rather than being silently glued to its neighbours.
    self.translator = str.maketrans(punctuations, ' ' * len(punctuations))
    self.postprocessors = []

  def split(self, wordLists, split_way='sentence'):
    """Split each string in `wordLists` at the requested granularity.

    Args:
      wordLists: iterable of strings.
      split_way: 'word', 'sentence' or 'paragraph'.

    Returns:
      'word': one flat list of space-separated pieces across all inputs.
      'sentence': one list per input string, split on '.'.
      'paragraph': the inputs as a list, unchanged (each input is already
        treated as one paragraph). This used to return None.

    Raises:
      AssertionError: if `split_way` is not one of the three supported values.
    """
    assert split_way in ('word', 'sentence', 'paragraph')

    if split_way == 'paragraph':
      return list(wordLists)

    if split_way == 'sentence':
      return [wl.split('.') for wl in wordLists]

    words = []
    for wl in wordLists:
      words += wl.split(' ')
    return words

  def elim_stopwords(self, wordLists):
    """Return the lower-cased, non-stopword tokens of all strings, flattened.

    Punctuation is replaced by spaces before tokenising on whitespace.
    """
    wl_tokens = []
    for wl in wordLists:
      for token in wl.translate(self.translator).split():
        lowered = token.lower()
        if lowered not in self.stopws:
          wl_tokens.append(lowered)
    return wl_tokens

  def transform(self, doc):
    """Transform one document mapping into `(title_tokens, paragraph_tokens)`.

    `doc['title']` is required; 'subtitles' and 'paragraphs' default to empty
    lists when absent.
    """
    if DEBUG:
      print(doc.keys())
    title_outputs = self.transform_titles(doc['title'], doc.get('subtitles', []))
    pgraph_outputs = self.transform_paragraphs(doc.get('paragraphs', []))

    return (title_outputs, pgraph_outputs)

  def transform_titles(self, title, subtitles):
    """Return the keyword tokens of the title (string or list) plus subtitles."""
    if not isinstance(title, list):
      title = [title]

    titles = title + subtitles
    keywords = self.split(titles, split_way='word')
    return self.elim_stopwords(keywords)

  def transform_paragraphs(self, paragraphs):
    """Return one flat token list per paragraph.

    Each paragraph is first split into sentences on '.', then the tokens of
    all its sentences are collected into a single list.
    """
    sentences = self.split(paragraphs, split_way='sentence')
    return [self.elim_stopwords(s) for s in sentences]

  def register_postprocessor(self, postprocessor, order):
    """Insert `postprocessor` so the registry stays sorted by ascending `order`.

    The entry is placed before the first registered processor whose order is
    greater than or equal to `order`.
    """
    insert_idx = 0
    # Compare against the stored order (second tuple element) and stop at the
    # end of the list; the previous loop referenced a misspelled attribute and
    # an undefined index, and could never terminate normally.
    while (insert_idx < len(self.postprocessors)
           and order > self.postprocessors[insert_idx][1]):
      insert_idx += 1

    self.postprocessors.insert(insert_idx, (postprocessor, order))

  def run_postprocessors(self, data):
    """Feed `data` through every registered processor in order and return it."""
    for processor, _ in self.postprocessors:
      data = processor.transform(data)
    return data

class PostProcessors:
  """Conceptual base for post-processing steps; constructing it always fails."""

  def __init__(*args, **kwargs):
    # Reject every construction attempt, whatever arguments are supplied.
    raise AssertionError('Base class for post processors, not to be instantiated')

  def transform(self, inputs):
    """Placeholder hook: takes a list of words/sentences, meant to return the
    modified list. The base version does nothing and yields None."""
    return None

class WordCountLimitProcessor:
  """Post-processor that keeps only sentences longer than a lower bound."""

  def __init__(self, word_count_lb):
    # Exclusive lower bound: a sentence must have MORE items than this to stay.
    self.word_count_lb = word_count_lb

  def transform(self, sentences):
    """Return the sentences whose length is strictly above `word_count_lb`."""
    kept = []
    for candidate in sentences:
      if len(candidate) > self.word_count_lb:
        kept.append(candidate)
    return kept

if __name__ == '__main__':
  # Set up the cleaning pipeline and the connection to the local MongoDB.
  pipeline = TrainPipeline()
  client = MongoClient("localhost", 27017, maxPoolSize=50)
  db = client.hndb
  collection = db['mongo_sites_1']

  # Pull every stored document into memory so tqdm can show total progress.
  docs = list(collection.find({}))

  # Per-document outputs, index-aligned with `docs`.
  titles, pgraph_datas = [], []
  for doc in tqdm.tqdm(docs):
    title_data, pgraph_data = pipeline.transform(doc)
    titles.append(title_data)
    pgraph_datas.append(pgraph_data)
  # wc_processor = WordCountLimitProcessor(5)

  # pipeline.register_postprocessor(wc_processor, 5)
  # pgraph_data = pipeline.run_postprocessors(pgraph_data)


[nltk_data] Downloading package stopwords to /Users/lchi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 22196/22196 [00:04<00:00, 4759.43it/s]


In [2]:
import torch
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [3]:
# Load the pretrained DistilBERT tokenizer and a sequence-classification model
# from the same "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Tokenize one sample sentence into PyTorch tensors and run it without gradients.
inputs = tokenizer("Hello, my dog is so pretty", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# NOTE(review): the load warning printed by this cell reports newly initialized
# classifier weights, so the label shown is presumably not meaningful until the
# model is fine-tuned — confirm before relying on it.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

'LABEL_1'

In [27]:
import numpy as np
import pickle
from gensim.models import KeyedVectors
import itertools

In [5]:
# Load the NumPy array stored in the .npy file (path is relative to the
# notebook's working directory).
fname = 'word2vec-google-news-300.model.vectors.npy'
vectors = np.load(fname)

In [15]:
# Load the saved gensim KeyedVectors; note this rebinds `fname` from the
# previous cell.
fname = 'word2vec-google-news-300.model'
word_vectors = KeyedVectors.load(fname)

In [22]:
len(pgraph_datas)

22196

In [28]:
# Collapse each document's per-paragraph token lists into one flat token list.
# Items that are already plain string tokens are kept whole, so re-running this
# cell is a no-op instead of exploding every token into single characters.
pgraph_datas = [
    list(itertools.chain.from_iterable(
        [item] if isinstance(item, str) else item for item in pgraph_data))
    for pgraph_data in pgraph_datas
]

In [32]:
pgraph_datas[0]

['bit',
 'mystery',
 'teamviewer',
 '–',
 'popular',
 'remote',
 'desktop',
 'program',
 '–',
 'install',
 'font',
 'use',
 'computer',
 'abstract',
 'font',
 'shown',
 'image',
 'seem',
 'serve',
 'purpose',
 'software',
 'intentional',
 'enables',
 'websites',
 'detect',
 'teamviewer',
 'installed',
 'computer',
 'see',
 'almost',
 'complete',
 'type',
 'specimen',
 'teamviewer',
 'font',
 'illustration',
 'contains',
 'characters',
 'write',
 'plus',
 'digits',
 '7',
 '8',
 'remaining',
 '24',
 'majuscules',
 'uppercase',
 'characters',
 'latin',
 'alphabet',
 'encoded',
 'apostrophe',
 'included',
 'characters',
 'feature',
 'rather',
 'unique',
 'mostly',
 'unreadable',
 'design',
 'uncommon',
 'creative',
 'software',
 'like',
 'microsoft',
 'office',
 'libreoffice',
 'adobe',
 'creative',
 'suite',
 'install',
 'complementary',
 'fonts',
 'however',
 'fonts',
 'meant',
 'enhance',
 'use',
 'software',
 'giving',
 'fonts',
 'options',
 'get',
 'teamviewer',
 'font',
 'option',
 '

In [34]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts


In [37]:
# Wrap each document's token list in a TaggedDocument tagged with its position
# in `pgraph_datas`.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(pgraph_datas)]

In [38]:
# Train a 300-dimensional Doc2Vec model on the tagged documents. This rebinds
# `model`, which an earlier cell assigned to the DistilBERT classifier.
# NOTE(review): no seed is passed and workers=4, so results may differ from run
# to run — confirm whether reproducibility matters here.
model = Doc2Vec(documents, vector_size=300, window=8, min_count=1, workers=4)

In [39]:
# Quick check: infer a vector for a two-token document with the trained model.
vector = model.infer_vector(["system", "response"])

In [42]:
import sys
!{sys.executable} -m pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting scikit-learn
  Downloading scikit_learn-1.1.2-cp38-cp38-macosx_10_9_x86_64.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=7acd39a78715ff988fd9d20d2a9ec44416eae06c0a1e93659bf11abe5b016957
  Stored in directory: /Users/lchi/Library/Caches/pip/wheels/22/0b/40/fd3f795caaa1fb4c6cb738bc1f56100be1e57da95849bfc897
Successfully built sklearn
Installing collected packages: threadpoolctl, scikit-learn, sklearn
Successfully installed scikit-learn-1.1.2 sklearn-0.0 threadpoolctl-3.1.0


In [43]:
# Infer one Doc2Vec vector per document, in the same order as `pgraph_datas`.
pgraph_vectors = []
for pgraph_data in pgraph_datas:
  pgraph_vectors.append(model.infer_vector(pgraph_data))

In [44]:
from sklearn.cluster import KMeans

In [45]:
# Stack the inferred document vectors into one array for clustering.
X = np.array(pgraph_vectors)

In [59]:
# Cluster the document vectors into 20 groups; random_state is fixed for KMeans.
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

In [60]:
kmeans.labels_

array([ 2, 13, 13, ..., 13, 14,  1], dtype=int32)

In [61]:
kmeans.labels_[:100]

array([ 2, 13, 13, 12,  2, 10, 13,  8, 16,  2,  4, 13, 13, 13, 13, 13,  1,
        5, 12, 12, 13,  0, 13, 13,  2,  5, 14, 10,  2,  1,  1, 10, 14,  1,
        2, 14,  2, 16,  2, 14,  5, 13, 13, 13, 10,  5,  1,  2,  1,  2, 13,
       13,  5,  2,  8, 13, 11, 13,  1, 14, 16, 13, 13, 13, 14,  1,  2, 13,
        2,  0,  1,  1,  1, 14, 14, 13,  1, 13,  1, 13,  1, 12,  2,  5,  5,
       13, 13, 14,  8,  2, 11,  1, 14,  1, 13,  1, 13, 16,  1,  1],
      dtype=int32)

In [107]:
def pick_samples_of_label(docs, doc_vecs, centroids, labels, of_label, sample=10, top=10):
    """Rank the members of one cluster by distance to that cluster's centroid.

    Args:
        docs: Unused; accepted so existing call sites keep working.
        doc_vecs: 2-D array-like of document vectors, one row per document.
        centroids: Array-like of cluster centres, indexable by label.
        labels: 1-D array-like with the cluster label of each row of `doc_vecs`.
        of_label: The cluster label whose members should be ranked.
        sample: Unused; no sampling is performed.
        top: Unused; the full ranking is returned and callers slice it.

    Returns:
        List of row indices into `doc_vecs` (and `labels`) for every document
        labelled `of_label`, ordered from closest to farthest from the centroid
        (Euclidean distance). Ties keep their original index order. Empty when
        no document carries the label.
    """
    # Coerce up front so plain Python lists work as well as ndarrays.
    doc_vecs = np.asarray(doc_vecs)
    labels = np.asarray(labels)
    centroid = np.asarray(centroids)[of_label]

    label_idxs = (labels == of_label).nonzero()[0]
    dists = np.linalg.norm(doc_vecs[label_idxs] - centroid, axis=1)

    # Stable argsort preserves index order among equidistant documents, which
    # matches the ordering of the earlier key-based Python sort.
    order = np.argsort(dists, kind='stable')
    return list(label_idxs[order])

In [108]:
# Gather the fitted cluster assignments and centres, then rank the documents of
# cluster 2 by distance to its centroid. `top` holds indices into `docs`,
# closest first (the full ranking, not only the first few).
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
doc_vecs = np.array(pgraph_vectors)
top = pick_samples_of_label(docs, doc_vecs, centroids, labels, 2)

In [114]:
# Show the raw paragraphs of the five documents closest to the cluster centroid.
for i in top[:5]:
  print(docs[i]['paragraphs'])

['The goal is to simulate 3D printers, but any sort of stm32 microcontroller firmware should work.', 'The emulator is configured via a configuration file, see example\n', '.', 'In the following example, I show how to emulate the 3D printer of the Elegoo\nSaturn and Anycubic MonoX unmodified firmwares downloaded from the vendor website.', 'This emulator is done in the context of my work on ', ' so I can write a Rust\nfirmware for 3D printers, ', '.', 'In the ', ",\nwe provide an SVD file that provides all the peripheral register addresses for\nthe STM32F407. We then configure various memory regions, framebuffers, and\ndevices. We also patch two functions in the firmware just to speed things up as\nwe don't need to wait for our devices to initialize.", 'We also specify the firmware binary ', ", and that's the\nofficial binary downloaded from the Elegoo website.  The ", " is the\ncontent of the external SPI flash dumped from the Saturn board itself (I cheated\na bit here, I wish we could 