In [1]:
from nltk.corpus import stopwords
from multiprocessing import Process
from pymongo import MongoClient
import nltk
nltk.download('stopwords')
import os
import string
import tqdm

DEBUG = False

class TrainPipeline:
  """Turns scraped documents into lower-cased, stop-word-free token lists.

  A document is a mapping with a required 'title' entry and optional
  'subtitles' and 'paragraphs' entries (see `transform`). Optional
  post-processors can be registered and are applied in ascending `order`.
  """

  def __init__(self):
    # ASCII punctuation plus the typographic dash and both opening and
    # closing curly quotes; every one of these is mapped to a space so that
    # punctuation never stays glued to a token.
    punctuations = string.punctuation + '—“”‘’'

    self.stopws = set(stopwords.words('english'))
    self.translator = str.maketrans(punctuations, ' ' * len(punctuations))
    # List of (postprocessor, order) tuples kept sorted by ascending order.
    self.postprocessors = []

  def split(self, wordLists, split_way='sentence'):
    """Split a list of strings at the requested granularity.

    Params:
      wordLists: list of strings (titles, sentences or paragraphs).
      split_way: 'word', 'sentence' or 'paragraph'.

    Returns:
      'paragraph': a new list holding the input strings unchanged.
      'sentence':  one list per input string, split on '.'.
      'word':      a single flat list of every string split on ' '.

    Raises:
      AssertionError: if `split_way` is not one of the three modes.
    """
    assert split_way in ['word', 'sentence', 'paragraph']

    if split_way == 'paragraph':
      # Inputs are already paragraph-sized; hand them back instead of None
      # so callers can always iterate over the result.
      return list(wordLists)

    if split_way == 'sentence':
      return [wl.split('.') for wl in wordLists]

    words = []
    for wl in wordLists:
      words.extend(wl.split(' '))
    return words

  def elim_stopwords(self, wordLists):
    """Tokenise strings and drop English stop words.

    Params:
      wordLists: list of strings (words, sentences or paragraphs).

    Returns:
      One flat list of lower-cased tokens, in input order, with punctuation
      replaced by spaces and stop words removed.
    """
    wl_tokens = []
    for wl in wordLists:
      for token in wl.translate(self.translator).split():
        lowered = token.lower()
        if lowered not in self.stopws:
          wl_tokens.append(lowered)
    return wl_tokens

  def transform(self, doc):
    """Tokenise one document.

    Params:
      doc: mapping with a 'title' entry and optional 'subtitles' and
        'paragraphs' entries (both default to an empty list).

    Returns:
      (title_tokens, paragraph_tokens): the output of `transform_titles`
      and `transform_paragraphs` respectively.

    Raises:
      KeyError: if `doc` has no 'title' entry.
    """
    if DEBUG:
      print(doc.keys())
    title_outputs = self.transform_titles(doc['title'], doc.get('subtitles', []))
    pgraph_outputs = self.transform_paragraphs(doc.get('paragraphs', []))

    return (title_outputs, pgraph_outputs)

  def transform_titles(self, title, subtitles):
    """Return the flat token list for a title plus its subtitles.

    Params:
      title: a single string or a list of strings.
      subtitles: list of strings.
    """
    if not isinstance(title, list):
      title = [title]

    titles = title + subtitles
    keywords = self.split(titles, split_way='word')
    return self.elim_stopwords(keywords)

  def transform_paragraphs(self, paragraphs):
    """Return one token list per paragraph.

    Each paragraph is split on '.', and the tokens of all its sentences are
    concatenated into a single list for that paragraph.
    """
    sentences = self.split(paragraphs, split_way='sentence')
    return [self.elim_stopwords(s) for s in sentences]

  def register_postprocessor(self, postprocessor, order):
    """Register a post-processor so that processors run in ascending `order`.

    Params:
      postprocessor: object exposing `transform(data)`.
      order: sortable priority; lower values run first. A processor whose
        order equals an existing one is placed before it.
    """
    insert_idx = 0
    # Walk past every registered processor with a strictly smaller order,
    # stopping at the end of the list.
    while (insert_idx < len(self.postprocessors)
           and order > self.postprocessors[insert_idx][1]):
      insert_idx += 1

    self.postprocessors.insert(insert_idx, (postprocessor, order))

  def run_postprocessors(self, data):
    """Feed `data` through every registered post-processor, in order."""
    for processor, _ in self.postprocessors:
      data = processor.transform(data)
    return data

class PostProcessors:
  """Sketch of the post-processor interface; constructing it always fails."""

  def __init__(*args, **kwargs):
    # Every construction attempt is rejected, whatever the arguments.
    message = 'Base class for post processors, not to be instantiated'
    raise AssertionError(message)

  def transform(self, inputs):
    """Placeholder hook.

    `inputs` is meant to be a list of words/sentences and the result a
    modified list; this base version does nothing and returns None.
    """
    return None

class WordCountLimitProcessor:
  """Post-processor that keeps only sentences longer than a lower bound."""

  def __init__(self, word_count_lb):
    # Exclusive lower bound: a sentence needs MORE than this many items.
    self.word_count_lb = word_count_lb

  def transform(self, sentences):
    """Return, in order, the sentences whose length exceeds `word_count_lb`."""
    kept = []
    for sentence in sentences:
      if len(sentence) > self.word_count_lb:
        kept.append(sentence)
    return kept

if __name__ == '__main__':
  # Build the tokenising pipeline and load every document of the local
  # MongoDB collection `hndb.mongo_sites_1` into memory as a list.
  pipeline = TrainPipeline()
  client = MongoClient("localhost", 27017, maxPoolSize=50)
  db = client.hndb
  collection = db['mongo_sites_1']
  docs = list(collection.find({}))

  # `titles[i]` and `pgraph_datas[i]` are the pipeline outputs for `docs[i]`:
  # a flat token list for the title/subtitles, and a list of token lists
  # (one per paragraph) for the body. Later cells read `docs` and
  # `pgraph_datas`.
  titles = []
  pgraph_datas = []
  for doc in tqdm.tqdm(docs):
    title_data, pgraph_data = pipeline.transform(doc)
    titles.append(title_data)
    pgraph_datas.append(pgraph_data)
  # Post-processing is currently disabled.
  # NOTE(review): the disabled lines below operate on `pgraph_data`, i.e. only
  # the last document left over from the loop, not on `pgraph_datas`.
  # wc_processor = WordCountLimitProcessor(5)

  # pipeline.register_postprocessor(wc_processor, 5)
  # pgraph_data = pipeline.run_postprocessors(pgraph_data)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liweic/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 27573/27573 [00:17<00:00, 1535.93it/s]


In [5]:
# Count the documents whose first title entry is Bloomberg's bot-check page,
# i.e. pages where the scrape captured the interstitial instead of an article.
# In the recorded run this printed 22633 for the 27573 loaded documents.
# NOTE(review): `doc['title'][0]` assumes every doc has a 'title' that is a
# list; for a plain-string title it would compare only the first character.
robot = 0
for doc in docs:
    if doc['title'][0] =='Bloomberg - Are you a robot?':
        robot += 1
print(robot)

22633


In [2]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

In [3]:
# Smoke test of DistilBERT sequence classification on a single sentence.
# The recorded warning states that the classifier weights are newly
# initialized for this checkpoint, so the label shown below is not a
# meaningful prediction.
# NOTE: `model` is rebound to a Doc2Vec model by a later cell.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("Hello, my dog is so pretty", return_tensors="pt")
# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(**inputs).logits

# argmax() with no axis works on the flattened logits; with the single input
# sentence used here that is the index of the highest-scoring class.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

'LABEL_1'

In [4]:
import numpy as np
import pickle
from gensim.models import KeyedVectors
import itertools

In [5]:
# Load the raw word2vec vector array from a file in the working directory.
# NOTE(review): `vectors` is not referenced by any later cell visible here.
fname = 'word2vec-google-news-300.model.vectors.npy'
vectors = np.load(fname)

In [6]:
# Load the saved gensim KeyedVectors model (rebinds `fname` from the previous
# cell).
# NOTE(review): `word_vectors` is not referenced by any later cell visible
# here.
fname = 'word2vec-google-news-300.model'
word_vectors = KeyedVectors.load(fname)

In [7]:
# Sanity check: one entry per loaded document (27573 in the recorded run).
len(pgraph_datas)

27573

In [8]:
# Collapse each document's per-paragraph token lists into one flat token list.
# Only nested lists are expanded; plain string tokens pass through untouched,
# so running this cell a second time is a no-op instead of exploding every
# token into single characters.
pgraph_datas = [
    [token
     for item in pgraph_data
     for token in (item if isinstance(item, list) else [item])]
    for pgraph_data in pgraph_datas
]

In [10]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

In [11]:
# Tag each document's token list with its position in `pgraph_datas`, which
# is also its position in `docs`.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(pgraph_datas)]

In [12]:
# Train a 300-dimensional Doc2Vec model on the tagged documents. This rebinds
# `model`, which previously held the DistilBERT classifier.
# NOTE(review): no `seed` is passed and workers=4, so results presumably vary
# between runs — confirm against the gensim reproducibility notes.
model = Doc2Vec(documents, vector_size=300, window=8, min_count=1, workers=4)

In [13]:
# Smoke test: infer a vector for a tiny ad-hoc token list.
# NOTE(review): `vector` is not referenced by any later cell visible here.
vector = model.infer_vector(["system", "response"])

In [14]:
# Infer one Doc2Vec vector per document; `pgraph_vectors[i]` corresponds to
# `pgraph_datas[i]` and therefore to `docs[i]`.
pgraph_vectors = []
for pgraph_data in pgraph_datas:
  pgraph_vectors.append(model.infer_vector(pgraph_data))

In [15]:
from sklearn.cluster import KMeans

In [16]:
# Stack the per-document vectors into one array for clustering.
X = np.array(pgraph_vectors)

In [17]:
# Cluster the document vectors into 20 groups; random_state is fixed at 0.
kmeans = KMeans(n_clusters=20, random_state=0).fit(X)

In [18]:
# Display the cluster label assigned to each document (same order as `X`).
kmeans.labels_

array([4, 4, 4, ..., 4, 4, 4], dtype=int32)

In [19]:
from collections import Counter
# Number of documents assigned to each cluster label.
# NOTE(review): `label_counts` is built but not displayed in this cell.
label_counts = Counter(kmeans.labels_)

In [25]:
def pick_samples_of_label(docs, doc_vecs, centroids, labels, of_label, sample=10, top=10):
    """Rank the members of one cluster by distance to that cluster's centroid.

    Args:
        docs: accepted for the caller's convenience; not used by the body.
        doc_vecs: array of document vectors, one row per document.
        centroids: cluster centres, indexable by label.
        labels: array holding the cluster label of each row of `doc_vecs`.
        of_label: the label whose members are ranked.
        sample, top: accepted but currently ignored; the complete ranking is
            always returned and callers slice it themselves.

    Returns:
        List of row indices whose label equals `of_label`, ordered by
        ascending Euclidean distance to `centroids[of_label]`. Ties keep
        their original index order.
    """
    is_member = labels == of_label
    member_idxs = is_member.nonzero()[0]
    member_vecs = doc_vecs[member_idxs]
    offsets = member_vecs - centroids[of_label]
    dists = np.linalg.norm(offsets, axis=1)
    assert len(member_vecs) == len(member_idxs), f"Lengths must be same, but {len(member_vecs)} != {len(member_idxs)}"

    # Sort positions by their distance (stable), then map back to row indices.
    ranked_positions = sorted(range(len(member_idxs)), key=lambda pos: dists[pos])
    return [member_idxs[pos] for pos in ranked_positions]

In [44]:
# Rank the documents of cluster 2 by distance to its centroid.
# `doc_vecs` is built from the same `pgraph_vectors` list as `X`.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
doc_vecs = np.array(pgraph_vectors)
# `top` holds every index in cluster 2, nearest to the centroid first.
top = pick_samples_of_label(docs, doc_vecs, centroids, labels, 2)

In [45]:
# Preview the 20 documents closest to the centroid: paragraphs are joined
# with '.' and truncated to the first 500 characters.
# NOTE(review): `docs[i]['paragraphs']` raises KeyError for a document with no
# 'paragraphs' entry, which the pipeline itself tolerates.
for i in top[:20]:
  print("post {}".format(i))
  print('.'.join(docs[i]['paragraphs'])[:500])
  print("")

post 13151

    .
  .
                Researchers found that lunar pits and caves reach stable temperatures, making them potentially suitable for human life.
                
                .
                .
            .Researchers found that lunar pits and caves reach stable temperatures, making them potentially suitable for human life..Hoping to live on the moon one day? Your chances just got a tiny bit better..The moon has pits and caves where temperatures stay at roughly 63 degrees Fahrenheit, maki

post 21872
Physicists have known for decades that, in principle, a semiconductor device can emit more light power than it consumes electrically. Experiments published in . finally demonstrate this in practice, though at a small scale..The energy absorbed by an electron as it traverses a light-emitting diode is equal to its charge times the applied voltage. But if the electron produces light, the emitted photon energy, which is determined by the semiconductor band gap, can be much la