# **Load text data from the given source**

In [1]:
# download data and covert to the from that etm can understand and process
from sklearn.datasets import fetch_20newsgroups
train_data = fetch_20newsgroups(subset='train').data
test_data = fetch_20newsgroups(subset='test').data
documents = train_data
documents.extend(test_data)
print(f'Number of documents {len(documents)}')

Number of documents 18846


# **Sample texts from 20 News Groups Dataset**

In [2]:
for i in range(0,2):
  print(documents[i])
  print(100*"=")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.wa

# **Preprocessing and Coverting**


1.   Preprocessing: 

  *   stopwords, punctuation, words filter by max-df, min-df
  *   lematization, lower
  *   remove empty documents after preprocessing

2.   Convert Text to Word-Embedding-Representation (BOW-dict)
3.   Split numeric representation to two parts of dataset

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy.sparse
import pandas as pd

Count-Matrix `vectorized_documents`:
- Column: Vocabulary (word of V)
- Row: Document
- Each Element_{ij} describes the frequency of word j in document i

In [4]:
# using CountVectorizer to reduce the vocabulary
min_df = 0.01
max_df = 0.75
dataset = documents
vectorizer = CountVectorizer(min_df=min_df, max_df=max_df)
vectorized_documents = vectorizer.fit_transform(dataset)
# vocalubary 2130 words in V, 18845 documents
pd.DataFrame.sparse.from_spmatrix(vectorized_documents[:,:])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2120,2121,2122,2123,2124,2125,2126,2127,2128,2129
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,1,0,0,0
18842,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
18843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
18844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# vocabulary and the frequency in the whole dataset
print(vectorizer.vocabulary_)
print(vectorizer.get_feature_names())
print(vectorizer.stop_words_)



In [6]:
# tokenization and remove stop words from documents
documents_without_stop_words = [
        [word for word in document.split()
            if word not in vectorizer.stop_words_]
        for document in dataset]
for i in range(0,2):
  print(documents_without_stop_words[i])
  print("\n")

['From:', 'lerxst@wam.umd.edu', "(where's", 'my', 'thing)', 'Subject:', 'WHAT', 'car', 'this!?', 'Nntp-Posting-Host:', 'rac3.wam.umd.edu', 'Organization:', 'University', 'Maryland,', 'College', 'Park', 'Lines:', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'me', 'on', 'this', 'car', 'I', 'saw', 'other', 'day.', 'It', 'was', 'a', '2-door', 'car,', 'looked', 'be', 'late', '60s/', 'early', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'were', 'really', 'small.', 'In', 'addition,', 'front', 'was', 'separate', 'rest', 'body.', 'This', 'all', 'I', 'know.', 'If', 'anyone', 'can', 'a', 'model', 'name,', 'engine', 'specs,', 'years', 'production,', 'where', 'this', 'car', 'made,', 'history,', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'looking', 'car,', 'please', 'e-mail.', 'Thanks,', '-', 'IL', '----', 'brought', 'you', 'by', 'your', 'Lerxst', '----']


['From:', 'guykuo@carson.u.washington.edu', '(Guy', 'Kuo)', 'Subject:', 'SI', 'Clock', 'Poll',

In [7]:
signed_documents = vectorized_documents.sign() #greater than 0 = 1, 0 = 0, otherwise -1
signed_docs_as_pd = pd.DataFrame.sparse.from_spmatrix(signed_documents)

print(len(list(signed_docs_as_pd.iloc[0])))
print(f'sum in the row: {sum(list(signed_docs_as_pd.iloc[0]))}')

# how many documents hold each word of vocabulary
print(len(list(signed_docs_as_pd[0])))
print(f'sum in the colum: {sum(list(signed_docs_as_pd[0]))}')

2130
sum in the row: 64
18846
sum in the colum: 748


In [8]:
# saving the frequency of each word in Vocabulary over all documents/ look "sum in the column"
sum_counts = signed_documents.sum(axis=0)
print(sum_counts)
v_size = sum_counts.shape[1]
sum_counts_np = np.zeros(v_size, dtype=int)
for v in range(v_size):
    sum_counts_np[v] = sum_counts[0, v]
sum_counts_np.shape

[[748 690 348 ... 297 576 206]]


(2130,)

# **Prepare Vocabulary**
1. Vocabulary only from train-data-set
2. word2id and id2word
3. docs-train, docs-test, docs-val after preprocessing and filtering

In [9]:
import random
def get_randoms(mdict, n_samples):
  #keys = random.sample(mdict.keys(), n_samples)
  keys = list(mdict.keys())[:n_samples]
  sample_d = {k: mdict[k] for k in keys}
  print("samples from dict: {}".format(sample_d))

word2id = {}
id2word = {}
for w in vectorizer.vocabulary_:
  word2id[w] = vectorizer.vocabulary_.get(w)
  id2word[vectorizer.vocabulary_.get(w)] = w
get_randoms(word2id, 3)
get_randoms(id2word, 3)

samples from dict: {'umd': 1973, 'edu': 676, 'where': 2070}
samples from dict: {1973: 'umd', 676: 'edu', 2070: 'where'}


In [10]:
#create and sort the vocabulary by the column-frequency of words
idx_sort = np.argsort(sum_counts_np)
print(idx_sort)
vocabulary = [id2word[idx_sort[cc]] for cc in range(v_size)]
vocabulary[:10]

[1595 1400  260 ... 1894 1030  815]


['reliable',
 'papers',
 'attacks',
 'covered',
 'ex',
 'enjoy',
 'stories',
 'scheme',
 'wayne',
 'coverage']

In [11]:
# data split to train, test and validation
train_size = 0.8
num_docs = signed_documents.shape[0]
print(f'number of docs: {num_docs}')
train_dataset_size = int(np.floor(train_size * num_docs))
test_dataset_size = int(num_docs - train_dataset_size)
# validationset?
idx_permute = np.random.permutation(num_docs).astype(int)
print(f'train size: {train_dataset_size}')
print(f'test size: {test_dataset_size}')
idx_permute

number of docs: 18846
train size: 15076
test size: 3770


array([15390, 15399, 13932, ..., 18405,  8189,  7753])

In [12]:
for text in documents_without_stop_words[:3]:
  print(text)

['From:', 'lerxst@wam.umd.edu', "(where's", 'my', 'thing)', 'Subject:', 'WHAT', 'car', 'this!?', 'Nntp-Posting-Host:', 'rac3.wam.umd.edu', 'Organization:', 'University', 'Maryland,', 'College', 'Park', 'Lines:', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'me', 'on', 'this', 'car', 'I', 'saw', 'other', 'day.', 'It', 'was', 'a', '2-door', 'car,', 'looked', 'be', 'late', '60s/', 'early', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'were', 'really', 'small.', 'In', 'addition,', 'front', 'was', 'separate', 'rest', 'body.', 'This', 'all', 'I', 'know.', 'If', 'anyone', 'can', 'a', 'model', 'name,', 'engine', 'specs,', 'years', 'production,', 'where', 'this', 'car', 'made,', 'history,', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'looking', 'car,', 'please', 'e-mail.', 'Thanks,', '-', 'IL', '----', 'brought', 'you', 'by', 'your', 'Lerxst', '----']
['From:', 'guykuo@carson.u.washington.edu', '(Guy', 'Kuo)', 'Subject:', 'SI', 'Clock', 'Poll', '

In [13]:
# only words from train dataset will be maintained
vocabulary = []
for idx_d in range(train_dataset_size):
  for w in documents_without_stop_words[idx_permute[idx_d]]:
    if w in word2id:
      vocabulary.append(w)
vocabulary = list(set(vocabulary))

**create word2id and id2word only with the vocabulary from train-dataset**

In [14]:
print(f'whole voca: {len(vocabulary)}')
print(f'unique voca: {len(set(vocabulary))}')
word2id = {}
id2word = {} 
for j, w in enumerate(vocabulary): 
  word2id[w] = j
  id2word[j] = w 
get_randoms(word2id,3)
get_randoms(id2word,3)

whole voca: 2033
unique voca: 2033
samples from dict: {'16': 0, '20': 1, 'forced': 2}
samples from dict: {0: '16', 1: '20', 2: 'forced'}


**saving docs-train, docs-test not by words, but by word-ids**

In [15]:
import numpy as np
from scipy import sparse

def _remove_empty_documents(documents):
    return [doc for doc in documents if doc != []]

In [16]:
docs_train = [[word2id[w] for w in documents_without_stop_words[idx_permute[idx_d]]
                if w in word2id] for idx_d in range(train_dataset_size)]
docs_test = [
    [word2id[w] for w in
        documents_without_stop_words[idx_permute[idx_d + train_dataset_size]]
      if w in word2id] for idx_d in range(test_dataset_size)]
for doc_train_by_word_ids in docs_train[:3]:
  print(doc_train_by_word_ids)
print(f'docs-train befor: {len(docs_train)}')
docs_train = _remove_empty_documents(docs_train)
docs_test = _remove_empty_documents(docs_test)
print(f'docs-train after empty removing: {len(docs_train)}')

# Remove test documents with length=1
docs_test = [doc for doc in docs_test if len(doc) > 1]
print(f'doc-test-len: {len(docs_test)}')

docs_test_h1 = []
docs_test_h2 = []
for doc in docs_test:
  doc_h1 = []
  doc_h2 = []
  for i,w in enumerate(doc):
    if i<=len(doc)/2.0-1:
      doc_h1.append(w)
    if i>len(doc)/2.0-1:
      doc_h2.append(w)

  if len(doc_h1)!=0:
    docs_test_h1.append(doc_h1)
  if len(doc_h2)!=0:
    docs_test_h2.append(doc_h2)

print(f'doc-test-h1-len: {len(docs_test_h1)}')
print(f'doc-test-h2-len: {len(docs_test_h2)}')

docs_test_h1 = [[w for i,w in enumerate(doc) if i<=len(doc)/2.0-1] for doc in docs_test]
docs_test_h2 = [[w for i,w in enumerate(doc) if i>len(doc)/2.0-1] for doc in docs_test]
print(f'control: doc-test-h1-len: {len(docs_test_h1)}')
print(f'control: doc-test-h2-len: {len(docs_test_h2)}')

[62, 1288, 639, 329, 416, 191, 1288, 337, 676, 1750, 354, 520, 1438, 255, 1550, 205, 899, 132, 1457, 11, 885, 1752, 1265, 1998, 446, 1485, 565, 1704, 520, 649, 329, 996, 1288, 1255, 795, 676, 1485, 329, 1830, 649, 1136, 1358, 337, 1457, 1566, 1288, 176, 1011, 1130, 1442, 1481, 1665, 1485, 191, 1288, 875, 337, 62, 1379, 564, 649, 1281, 118, 1384, 1239, 292, 1594, 1442, 1277, 875, 129, 1358, 62, 1382]
[1468, 980, 1111, 1011, 1111, 1691, 1740, 1783, 1778, 924, 783, 980, 1545, 867, 1011, 1547, 154, 1442, 374, 1482, 354, 924, 1482, 1468, 906, 429, 1547, 351, 767, 354, 924, 76, 1482, 1517, 1011, 1111, 39, 354, 1053, 1778, 1355, 520, 1308, 924, 920, 1358, 1457, 1238, 859, 520, 1482, 1468, 1208, 1210, 1027, 354, 1907, 1011, 867, 313, 868, 1259, 128, 1242, 1358, 1429, 1545, 311, 1111, 1457, 1242, 1711, 859, 1011, 1545, 1100, 1308, 665, 1208, 1210, 1027, 883, 1895, 1847, 351, 285, 1138, 2029, 1603, 867, 354, 1801, 62, 70, 1011, 1840, 351, 285, 749, 351, 1711, 1648, 883, 1665, 1442, 1097, 859, 15

In [17]:
sorted_id_doc_train = sorted(docs_train[0])
print(f'SORTED-doc with word-ids {sorted_id_doc_train}')
doc_words = []
for word_id in sorted_id_doc_train:
  doc_words.append(id2word[word_id])
print(f'SORTED-doc with words {doc_words}')

print(f'doc with word-ids {docs_train[0]}')
doc_words = []
for word_id in docs_train[0]:
  doc_words.append(id2word[word_id])
print(f'doc with words {doc_words}')

# saving docs as word lists
doc_words_train = [[id2word[w] for w in doc] for doc in docs_train]
doc_words_test = [[id2word[w] for w in doc] for doc in docs_test]
for doc in doc_words_train[:3]:
  print(doc)

SORTED-doc with word-ids [11, 62, 62, 62, 118, 129, 132, 176, 191, 191, 205, 255, 292, 329, 329, 329, 337, 337, 337, 354, 416, 446, 520, 520, 564, 565, 639, 649, 649, 649, 676, 676, 795, 875, 875, 885, 899, 996, 1011, 1130, 1136, 1239, 1255, 1265, 1277, 1281, 1288, 1288, 1288, 1288, 1288, 1358, 1358, 1379, 1382, 1384, 1438, 1442, 1442, 1457, 1457, 1481, 1485, 1485, 1485, 1550, 1566, 1594, 1665, 1704, 1750, 1752, 1830, 1998]
SORTED-doc with words ['their', 'are', 'are', 'are', 'so', 'who', 'saying', 'out', 'where', 'where', 'regular', 'posts', 'caught', 'just', 'just', 'just', 'fans', 'fans', 'fans', 'it', 'wondering', 'win', 'with', 'with', 'doing', 'nothing', '24', 'little', 'little', 'little', 'mean', 'mean', 'stopped', 'those', 'those', 'team', 'season', 'why', 'on', 'course', 'fan', 'think', 'have', 'going', 'over', 'our', 'all', 'all', 'all', 'all', 'all', 'for', 'for', 'at', 'great', 'post', 'about', 'or', 'or', 'that', 'that', 'sure', 'see', 'see', 'see', 'during', 'maybe', 'now

In [18]:
# just saving all word-ids of train-dataset as a list, test-dataset as a list, test-halves as a list
def _create_list_words(documents):
    return [word for document in documents for word in document]

words_train = _create_list_words(docs_train)
words_test = _create_list_words(docs_test)
words_ts_h1 = _create_list_words(docs_test_h1)
words_ts_h2 = _create_list_words(docs_test_h2)

print(len(words_train))
print(len(set(words_train)))
print(len(words_test))
print(len(set(words_test)))
print(len(words_ts_h1))
print(len(set(words_ts_h1)))
print(len(words_ts_h2))

1536797
2033
383417
1962
190795
1906
192622


In [19]:
def _create_document_indices(documents):
    """
    aux = []
    for j, doc in enumerate(documents):
      doc_aux = []
      for i in range(len(doc)):
        doc_aux.append(j)
      aux.append(doc_aux)
    """
    aux = [[j for i in range(len(doc))] for j, doc in enumerate(documents)]
    return [int(x) for y in aux for x in y]

def _create_bow(document_indices, words, num_docs, vocab_size):
    return sparse.coo_matrix(
        ([1] *
         len(document_indices),
         (document_indices,
          words)),
        shape=(
            num_docs,
            vocab_size)).tocsr()

# create the doc-indices for word-trains, word-test ...
doc_indices_train = _create_document_indices(docs_train)
print(len(doc_indices_train))
doc_indices_test = _create_document_indices(docs_test)
doc_indices_test_h1 = _create_document_indices(docs_test_h1)
doc_indices_test_h2 = _create_document_indices(docs_test_h2)

1536797


In [20]:
n_docs_train = len(docs_train)
n_docs_test = len(docs_test)
n_docs_test_h1 = len(docs_test_h1)
n_docs_test_h2 = len(docs_test_h2)

In [21]:
bow_train = _create_bow(
    doc_indices_train,
    words_train,
    n_docs_train,
    len(vocabulary))
bow_test = _create_bow(
    doc_indices_test,
    words_test,
    n_docs_test,
    len(vocabulary))
bow_test_h1 = _create_bow(
    doc_indices_test_h1,
    words_ts_h1,
    n_docs_test_h1,
    len(vocabulary))
bow_test_h2 = _create_bow(
    doc_indices_test_h2,
    words_ts_h2,
    n_docs_test_h2,
    len(vocabulary))
bow_train

<15044x2033 sparse matrix of type '<class 'numpy.int64'>'
	with 918964 stored elements in Compressed Sparse Row format>

In [22]:
print(bow_train.shape)
print(type(bow_train))
df = pd.DataFrame(bow_train.toarray())
print(df.iloc[0].unique())
df

(15044, 2033)
<class 'scipy.sparse.csr.csr_matrix'>
[0 1 3 2 5]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15041,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15042,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
df = pd.DataFrame(bow_test.toarray())
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,3,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3753,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3754,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
def _split_bow(bow_in, num_docs):
    indices = [[w for w in bow_in[doc, :].indices] for doc in range(num_docs)]
    counts = [[c for c in bow_in[doc, :].data] for doc in range(num_docs)]
    return indices, counts

def _to_numpy_array(documents):
    return np.array([[np.array(doc) for doc in documents]],
                    dtype=object).squeeze()

bow_train_tokens, bow_train_counts = _split_bow(bow_train, n_docs_train)
bow_test_tokens, bow_test_counts = _split_bow(bow_test, n_docs_test)

bow_test_h1_tokens, bow_test_h1_counts = _split_bow(
    bow_test_h1, n_docs_test_h1)
bow_test_h2_tokens, bow_test_h2_counts = _split_bow(
    bow_test_h2, n_docs_test_h2)

train_dataset = {
    'tokens': _to_numpy_array(bow_train_tokens),
    'counts': _to_numpy_array(bow_train_counts),
}

test_dataset = {
    'test': {
        'tokens': _to_numpy_array(bow_test_tokens),
        'counts': _to_numpy_array(bow_test_counts),
    },
    'test1': {
        'tokens': _to_numpy_array(bow_test_h1_tokens),
        'counts': _to_numpy_array(bow_test_h1_counts),
    },
    'test2': {
        'tokens': _to_numpy_array(bow_test_h2_tokens),
        'counts': _to_numpy_array(bow_test_h2_counts),
    }
}

In [24]:
print(len(train_dataset['tokens']))
print(train_dataset['tokens'][0]) #word-ids in the document
print(train_dataset['counts'][0]) #word-count of the word in the document

15044
[  11   62  118  129  132  176  191  205  255  292  329  337  354  416
  446  520  564  565  639  649  676  795  875  885  899  996 1011 1130
 1136 1239 1255 1265 1277 1281 1288 1358 1379 1382 1384 1438 1442 1457
 1481 1485 1550 1566 1594 1665 1704 1750 1752 1830 1998]
[1 3 1 1 1 1 2 1 1 1 3 3 1 1 1 2 1 1 1 3 2 1 2 1 1 1 1 1 1 1 1 1 1 1 5 2 1
 1 1 1 2 2 1 3 1 1 1 1 1 1 1 1 1]


# **Test gensim corpora**

In [25]:
# turn our tokenized documents into a id <-> term dictionary
from gensim import corpora, models
# to use gensim.corpora documents must be splited to list of terms

splited_documents = [[word for word in document.split() if word not in vectorizer.stop_words_] for document in dataset]
for i in range(0,2):
  print(splited_documents[i])
  print("\n")

num_docs = len(splited_documents)
dictionary = corpora.Dictionary(splited_documents)
print("Before-----filter----------")
print(dictionary.num_pos)
print(dictionary.num_docs)
print(len(list(dictionary.token2id.keys())))

print("After------filter----------")
dictionary.filter_extremes(no_below = int(0.01 * num_docs), no_above = 0.75)
dictionary.id2token = { v:k for k, v in dictionary.token2id.items()}
get_randoms(dictionary.id2token, 3)
print(dictionary.num_pos)
print(dictionary.num_docs)
print(len(list(dictionary.token2id.keys())))
print(dictionary.dfs)

['From:', 'lerxst@wam.umd.edu', "(where's", 'my', 'thing)', 'Subject:', 'WHAT', 'car', 'this!?', 'Nntp-Posting-Host:', 'rac3.wam.umd.edu', 'Organization:', 'University', 'Maryland,', 'College', 'Park', 'Lines:', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'me', 'on', 'this', 'car', 'I', 'saw', 'other', 'day.', 'It', 'was', 'a', '2-door', 'car,', 'looked', 'be', 'late', '60s/', 'early', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'were', 'really', 'small.', 'In', 'addition,', 'front', 'was', 'separate', 'rest', 'body.', 'This', 'all', 'I', 'know.', 'If', 'anyone', 'can', 'a', 'model', 'name,', 'engine', 'specs,', 'years', 'production,', 'where', 'this', 'car', 'made,', 'history,', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'looking', 'car,', 'please', 'e-mail.', 'Thanks,', '-', 'IL', '----', 'brought', 'you', 'by', 'your', 'Lerxst', '----']


['From:', 'guykuo@carson.u.washington.edu', '(Guy', 'Kuo)', 'Subject:', 'SI', 'Clock', 'Poll',

In [26]:
!pip install -U embedded_topic_model



**Vocabulary will be updated to save only words, which are in the train dataset: vocabulary, word2id and id2word**

In [27]:
from embedded_topic_model.utils import preprocessing
vocabulary, train_dataset, _, = preprocessing.create_etm_datasets(
    documents, 
    min_df=0.01, 
    max_df=0.75, 
    train_size=0.85, 
)

In [28]:
train_dataset.keys()

dict_keys(['tokens', 'counts'])

In [29]:
for a in train_dataset['counts'][:10]:
  print(len(a))

26
29
59
12
6
24
17
46
42
151
