# **Load text data from the given source**

In [None]:
# Download the 20 Newsgroups corpus and merge both splits into a single list of
# raw document strings, the form the ETM preprocessing below works on.
from sklearn.datasets import fetch_20newsgroups
train_data = fetch_20newsgroups(subset='train').data
test_data = fetch_20newsgroups(subset='test').data
# Concatenate into a NEW list. Binding `documents = train_data` and calling
# `.extend()` would append the test posts to `train_data` as well, because both
# names would refer to the same list object.
documents = train_data + test_data
print(f'Number of documents {len(documents)}')

Number of documents 18846


# **Sample texts from 20 News Groups Dataset**

In [None]:
# Show the first two raw posts, each followed by a 100-character "=" rule.
separator = "=" * 100
for doc_index in (0, 1):
  sample_text = documents[doc_index]
  print(sample_text)
  print(separator)

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: University of Washington
Lines: 11
NNTP-Posting-Host: carson.u.wa

# **Preprocessing and Converting**


1.   Preprocessing: 

  *   stopwords, punctuation, words filter by max-df, min-df
  *   lemmatization, lowercasing
  *   remove empty documents after preprocessing

2.   Convert Text to Word-Embedding-Representation (BOW-dict)
3.   Split the numeric representation into two parts of the dataset (train and test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import scipy.sparse
import pandas as pd

Count-Matrix `vectorized_documents`:
- Column: Vocabulary (word of V)
- Row: Document
- Each element $x_{ij}$ is the frequency of word $j$ in document $i$

In [None]:
# Shrink the vocabulary with CountVectorizer's document-frequency bounds.
# Given as floats, min_df / max_df are proportions of documents: terms found in
# fewer than 1% or in more than 75% of the documents are dropped.
min_df = 0.01
max_df = 0.75
dataset = documents
vectorizer = CountVectorizer(max_df=max_df, min_df=min_df)
vectorized_documents = vectorizer.fit_transform(dataset)
# Document-term count matrix: one row per document, one column per kept term
# (2130 columns in the output shown below).
pd.DataFrame.sparse.from_spmatrix(vectorized_documents)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2120,2121,2122,2123,2124,2125,2126,2127,2128,2129
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18841,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,0,1,0,0,0
18842,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
18843,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
18844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Inspect the fitted vectorizer: the term -> column-index mapping, the terms in
# column order, and the terms that the min_df / max_df pruning discarded.
# Note: vocabulary_ stores column indices, not frequencies.
print(vectorizer.vocabulary_)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases;
# .tolist() keeps the printed value a plain list of str, as before.
if hasattr(vectorizer, "get_feature_names_out"):
  feature_names = vectorizer.get_feature_names_out().tolist()
else:
  feature_names = vectorizer.get_feature_names()
print(feature_names)
print(vectorizer.stop_words_)



In [None]:
# Whitespace-tokenize every raw document and drop each token that appears in
# the set of terms pruned by the vectorizer (vectorizer.stop_words_).
# NOTE(review): tokens come from str.split(), so they keep their original case
# and attached punctuation (e.g. 'From:', 'car,'); such tokens presumably never
# match an entry of stop_words_ -- confirm this is the intended filtering.
documents_without_stop_words = []
for document in dataset:
  kept_words = [token for token in document.split()
                if token not in vectorizer.stop_words_]
  documents_without_stop_words.append(kept_words)
for doc_index in (0, 1):
  print(documents_without_stop_words[doc_index])
  print("\n")

['From:', 'lerxst@wam.umd.edu', "(where's", 'my', 'thing)', 'Subject:', 'WHAT', 'car', 'this!?', 'Nntp-Posting-Host:', 'rac3.wam.umd.edu', 'Organization:', 'University', 'Maryland,', 'College', 'Park', 'Lines:', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'me', 'on', 'this', 'car', 'I', 'saw', 'other', 'day.', 'It', 'was', 'a', '2-door', 'car,', 'looked', 'be', 'late', '60s/', 'early', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'were', 'really', 'small.', 'In', 'addition,', 'front', 'was', 'separate', 'rest', 'body.', 'This', 'all', 'I', 'know.', 'If', 'anyone', 'can', 'a', 'model', 'name,', 'engine', 'specs,', 'years', 'production,', 'where', 'this', 'car', 'made,', 'history,', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'looking', 'car,', 'please', 'e-mail.', 'Thanks,', '-', 'IL', '----', 'brought', 'you', 'by', 'your', 'Lerxst', '----']


['From:', 'guykuo@carson.u.washington.edu', '(Guy', 'Kuo)', 'Subject:', 'SI', 'Clock', 'Poll',

In [None]:
# Binarize the count matrix: sign() maps every positive count to 1 and keeps
# zeros at 0, so each entry only records whether a word occurs in a document.
signed_documents = vectorized_documents.sign()
signed_docs_as_pd = pd.DataFrame.sparse.from_spmatrix(signed_documents)

# Row 0 has one entry per vocabulary word; its sum is the number of distinct
# vocabulary words present in the first document.
first_row = signed_docs_as_pd.iloc[0]
print(len(first_row))
print(f'sum in the row: {sum(first_row)}')

# Column 0 has one entry per document; its sum is the number of documents
# that contain the first vocabulary word.
first_column = signed_docs_as_pd[0]
print(len(first_column))
print(f'sum in the colum: {sum(first_column)}')

2130
sum in the row: 64
18846
sum in the colum: 748


In [None]:
# Document frequency of every vocabulary word: summing the binarized matrix
# over axis 0 counts, per column, the documents that contain that word
# (the "sum in the column" printed above, for all columns at once).
sum_counts = signed_documents.sum(axis=0)
print(sum_counts)
v_size = sum_counts.shape[1]
# Copy the 1 x V result into a flat integer array of length V.
sum_counts_np = np.array(sum_counts, dtype=int).reshape(v_size)
sum_counts_np.shape

[[748 690 348 ... 297 576 206]]


(2130,)

# **Prepare Vocabulary**

In [None]:
import random
def get_randoms(mdict, n_samples):
  """Print a small sample of `mdict` for inspection.

  Despite the name, the sample is not random: it consists of the first
  `n_samples` keys in the dict's iteration order, with their values.

  Args:
    mdict: the dictionary to sample from.
    n_samples: how many leading entries to show.

  Returns:
    None; the sample is only printed.
  """
  leading_keys = list(mdict)[:n_samples]
  sample_d = {}
  for key in leading_keys:
    sample_d[key] = mdict[key]
  print("samples from dict: {}".format(sample_d))

# Build both lookup directions from the fitted vocabulary:
# word2id maps term -> column index, id2word is the inverse mapping.
word2id = dict(vectorizer.vocabulary_)
id2word = {token_id: token for token, token_id in word2id.items()}
get_randoms(word2id, 3)
get_randoms(id2word, 3)

samples from dict: {'umd': 1973, 'edu': 676, 'where': 2070}
samples from dict: {1973: 'umd', 676: 'edu', 2070: 'where'}


In [None]:
# Order the vocabulary by ascending document frequency: argsort returns the
# column indices from the rarest to the most widespread word, and id2word
# turns each index back into its term.
idx_sort = np.argsort(sum_counts_np)
print(idx_sort)
vocabulary = [id2word[word_id] for word_id in idx_sort]
vocabulary[:10]

[1595 1400  260 ... 1894 1030  815]


['reliable',
 'papers',
 'attacks',
 'covered',
 'ex',
 'enjoy',
 'stories',
 'scheme',
 'wayne',
 'coverage']

# **Test gensim corpora**

In [None]:
# turn our tokenized documents into a id <-> term dictionary
from gensim import corpora, models
# to use gensim.corpora, each document must be split into a list of terms

# Whitespace-tokenize every document, skipping tokens that are in the
# vectorizer's pruned-term set, to get the list-of-terms input gensim expects.
splited_documents = []
for document in dataset:
  kept_terms = [word for word in document.split() if word not in vectorizer.stop_words_]
  splited_documents.append(kept_terms)
for doc_index in (0, 1):
  print(splited_documents[doc_index])
  print("\n")

num_docs = len(splited_documents)
dictionary = corpora.Dictionary(splited_documents)

# Corpus statistics before pruning: total token positions, number of
# documents, and number of distinct tokens.
print("Before-----filter----------")
for stat in (dictionary.num_pos, dictionary.num_docs, len(dictionary.token2id)):
  print(stat)

# Same bounds as the CountVectorizer above: keep tokens that occur in at least
# 1% of the documents (as an absolute count) and in at most 75% of them.
print("After------filter----------")
min_doc_count = int(0.01 * num_docs)
dictionary.filter_extremes(no_below=min_doc_count, no_above=0.75)
# Rebuild the id -> token view from the filtered token -> id mapping.
dictionary.id2token = {token_id: token for token, token_id in dictionary.token2id.items()}
get_randoms(dictionary.id2token, 3)
for stat in (dictionary.num_pos, dictionary.num_docs, len(dictionary.token2id)):
  print(stat)
print(dictionary.dfs)

['From:', 'lerxst@wam.umd.edu', "(where's", 'my', 'thing)', 'Subject:', 'WHAT', 'car', 'this!?', 'Nntp-Posting-Host:', 'rac3.wam.umd.edu', 'Organization:', 'University', 'Maryland,', 'College', 'Park', 'Lines:', '15', 'I', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'me', 'on', 'this', 'car', 'I', 'saw', 'other', 'day.', 'It', 'was', 'a', '2-door', 'car,', 'looked', 'be', 'late', '60s/', 'early', '70s.', 'It', 'was', 'called', 'a', 'Bricklin.', 'The', 'were', 'really', 'small.', 'In', 'addition,', 'front', 'was', 'separate', 'rest', 'body.', 'This', 'all', 'I', 'know.', 'If', 'anyone', 'can', 'a', 'model', 'name,', 'engine', 'specs,', 'years', 'production,', 'where', 'this', 'car', 'made,', 'history,', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'looking', 'car,', 'please', 'e-mail.', 'Thanks,', '-', 'IL', '----', 'brought', 'you', 'by', 'your', 'Lerxst', '----']


['From:', 'guykuo@carson.u.washington.edu', '(Guy', 'Kuo)', 'Subject:', 'SI', 'Clock', 'Poll',

In [None]:
# Sizes for an 80/20 train/test partition of the documents, plus a shuffled
# order of document indices to draw the two parts from.
train_size = 0.8
num_docs = signed_documents.shape[0]
train_dataset_size = int(np.floor(train_size * num_docs))
test_dataset_size = int(num_docs - train_dataset_size)
# TODO: no validation set is carved out yet.
# Use a seeded, local generator so the shuffled order (and therefore the split)
# is identical on every Restart & Run All, without touching NumPy's global
# random state.
split_seed = 42
idx_permute = np.random.RandomState(split_seed).permutation(num_docs).astype(int)
print(train_dataset_size)
print(test_dataset_size)

15076
3770


**The vocabulary is then restricted to the words that occur in the training set; `vocabulary`, `word2id` and `id2word` are updated accordingly.**