In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load the 20 Newsgroups posts through scikit-learn, asking the loader to
# strip the 'headers' and 'footers' parts of each post.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers'))

In [None]:
# Show the newsgroup names that the numeric labels in `newsgroups.target` index into.
newsgroups.target_names

In [None]:
# Show the post stored at index 0.
newsgroups.data[0]

In [None]:
# Translate the numeric label of the post at index 0 into its newsgroup name.
newsgroups.target_names[newsgroups.target[0]]

In [None]:
# Resolve the numeric label of the first post to a newsgroup name, then
# report it in a readable sentence.
first_post_label = newsgroups.target[0]
origin = newsgroups.target_names[first_post_label]
print(f"The post at index 0 first appeared in the '{origin}' group")

In [None]:
# Number of posts in the loaded dataset.
len(newsgroups.data)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Word-count vectorizer configured to drop scikit-learn's built-in English
# stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [None]:
# Fit the vectorizer on every post and build the term-count matrix in one
# step; later cells index the result as [post, word].
tf_matrix = vectorizer.fit_transform(newsgroups.data)

In [None]:
# Inspect which matrix class the vectorizer returned.
type(tf_matrix)

In [None]:
# Convert the term-count matrix into a dense NumPy array.
# NOTE(review): this materializes every (post, word) cell, zeros included, so
# it can need a very large amount of memory when the vocabulary is big --
# confirm it fits before running, or work on `tf_matrix` directly as the
# later CSR-based computation does.
tf_np_matrix = tf_matrix.toarray()

In [None]:
# Build a word/count table for the post at index 0.
tf_vector = tf_np_matrix[0]

# Positions of the vocabulary entries that actually occur in this post.
non_zero_indices = np.flatnonzero(tf_vector)

# Map those positions back to the words themselves (the vocabulary is an
# array, so one fancy-index lookup replaces a per-index loop).
words = vectorizer.get_feature_names_out()
unique_words = list(words[non_zero_indices])

data = {
    'Word': unique_words,
    'Count': tf_vector[non_zero_indices],
}

# Most frequent words first.
df = pd.DataFrame(data).sort_values(by='Count', ascending=False)
print(f"After stop-word deletion, {df.shape[0]} unique words remain.")

In [None]:
# First ten rows of the word table (sorted by 'Count', highest first, when it
# was built).
df.head(10)

In [None]:
# Show the 'Word' column on its own; bracket access works for any column
# label and cannot collide with a DataFrame attribute.
df['Word']

In [None]:
# Column indices of the words that occur in the post at index 0.
non_zero_indices

In [None]:
# Keep every row (post) but only the columns of the words found in post 0.
sub_matrix = tf_np_matrix[:, non_zero_indices]

In [None]:
# First row of the reduced matrix: the counts for post 0 itself, which are all
# non-zero because the columns were selected from this very row.
sub_matrix[0]

In [None]:
from sklearn.preprocessing import binarize

In [None]:
# Binarize the counts with scikit-learn's default settings, so each cell
# records whether a word appears in a post rather than how often.
binary_matrix = binarize(sub_matrix)

In [None]:
# Inspect the binarized matrix.
binary_matrix

In [None]:
# Summing over axis 0 collapses the post dimension; check the shape that is left.
binary_matrix.sum(axis=0).shape

In [None]:
# Column-wise total of the binarized matrix: for each word, the number of
# posts in which it appears.
unique_post_mentions = binary_matrix.sum(axis=0)

In [None]:
# Inspect the per-word post counts.
unique_post_mentions

In [None]:
# Count, for every word of the first post, how many posts mention it -- once
# from the dense array and once straight from the sparse matrix.
np_post_mentions = binarize(tf_np_matrix[:,non_zero_indices]).sum(axis=0)
csr_post_mentions = binarize(tf_matrix[:,non_zero_indices]).sum(axis=0)

# The point of this cell is that both routes agree, so check it explicitly
# rather than relying on the (truncated) printed arrays. The sparse sum may
# come back two-dimensional (1 x n), hence the flattening before comparing.
assert np.array_equal(np_post_mentions,
                      np.asarray(csr_post_mentions).ravel()), \
    'Dense and sparse post-mention counts disagree'

print(f'NumPy matrix-generated counts:\n {np_post_mentions}\n')
print(f'CSR matrix-generated counts:\n {csr_post_mentions}')

In [None]:
# Turn the per-word post counts into fractions of the whole dataset.
dataset_size = len(newsgroups.data)
document_frequencies = np.divide(unique_post_mentions, dataset_size)

In [None]:
# Column data for the extended word table: the word, its count in the post at
# index 0, and the fraction of all posts that mention it.
data = {
    'Word': unique_words,
    'Count': tf_vector[non_zero_indices],
    'Document Frequency': document_frequencies,
}

In [None]:
# Build the word table from the column dictionary.
# NOTE(review): this rebinds `df`, which previously held the table sorted by
# 'Count'; cells that displayed the earlier frame will show this unsorted one
# if re-run after this point.
df = pd.DataFrame(data)

In [None]:
# Display the word table, including its 'Document Frequency' column.
df

In [None]:
# Keep only the words that show up in more than 10% of all posts.
df_common_words = df.loc[df['Document Frequency'] > 0.1]

In [None]:
# First ten of the words whose document frequency exceeds 0.1.
df_common_words.head(10)

In [None]:
# Rank words by how often they occur in the post (highest first); among equal
# counts, put the words with the lower document frequency first.
df_sorted = df.sort_values(
    by=['Count', 'Document Frequency'],
    ascending=[False, True],
)

In [None]:
# Top ten rows after sorting by 'Count' (descending) and then by
# 'Document Frequency' (ascending).
df_sorted.head(10)