In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Fetch the 20 Newsgroups posts, asking the loader to remove each post's
# headers and footers.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers'))

In [None]:
# Category names; the integer values in `newsgroups.target` index into this list.
newsgroups.target_names

In [None]:
# Raw text of the first post.
newsgroups.data[0]

In [None]:
# Category name of the first post, looked up through its integer target.
newsgroups.target_names[newsgroups.target[0]]

In [None]:
# Same lookup as the previous cell, kept in a variable for the message below.
origin = newsgroups.target_names[newsgroups.target[0]]
print(f"The post at index 0 first appeared in the '{origin}' group")

In [None]:
# Number of posts that were loaded.
len(newsgroups.data)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Word-count vectorizer configured with the built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [None]:
# Learn the vocabulary from every post and build the post-by-word count matrix.
tf_matrix = vectorizer.fit_transform(newsgroups.data)

In [None]:
# Check what kind of matrix object the vectorizer returned.
type(tf_matrix)

In [None]:
# Dense array version of the count matrix.
# NOTE(review): this materializes every zero entry, so memory grows with
# (number of posts) x (vocabulary size) -- confirm it fits in RAM.
tf_np_matrix = tf_matrix.toarray()

In [None]:
# Build a word/count table for post 0, restricted to the words it actually
# contains. Every name assigned here is reused by later cells.
words = vectorizer.get_feature_names_out()
tf_vector = tf_np_matrix[0]

# Column positions in the vocabulary where post 0 has a non-zero count.
non_zero_indices = np.flatnonzero(tf_vector)
unique_words = list(words[non_zero_indices])

data = {
    'Word': unique_words,
    'Count': tf_vector[non_zero_indices],
}
df = pd.DataFrame(data)
df = df.sort_values('Count', ascending=False)
print(f"After stop-word deletion, {df.shape[0]} unique words remain.")

In [None]:
# Ten most frequent words in post 0 (df is sorted by Count, descending).
df.head(10)

In [None]:
df.Word

In [None]:
# Vocabulary column positions of the words that occur in post 0.
non_zero_indices

In [None]:
# Keep every post (row) but only the columns for words present in post 0.
sub_matrix = tf_np_matrix[:, non_zero_indices]

In [None]:
# Row 0 of the sub-matrix: post 0's counts for its own words.
sub_matrix[0]

In [None]:
from sklearn.preprocessing import binarize

In [None]:
# Binarize the counts with scikit-learn's binarize (default threshold), so a
# cell records whether a word occurs in a post rather than how often.
binary_matrix = binarize(sub_matrix)

In [None]:
binary_matrix

In [None]:
# Summing along axis 0 collapses the rows, leaving one total per word column.
binary_matrix.sum(axis=0).shape

In [None]:
# Per word: the number of posts that mention it at least once.
unique_post_mentions = binary_matrix.sum(axis=0)

In [None]:
unique_post_mentions

In [None]:
# Compute the per-word post counts twice -- once from the dense array and once
# from the original sparse matrix -- and print both for comparison.
dense_columns = tf_np_matrix[:, non_zero_indices]
sparse_columns = tf_matrix[:, non_zero_indices]

np_post_mentions = binarize(dense_columns).sum(axis=0)
csr_post_mentions = binarize(sparse_columns).sum(axis=0)

print(f'NumPy matrix-generated counts:\n {np_post_mentions}\n')
print(f'CSR matrix-generated counts:\n {csr_post_mentions}')

In [None]:
dataset_size = len(newsgroups.data)
# Fraction of all posts in which each of post 0's words appears.
document_frequencies = unique_post_mentions / dataset_size

In [None]:
# Column data for a fresh table; every column follows the order of
# non_zero_indices, so the rows line up word by word.
data = {'Word': unique_words,
'Count': tf_vector[non_zero_indices],
'Document Frequency': document_frequencies}

In [None]:
# Rebinds `df`: the earlier count-sorted table is replaced by this unsorted one.
df = pd.DataFrame(data)

In [None]:
df

In [None]:
# Words of post 0 that occur in more than 10% of all posts.
df_common_words = df[df['Document Frequency'] > 0.1]

In [None]:
df_common_words.head(10)

In [None]:
# Rank by highest count first; among equal counts, the lower document
# frequency comes first.
df_sorted = df.sort_values(['Count','Document Frequency'], ascending=[False, True])

In [None]:
df_sorted.head(10)

In [None]:
# Inverse document frequency: the reciprocal of the fraction of posts that
# mention each word.
inverse_document_frequencies = 1 / document_frequencies

In [None]:
df['IDF'] = inverse_document_frequencies

In [None]:
# First version of the combined score: raw count scaled by the raw IDF.
df['Combined'] = df.Count * df.IDF

In [None]:
df['Combined']

In [None]:
# NOTE(review): this ranking is never displayed; df_sorted is rebound two
# cells below after the score is redefined.
df_sorted = df.sort_values('Combined', ascending=False)

In [None]:
# Overwrites the 'Combined' column: the IDF now goes through a base-10 log
# before being multiplied by the count.
df['Combined'] = df.Count * np.log10(df.IDF)

In [None]:
# Re-rank using the log-scaled score.
df_sorted = df.sort_values('Combined', ascending=False)

In [None]:
df_sorted.head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer using the same English stop-word setting as the count
# vectorizer.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(newsgroups.data)

In [None]:
tfidf_matrix[0]

In [None]:
# Sanity check: same dimensions as the count matrix.
assert tfidf_matrix.shape == tf_matrix.shape

In [None]:
# Sanity check: both vectorizers produced the same vocabulary in the same order.
assert np.all(tfidf_vectorizer.get_feature_names_out() == words)

In [None]:
# Dense array version of the TF-IDF matrix.
# NOTE(review): like the dense count matrix, this stores every zero entry --
# confirm the memory cost is acceptable.
tfidf_np_matrix = tfidf_matrix.toarray()

In [None]:
tfidf_vector = tfidf_np_matrix[0]
tfidf_non_zero_indices = np.flatnonzero(tfidf_vector)
# Post 0 must have non-zero TF-IDF weights at exactly the columns where it
# has non-zero counts.
assert np.array_equal(tfidf_non_zero_indices,
non_zero_indices)

In [None]:
# Attach post 0's TF-IDF weights; indexing with non_zero_indices keeps them in
# the same word order as the rows of df.
df['TFIDF'] = tfidf_vector[non_zero_indices]

In [None]:
df

In [None]:
df_sorted_old = df.sort_values('Combined', ascending=False)
df_sorted_new = df.sort_values('TFIDF', ascending=False)
# Requires the hand-built 'Combined' score and the library TF-IDF weights to
# rank post 0's words identically.
# NOTE(review): the two scores are computed differently, and words with equal
# scores may be ordered arbitrarily by sort_values -- confirm this assertion
# holds with the installed pandas / scikit-learn versions.
assert np.array_equal(df_sorted_old['Word'].values, df_sorted_new['Word'].values)

In [None]:
df_sorted_new.head(10)

In [None]:
# Euclidean norm of post 0's TF-IDF weights.
np.linalg.norm(df.TFIDF.values)

In [None]:
tfidf_np_matrix @ tfidf_np_matrix[0]

In [None]:
cosine_similarities = _

In [None]:
np.argsort(cosine_similarities)[-2]

In [None]:
most_similar_index = np.argsort(cosine_similarities)[-2]

In [None]:
similarity = cosine_similarities[most_similar_index]

In [None]:
# Report the similarity score, then show the text of the closest post.
most_similar_post = newsgroups.data[most_similar_index]
header = (f"The following post has a cosine similarity of {similarity:.2f} "
          "with newsgroups.data[0]:\n")
print(header)
print(most_similar_post)

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Reduce the TF-IDF space to 100 components.
# random_state=0 pins the solver's seed on the estimator itself, so the
# decomposition is repeatable every time this estimator is fitted instead of
# depending on the state of NumPy's global generator at fit time.
truncated_svd = TruncatedSVD(n_components=100, random_state=0)

In [None]:
# Seed NumPy's global random generator immediately before fitting.
# NOTE(review): presumably this is what makes the decomposition repeatable
# from run to run -- confirm.
np.random.seed(0)
shrunk_matrix = truncated_svd.fit_transform(tfidf_matrix)

In [None]:
shrunk_matrix.shape

In [None]:
# Length of post 0's vector in the reduced space.
magnitude = np.linalg.norm(shrunk_matrix[0])

In [None]:
magnitude

In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Rescale the reduced vectors with scikit-learn's normalize, using its
# default settings.
shrunk_norm_matrix = normalize(shrunk_matrix)

In [None]:
shrunk_norm_matrix.shape

In [None]:
shrunk_norm_matrix @ shrunk_norm_matrix.T

In [None]:
cosine_similarity_matrix = _

In [None]:
cosine_similarity_matrix.shape

In [None]:
# Draw one post index at random; the seed is fixed so the same post is
# selected on every run.
np.random.seed(1)
index1 = np.random.randint(dataset_size)

In [None]:
# Rank all posts by similarity to post `index1`, most similar first, then
# take the best-ranked post that is not `index1` itself.
#
# The post is excluded explicitly instead of assuming it occupies the last
# argsort slot: with duplicates or floating-point rounding its
# self-similarity is not guaranteed to be the strict maximum, and taking the
# second-to-last entry could then return `index1` as its own nearest neighbor.
ranked_neighbors = np.argsort(cosine_similarity_matrix[index1])[::-1]
index2 = ranked_neighbors[ranked_neighbors != index1][0]
similarity = cosine_similarity_matrix[index1][index2]
print(f"The posts at indices {index1} and {index2} share a cosine " f"similarity of {similarity:.2f}")

In [None]:
# Print the randomly chosen post; each pair of consecutive newlines is
# replaced by a single newline to make the output more compact.
print(newsgroups.data[index1].replace('\n\n', '\n'))

In [None]:
# Print its nearest neighbor with the same newline compaction.
print(newsgroups.data[index2].replace('\n\n', '\n'))