In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Load the 20 Newsgroups posts, asking scikit-learn to strip the header and
# footer text from every post.
newsgroups = fetch_20newsgroups(remove=('headers', 'footers'))

In [None]:
newsgroups.target_names

In [None]:
newsgroups.data[0]

In [None]:
newsgroups.target_names[newsgroups.target[0]]

In [None]:
# Translate the numeric label of the first post into its newsgroup name.
first_post_label = newsgroups.target[0]
origin = newsgroups.target_names[first_post_label]
print(f"The post at index 0 first appeared in the '{origin}' group")

In [None]:
len(newsgroups.data)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Word-count vectorizer configured with scikit-learn's built-in English
# stop-word list.
vectorizer = CountVectorizer(stop_words='english')

In [None]:
tf_matrix = vectorizer.fit_transform(newsgroups.data)

In [None]:
type(tf_matrix)

In [None]:
# Convert the vectorizer output into a dense NumPy array so it can be indexed
# like any other array.
# NOTE(review): this materialises one cell per (post, vocabulary word) pair and
# is likely very memory-hungry -- confirm it fits in RAM on the target machine.
tf_np_matrix = tf_matrix.toarray()

In [None]:
# Row 0 of the dense matrix holds the per-word counts of the first post.
tf_vector = tf_np_matrix[0]
# Column positions of the vocabulary words that occur at least once in it.
non_zero_indices = np.flatnonzero(tf_vector)
words = vectorizer.get_feature_names_out()
# Fancy indexing picks out the matching vocabulary entries in one step.
unique_words = words[non_zero_indices].tolist()
data = {
    'Word': unique_words,
    'Count': tf_vector[non_zero_indices],
}
df = pd.DataFrame(data)
df = df.sort_values(by='Count', ascending=False)
print(f"After stop-word deletion, {df.shape[0]} unique words remain.")

In [None]:
df.head(10)

In [None]:
df.Word

In [None]:
non_zero_indices

In [None]:
# Keep every post (row) but only the columns of words that occur in post 0.
sub_matrix = tf_np_matrix[:, non_zero_indices]

In [None]:
sub_matrix[0]

In [None]:
from sklearn.preprocessing import binarize

In [None]:
# Turn the counts into presence flags.
# NOTE(review): relies on binarize's default threshold -- presumably any
# count above 0 becomes 1; confirm against the scikit-learn docs.
binary_matrix = binarize(sub_matrix)

In [None]:
binary_matrix

In [None]:
binary_matrix.sum(axis=0).shape

In [None]:
# Column-wise sum of the presence flags: one total per word of post 0,
# taken over all posts.
unique_post_mentions = binary_matrix.sum(axis=0)

In [None]:
unique_post_mentions

In [None]:
# Compute the same per-word post totals twice -- once from the dense NumPy
# array and once directly from the vectorizer's matrix -- and print both so
# they can be compared by eye.
np_post_mentions = binarize(tf_np_matrix[:,non_zero_indices]).sum(axis=0)
csr_post_mentions = binarize(tf_matrix[:,non_zero_indices]).sum(axis=0)
print(f'NumPy matrix-generated counts:\n {np_post_mentions}\n')
print(f'CSR matrix-generated counts:\n {csr_post_mentions}')

In [None]:
# Divide each word's post total by the number of posts to get the fraction of
# the dataset that mentions the word.
dataset_size = len(newsgroups.data)
document_frequencies = unique_post_mentions / dataset_size

In [None]:
data = {'Word': unique_words,
'Count': tf_vector[non_zero_indices],
'Document Frequency': document_frequencies}

In [None]:
df = pd.DataFrame(data)

In [None]:
df

In [None]:
# Words whose document frequency exceeds 0.1, i.e. more than 10%.
df_common_words = df[df['Document Frequency'] > 0.1]

In [None]:
df_common_words.head(10)

In [None]:
# Highest count first; among equal counts, the lower document frequency
# (the rarer word) comes first.
df_sorted = df.sort_values(['Count','Document Frequency'], ascending=[False, True])

In [None]:
df_sorted.head(10)

In [None]:
# Reciprocal of the document frequency: the rarer the word, the larger the value.
inverse_document_frequencies = 1 / document_frequencies

In [None]:
df['IDF'] = inverse_document_frequencies

In [None]:
df['Combined'] = df.Count * df.IDF

In [None]:
df['Combined']

In [None]:
df_sorted = df.sort_values('Combined', ascending=False)

In [None]:
# Overwrite the earlier 'Combined' column: the IDF is now passed through a
# base-10 logarithm before it weights the count.
df['Combined'] = df.Count * np.log10(df.IDF)

In [None]:
df_sorted = df.sort_values('Combined', ascending=False)

In [None]:
df_sorted.head(10)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(newsgroups.data)

In [None]:
tfidf_matrix[0]

In [None]:
assert tfidf_matrix.shape == tf_matrix.shape

In [None]:
assert np.all(tfidf_vectorizer.get_feature_names_out() == words)

In [None]:
tfidf_np_matrix = tfidf_matrix.toarray()

In [None]:
# The TFIDF row of post 0 must be non-zero at exactly the same vocabulary
# positions as its raw count row.
tfidf_vector = tfidf_np_matrix[0]
tfidf_non_zero_indices = np.flatnonzero(tfidf_vector)
assert np.array_equal(tfidf_non_zero_indices,
non_zero_indices)

In [None]:
df['TFIDF'] = tfidf_vector[non_zero_indices]

In [None]:
df

In [None]:
# Ranking the words by the hand-built 'Combined' score and by scikit-learn's
# TFIDF value is expected to give the same word order.
# NOTE(review): exact order equality could break if words with tied scores are
# ordered differently by the two sorts -- confirm this holds for the data.
df_sorted_old = df.sort_values('Combined', ascending=False)
df_sorted_new = df.sort_values('TFIDF', ascending=False)
assert np.array_equal(df_sorted_old['Word'].values, df_sorted_new['Word'].values)

In [None]:
df_sorted_new.head(10)

In [None]:
np.linalg.norm(df.TFIDF.values)

In [None]:
tfidf_np_matrix @ tfidf_np_matrix[0]

In [None]:
# Dot product of every post's TFIDF row with the row of post 0.
# Computed explicitly instead of reading IPython's `_` (last cell output),
# which only holds the right value when the previous cell was the most recent
# one to produce output and is unavailable outside an interactive kernel.
cosine_similarities = tfidf_np_matrix @ tfidf_np_matrix[0]

In [None]:
np.argsort(cosine_similarities)[-2]

In [None]:
# argsort is ascending, so [-2] is the position with the second-largest value.
# NOTE(review): presumably [-1] is post 0 matching itself -- confirm that no
# other post ties with it.
most_similar_index = np.argsort(cosine_similarities)[-2]

In [None]:
similarity = cosine_similarities[most_similar_index]

In [None]:
# Show the text of the selected post together with its similarity score.
most_similar_post = newsgroups.data[most_similar_index]
print(f"The following post has a cosine similarity of {similarity:.2f} "
"with newsgroups.data[0]:\n")
print(most_similar_post)

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
truncated_svd = TruncatedSVD(n_components=100)

In [None]:
# Seed NumPy's global generator before fitting.
# NOTE(review): presumably this makes the decomposition reproducible because
# truncated_svd was created without an explicit random_state -- confirm.
np.random.seed(0)
# Fit on the TFIDF matrix and return its reduced representation.
shrunk_matrix = truncated_svd.fit_transform(tfidf_matrix)

In [None]:
shrunk_matrix.shape

In [None]:
magnitude = np.linalg.norm(shrunk_matrix[0])

In [None]:
magnitude

In [None]:
from sklearn.preprocessing import normalize

In [None]:
# Rescale the reduced matrix with scikit-learn's normalize defaults.
# NOTE(review): presumably each row ends up with unit length -- confirm
# against the normalize documentation.
shrunk_norm_matrix = normalize(shrunk_matrix)

In [None]:
shrunk_norm_matrix.shape

In [None]:
shrunk_norm_matrix @ shrunk_norm_matrix.T

In [None]:
# Pairwise dot products between all rows of the normalised, reduced matrix.
# Computed explicitly instead of reading IPython's `_` (last cell output),
# which silently holds a different value if cells run out of order or if `_`
# has been rebound, and does not exist outside an interactive kernel.
cosine_similarity_matrix = shrunk_norm_matrix @ shrunk_norm_matrix.T

In [None]:
cosine_similarity_matrix.shape

In [None]:
# Fixed seed so the same "random" post index is drawn on every run.
np.random.seed(1)
# Draw one integer in [0, dataset_size) to use as a post index.
index1 = np.random.randint(dataset_size)

In [None]:
# Sort the similarities of post index1 in ascending order and take the
# second-to-last position as its closest neighbour.
index2 = cosine_similarity_matrix[index1].argsort()[-2]
similarity = cosine_similarity_matrix[index1, index2]
print(f"The posts at indices {index1} and {index2} share a cosine " f"similarity of {similarity:.2f}")

In [None]:
print(newsgroups.data[index1].replace('\n\n', '\n'))

In [None]:
print(newsgroups.data[index2].replace('\n\n', '\n'))

In [None]:
from sklearn.cluster import DBSCAN
# Density-based clustering using cosine distance, with a neighbourhood radius
# of 0.4 and at least 50 samples per neighbourhood.
cluster_model = DBSCAN(eps=.4, min_samples=50, metric='cosine')
# Fit on the reduced (not re-normalised) matrix and get one label per post.
clusters = cluster_model.fit_predict(shrunk_matrix)

In [None]:
# Largest label plus one.
# NOTE(review): assumes labels are consecutive integers starting at 0 (with
# any noise points labelled below 0) -- confirm against the DBSCAN docs.
cluster_count = clusters.max() + 1

In [None]:
np.random.seed(0)
import time
from sklearn.cluster import KMeans, MiniBatchKMeans
# Fit both K-means variants on the same data and compare how long each takes.
times = []
k = 20
for KMeans_class in [KMeans, MiniBatchKMeans]:
    # perf_counter is the clock intended for measuring elapsed intervals;
    # time.time() follows the wall clock, which may be adjusted mid-run.
    start = time.perf_counter()
    KMeans_class(n_clusters=k).fit(shrunk_norm_matrix)
    end = time.perf_counter()
    times.append(end - start)
# times[0] belongs to KMeans and times[1] to MiniBatchKMeans.
running_time_ratio = times[0] / times[1]
print(f"Mini Batch K-means ran {running_time_ratio:.2f} times faster than regular K-means")

In [None]:
np.random.seed(0)
# Fit one mini-batch model for every K from 1 through 60 and keep its inertia.
k_values = range(1, 61)
inertia_values = [
    MiniBatchKMeans(n_clusters=k).fit(shrunk_norm_matrix).inertia_
    for k in k_values
]
plt.plot(k_values, inertia_values)
plt.xlabel('K')
plt.ylabel('Inertia')
# Black vertical marker at K = 20.
plt.axvline(x=20, color='k')
plt.grid(True)

In [None]:
np.random.seed(0)
# Cluster the normalised, reduced matrix into 20 groups.
cluster_model = KMeans(n_clusters=20)
clusters = cluster_model.fit_predict(shrunk_norm_matrix)
# One row per post: its position in the dataset and its cluster label.
# Note that this rebinds `df`, which earlier held the word table of post 0.
df = pd.DataFrame({'Index': range(dataset_size), 'Cluster': clusters})

In [None]:
df

In [None]:
# Rows whose cluster label equals the label given to the post at index 0.
df_car = df.loc[df['Cluster'] == clusters[0]]
cluster_size = len(df_car)
print(f"{cluster_size} posts cluster together with the car-themed post at index 0")

In [None]:
np.random.seed(1)
def get_post_category(index):
    """Return the newsgroup name of the post stored at position ``index``."""
    label_id = newsgroups.target[index]
    return newsgroups.target_names[label_id]

In [None]:
# Pick one post from the cluster at random and show its newsgroup and text.
random_index = np.random.choice(df_car.Index.values)
post_category = get_post_category(random_index)
print(f"This post appeared in the {post_category} discussion group:\n")
# Collapse double newlines so the post prints more compactly.
print(newsgroups.data[random_index].replace('\n\n', '\n'))

In [None]:
# Each comparison yields True/False, so summing them counts the posts of the
# cluster that come from rec.autos.
rec_autos_count = sum(
    get_post_category(index) == 'rec.autos'
    for index in df_car.Index.values
)
rec_autos_percent = 100 * rec_autos_count / cluster_size
print(f"{rec_autos_percent:.2f}% of posts within the cluster appeared "
      "in the rec.autos discussion group")

In [None]:
np.random.seed(1)
# Positions of the cluster's posts that did NOT come from rec.autos.
not_autos_indices = [index for index in df_car.Index.values
if get_post_category(index) != 'rec.autos']
# Show one of them, chosen at random, with its newsgroup and text.
random_index = np.random.choice(not_autos_indices)
post_category = get_post_category(random_index)
print(f"This post appeared in the {post_category} discussion group:\n")
print(newsgroups.data[random_index].replace('\n\n', '\n'))

In [None]:
def rank_words_by_tfidf(indices, word_list=words):
    """Rank words by their summed TFIDF value over a selection of posts.

    Parameters
    ----------
    indices
        Row positions in ``tfidf_matrix`` of the posts to aggregate.
    word_list
        Labels for the matrix columns; defaults to the ``words`` array that
        existed when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Word'`` and ``'Summed TFIDF'``, largest sum first.
    """
    selected_rows = tfidf_matrix[indices]
    column_totals = selected_rows.sum(axis=0)
    # Flatten the column totals into a plain 1-D array, one entry per word.
    summed_tfidf = np.asarray(column_totals).ravel()
    ranking = pd.DataFrame({'Word': word_list, 'Summed TFIDF': summed_tfidf})
    return ranking.sort_values('Summed TFIDF', ascending=False)

In [None]:
df_rank_words = rank_words_by_tfidf(not_autos_indices)

In [None]:
df_rank_words

In [None]:
df_ranked_words = rank_words_by_tfidf(df_car.Index.values)

In [None]:
df_ranked_words.head(10)

In [None]:
# Lay the 25 top-ranked words out on a 5x5 grid, one row of the grid at a
# time, with the font size scaled by each word's summed TFIDF.
grid_ticks = np.arange(0, 1, .2)
grid_points = [(x_coord, y_coord)
               for x_coord in grid_ticks
               for y_coord in grid_ticks]
for i, (x_coord, y_coord) in enumerate(grid_points):
    word, significance = df_ranked_words.iloc[i].values
    plt.text(y_coord, x_coord, word, fontsize=2 * significance)

In [None]:
from wordcloud import WordCloud
cloud_generator = WordCloud(random_state=1)

In [None]:
# Map each of the ten top-ranked words to its summed TFIDF and let the word
# cloud generator lay the words out from those scores.
words_to_score = {word: score for word, score in df_ranked_words[:10].values}
wordcloud_image = cloud_generator.fit_words(words_to_score)

In [None]:
plt.imshow(wordcloud_image)

In [None]:
wordcloud_image = WordCloud(random_state=1, background_color='white').fit_words(words_to_score)
plt.imshow(wordcloud_image)

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim -- it looks like
# a leftover duplicate; confirm whether it can be removed.
wordcloud_image = WordCloud(random_state=1, background_color='white').fit_words(words_to_score)
plt.imshow(wordcloud_image)

In [None]:
cloud_generator = WordCloud(background_color='white',
random_state=1)
wordcloud_image = cloud_generator.fit_words(words_to_score)
plt.imshow(wordcloud_image, interpolation="bilinear")

In [None]:
np.random.seed(1)
def cluster_to_image(df_cluster, max_words=15):
    """Build a word cloud from the top-ranked words of one cluster.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Rows belonging to a single cluster; the values of its ``Index``
        column are handed to ``rank_words_by_tfidf``.
    max_words : int, default 15
        Number of top-ranked words placed in the cloud.

    Returns
    -------
    The object returned by ``WordCloud.fit_words``.
    """
    indices = df_cluster.Index.values
    # Trim the ranking once; the table was previously sliced to max_words twice.
    df_ranked_words = rank_words_by_tfidf(indices)[:max_words]
    words_to_score = {word: score for word, score in df_ranked_words.values}
    cloud_generator = WordCloud(background_color='white',
                                color_func=_color_func,
                                random_state=1)
    return cloud_generator.fit_words(words_to_score)
def _color_func(*args, **kwargs):
    return np.random.choice(['black', 'blue', 'teal', 'purple', 'brown'])
# Draw a cluster label in [0, 20) -- 20 matches the n_clusters used for KMeans.
cluster_id = np.random.randint(0, 20)
df_random_cluster = df[df.Cluster == cluster_id]
# Render that cluster's top words as a word cloud.
wordcloud_image = cluster_to_image(df_random_cluster)
plt.imshow(wordcloud_image, interpolation="bilinear")

In [None]:
from collections import Counter
def get_top_category(df_cluster):
    """Return the newsgroup name that occurs most often in a cluster.

    Parameters
    ----------
    df_cluster : pandas.DataFrame
        Rows of one cluster; each value in its ``Index`` column is resolved
        to a newsgroup name with ``get_post_category``.

    Returns
    -------
    The most frequent newsgroup name among those posts.

    Raises
    ------
    IndexError
        If ``df_cluster`` has no rows.
    """
    category_counts = Counter(
        get_post_category(index) for index in df_cluster.Index.values
    )
    # Only the leader is needed, so request a single entry rather than a
    # ranking of every category.
    top_category, _ = category_counts.most_common(1)[0]
    return top_category
top_category = get_top_category(df_random_cluster)
print("The posts within the cluster commonly appear in the "
f"'{top_category}' newsgroup")

In [None]:
figure, axes = plt.subplots(2, 2)
# The sample points are the same for every panel.
x = np.arange(0, 1, .2)
# Visit the four panels row by row; panel (r, c) shows r*x^2 + c*x.
for r, c in np.ndindex(2, 2):
    y = r*x*x + c*x
    axes[r][c].plot(x, y)

In [None]:
figure, axes = plt.subplots(2, 2)
for r in range(2):
    for c in range(2):
        panel = axes[r, c]
        if (r, c) != (1, 0):
            # Every panel except the lower-left one keeps the curve r*x^2 + c*x.
            x = np.arange(0, 1, .2)
            y = r * x * x + c * x
            panel.plot(x, y)
            continue
        # The lower-left panel shows the word cloud, titled with the
        # cluster's most common newsgroup.
        panel.set_title(top_category)
        panel.imshow(wordcloud_image, interpolation="bilinear")
In [None]:
np.random.seed(0)
def get_title(df_cluster):
    """Compose a panel heading of the form ``"<cluster id>: <top newsgroup>"``.

    The cluster id is read from the first row of ``df_cluster``'s ``Cluster``
    column; the newsgroup comes from ``get_top_category``.
    """
    top_category = get_top_category(df_cluster)
    cluster_id = df_cluster.Cluster.iloc[0]
    return f"{cluster_id}: {top_category}"
figure, axes = plt.subplots(5, 4, figsize=(20, 15))
# Walk the 5x4 grid row by row and pair each panel with the next cluster
# group (groups arrive in ascending label order). zip stops at the shorter
# sequence, so a short group list leaves panels empty instead of raising the
# IndexError that repeated list.pop(0) would. The group key gets a real name
# because `_` is IPython's last-output variable and should not be rebound at
# notebook top level.
for ax, (group_key, df_cluster) in zip(axes.ravel(), df.groupby('Cluster')):
    wordcloud_image = cluster_to_image(df_cluster)
    ax.imshow(wordcloud_image, interpolation="bilinear")
    ax.set_title(get_title(df_cluster))
    # Tick marks carry no meaning for an image panel.
    ax.set_xticks([])
    ax.set_yticks([])


In [None]:
np.random.seed(3)
# Look at cluster 7 again, this time skipping its ten highest-ranked words
# and plotting the fifteen that follow.
df_cluster = df[df.Cluster == 7]
df_ranked_words = rank_words_by_tfidf(df_cluster.Index.values)
words_to_score = dict(df_ranked_words.iloc[10:25].values)
cloud_generator = WordCloud(
    background_color='white',
    color_func=_color_func,
    random_state=1,
)
wordcloud_image = cloud_generator.fit_words(words_to_score)
plt.imshow(wordcloud_image, interpolation="bilinear")
plt.title(get_title(df_cluster), fontsize=20)
# Hide both axes' tick marks; they carry no meaning for an image.
plt.xticks([])
plt.yticks([])