# Comparison of PCA+DBSCAN to UMAP

This can be [run in Google Colab](https://colab.research.google.com/github/jreades/ph-word-embeddings/blob/main/Comparison_to_PCA.ipynb).

In [None]:
import pandas as pd
import numpy as np
import pickle
import math
import re
import os

from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

import seaborn as sns

In [None]:
# Load spaCy's large English pipeline as `nlp`, downloading the model
# package first if it cannot be imported/loaded.
import spacy
try:
    import en_core_web_lg
    nlp = en_core_web_lg.load()
# Several exception types must be written as a parenthesised tuple: the bare
# `except A, B:` form is a SyntaxError on Python 3 releases prior to 3.14.
except (ModuleNotFoundError, OSError):
    import spacy.cli
    spacy.cli.download("en_core_web_lg")
    import en_core_web_lg
    nlp = en_core_web_lg.load()

In [None]:
try:
    import umap
except ModuleNotFoundError:
    !pip install umap-learn
    import umap

### Process Tutorial Data 

For demonstration purposes, here we pick up with Part 2 of the ['Clustering with sklearn'](https://programminghistorian.org/en/lessons/clustering-with-scikit-learn-in-python#1-loading-the-dataset--exploratory-data-analysis) tutorial on The Programming Historian.

In [None]:
# Read the tutorial's abstracts CSV straight from GitHub, keeping only the
# four columns listed in `usecols`, then preview the first rows.
ddf = pd.read_csv('https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/clustering-with-scikit-learn-in-python/data/RELIGION_abstracts.csv', 
                 usecols=['title','abstract','link','volume'])
ddf.head()

In [None]:
# Summary statistics for the columns loaded into `ddf`
ddf.describe()

In [None]:
def lemmatizeAbstracts(x):
    """Return `x` as a single string of lemmas with punctuation stripped.

    The text is passed through the module-level `nlp` pipeline, every token
    is replaced by its `lemma_`, and the lemmas are joined with single
    spaces. Characters that are neither word characters nor whitespace are
    then removed, and runs of two or more whitespace characters are
    collapsed to one space. Leading/trailing whitespace is not trimmed.
    """
    lemmas = [token.lemma_ for token in nlp(x)]
    joined = " ".join(lemmas)
    # Remove every run of non-word, non-whitespace characters (punctuation etc.)
    without_punctuation = re.sub(r"[^\w\s]+", "", joined)
    # Removing punctuation can leave gaps; squeeze them back to one space
    return re.sub(r"\s{2,}", " ", without_punctuation)

# Store the lemmatised form of every abstract in a new `abstract_lemma` column
ddf["abstract_lemma"] = ddf.abstract.apply(lemmatizeAbstracts)

In [None]:
# Vectorise the lemmatised abstracts with TF-IDF over unigrams and bigrams,
# capped at 250 features, and wrap the result in a DataFrame that shares
# `ddf`'s index and uses the vocabulary terms as column names.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=250,
    strip_accents="unicode",
    min_df=10,
    max_df=200,
)
tfidf_religion_array = tfidf.fit_transform(ddf["abstract_lemma"])
df_abstracts_tfidf = pd.DataFrame(
    tfidf_religion_array.toarray(),
    index=ddf.index,
    columns=tfidf.get_feature_names_out(),
)
df_abstracts_tfidf.describe()

### PCA

In [None]:
# using PCA to reduce the dimensionality:
# project the TF-IDF matrix onto 10 principal components (fixed seed for
# repeatability) and keep the projected coordinates in a DataFrame
pca = PCA(n_components=10, whiten=False, random_state=42)
abstracts_pca = pca.fit_transform(df_abstracts_tfidf)
df_abstracts_pca = pd.DataFrame(data=abstracts_pca)
df_abstracts_pca.head()

Here's the issue with PCA: check how much of the total variance the ten retained components actually explain.

In [None]:
# Report variance as a share of the total: `explained_variance_ratio_` holds
# each component's fraction of the total variance, whereas
# `explained_variance_` holds absolute variances and is not a percentage.
print(f"Total explained variance is {np.sum(pca.explained_variance_ratio_)*100:0.2f}% with first eigenvector explaining {pca.explained_variance_ratio_[0]*100:0.2f}%")

In [None]:
# I cannot reproduce the 4-cluster result using an eps of 0.2
# Cluster the 10-component PCA coordinates with DBSCAN (Euclidean distance)
dbscan = DBSCAN(eps=0.2, metric="euclidean")
dbscan_labels = dbscan.fit_predict(df_abstracts_pca)
# Work on a copy so that `ddf` itself does not gain the `cluster` column
df_abstracts_dbscan = ddf.copy()
df_abstracts_dbscan['cluster'] = dbscan_labels
# Number of abstracts per label (scikit-learn's DBSCAN reports noise as -1)
df_abstracts_dbscan.cluster.value_counts()

### UMAP

In [None]:
# Configure a 2-component UMAP projection with a fixed seed
reducer = umap.UMAP(
            n_neighbors=8,
            min_dist=0.02,
            n_components=2,
            random_state=42)
    
# Reduces the TF-IDF feature vector of each abstract (at most 250 features,
# given max_features=250 in the TfidfVectorizer) down to n_components dimensions
X_embedded = reducer.fit_transform(df_abstracts_tfidf)
print(f"Resulting embedding is: {X_embedded.shape[0]} rows by {X_embedded.shape[1]} columns.")

In [None]:
# Plot the UMAP embedding, colouring every point by the DBSCAN label it
# received on the PCA-decomposed data
f,axs = plt.subplots(1,1,figsize=(12,6))
f.suptitle("UMAP Output") 

# plt.subplots hands back a bare Axes for a 1x1 grid and an array otherwise;
# normalise to a flat sequence so the loop below works for either layout
if isinstance(axs, np.ndarray):
    axs = axs.reshape(-1)
else:
    axs = [axs]

# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap takes the same (name, lut) arguments and is still available.
# A translucent grey is prepended so the lowest label value (-1 when DBSCAN
# reports noise) is drawn in grey rather than in a cluster colour.
newcolors = np.insert(plt.get_cmap('tab10', 5).colors, 0, [0.6, 0.6, 0.6, 0.7], axis=0)
newcmp = ListedColormap(newcolors)
    
for i, ax in enumerate(axs):
    # Panel i shows embedding columns 2i and 2i+1 (labelled from 1)
    ax.set_xlabel(f"Dimension {i*2+1}")
    ax.set_ylabel(f"Dimension {i*2+2}")
    sctr = ax.scatter(x=X_embedded[:,i*2], y=X_embedded[:,i*2+1], s=8, c=dbscan_labels, cmap=newcmp)
    ax.legend(*sctr.legend_elements(), loc='upper left', title='PCA-Derived\nDBSCAN Clusters')
    
f.tight_layout()
#plt.savefig(os.path.join('UMAP_Output.png'), dpi=150)

In [None]:
# Settings for the word clouds below
# Candidate foreground colours (not referenced by get_cloud itself)
fg_list = ['darkgrey','darkorange','lightblue','brown','red','green']
# Font file path; only relevant if the commented-out `font_path` option in
# get_cloud is re-enabled. NOTE(review): absolute, machine-specific location.
fp = os.path.join(os.sep,'Library','Font','Khula-Light.ttf')
bg = 'white'  # word cloud background colour
wd = 50       # maximum number of words per cloud

def get_cloud(fg='black'):
    """Create a 1000x1000 RGBA WordCloud that draws every word in colour `fg`.

    The word limit and background colour are taken from the module-level
    `wd` and `bg` settings.
    """
    def _single_colour(*args, **kwargs):
        # WordCloud calls this once per word; the answer is always `fg`
        return fg

    cloud_options = dict(
        #font_path=fp,
        max_words=wd,
        width=1000,
        height=1000,
        mode='RGBA',
        background_color=bg,
        color_func=_single_colour,
        stopwords=['religion','religious','article','study','paper'],
    )
    return WordCloud(**cloud_options)

In [None]:
# Draw one TF-IDF word cloud per DBSCAN cluster (noise, labelled -1, is skipped).
# From https://stackoverflow.com/a/45096142
stopwords=['religion','religious','article','study','paper','new','use','research','analysis']
stpw = text.ENGLISH_STOP_WORDS.union(stopwords)

# `.union()` returns a frozenset, but recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list or None, so pass a list.
vectorizer = TfidfVectorizer(stop_words=sorted(stpw))

num_clouds = df_abstracts_dbscan.cluster.max()+1 # Counts from zero

# Two clouds per row; always ask for at least one row so the figure can still
# be created when DBSCAN found no clusters at all
f,axs = plt.subplots(max(1, math.ceil(num_clouds/2)),2,figsize=(12,12))
cloud_axes = axs.reshape(-1)

for i in range(num_clouds):
    cldf = df_abstracts_dbscan[df_abstracts_dbscan.cluster==i]
    # Refit on this cluster's abstracts only, so weights are relative to the cluster
    vecs = vectorizer.fit_transform(cldf.abstract_lemma)
    feature_names = vectorizer.get_feature_names_out()
    # Total TF-IDF weight of each term across the cluster's abstracts
    term_weights = pd.DataFrame(vecs.toarray(), columns=feature_names).sum(axis=0)
    wordcloud = get_cloud().generate_from_frequencies(term_weights)
    ax = cloud_axes[i]
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(f'Cluster {i}')

# Hide any grid cells left without a cloud (e.g. an odd number of clusters)
for ax in cloud_axes[num_clouds:]:
    ax.axis('off')

f.suptitle("Cluster TF/IDF")
f.tight_layout(pad=1.35)

In [None]:
pd.set_option('display.max_colwidth',150)
# Print up to five example titles (truncated to 75 characters) for every
# DBSCAN cluster actually present; the noise label (-1) is skipped.
for i in sorted(c for c in df_abstracts_dbscan.cluster.unique() if c >= 0):
    print(f"Cluster {i}")
    titles = df_abstracts_dbscan[df_abstracts_dbscan.cluster==i].title
    # Sampling more rows than a cluster holds raises, so cap the sample size
    egs = titles.sample(min(5, len(titles)), random_state=42)
    for e in egs:
        print(f"\tExample: {e[:75]}")
    print()
    