Cluster literature using TF-IDF and K-Means, with dimensionality reduction via PCA and t-SNE.

From:
[arxiv-literature-clustering](https://www.kaggle.com/maksimeren/arxiv-literature-clustering)


In [None]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import src.constants as const
import os
# Default figure size for matplotlib plots in this notebook (width, height in inches).
plt.rcParams["figure.figsize"] = (10,10)
# Apply seaborn's default theme to matplotlib.
# NOTE(review): depending on the seaborn version, this call may touch rc
# settings made above — keep the statement order as is.
sb.set()

# Load the journals DataFrame from the pickle file named by const.JOURNALS_DF.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(const.JOURNALS_DF)

In [None]:
# Boolean mask marking rows whose tokenized description is missing;
# those rows are discarded so that only rows with text reach the vectorizer.
todrop = df["dc:description:tokenized"].isnull()
df = df.loc[~todrop]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def vectorize(text, maxx_features):
    """Fit a TF-IDF vectorizer on ``text`` and return the transformed matrix.

    Args:
        text: Iterable of documents passed to ``TfidfVectorizer.fit_transform``.
        maxx_features: Upper bound on the vocabulary size, forwarded as
            ``max_features``.

    Returns:
        The matrix produced by ``fit_transform``. The fitted vectorizer
        itself is not returned.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)

In [None]:
# Handle on the tokenized-description column.
# NOTE(review): `a` is not referenced by any cell visible here — presumably
# left over from interactive inspection; confirm before removing.
a = df["dc:description:tokenized"]

In [None]:
# Build the TF-IDF matrix for all remaining documents, capping the vocabulary
# at 5000 terms; %time reports how long the fit takes.
%time X = vectorize(df["dc:description:tokenized"], 5000)


In [None]:
from sklearn.cluster import KMeans
k = 20
kmeans = KMeans(n_clusters=k, random_state=42, n_jobs=-1)
%time y_pred = kmeans.fit_predict(X)

In [None]:
from sklearn.decomposition import PCA

# Reduce the TF-IDF features to 20 principal components (seeded for repeatability).
pca = PCA(n_components=20, random_state=42)
# X is converted to a dense array before fitting; the conversion is included
# in the timed statement.
%time X_embedded_pca= pca.fit_transform(X.toarray())
# Last expression of the cell: displays the shape of the reduced matrix.
X_embedded_pca.shape


In [None]:
from sklearn.manifold import TSNE

# t-SNE with perplexity 100, silent output and a fixed seed.
tsne = TSNE(verbose=0, perplexity=100, random_state=42)
# Embed the 20-component PCA output; columns 0 and 1 of the result are what
# the scatter plot below uses.
%time X_embedded_tsne = tsne.fit_transform(X_embedded_pca)

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
# sns settings
sns.set(rc={'figure.figsize':(20,20)})

# plot
sns.scatterplot(X_embedded_tsne[:,0], X_embedded_tsne[:,1], hue=list(y_pred), legend='full', palette="bright")
plt.title("Social Work Literature Clustering")
plt.tight_layout()
plt.savefig(os.path.join(const.ARTIFACTS_DIR, "lit-clustering.pdf"), dpi=300)
plt.show()


In [None]:
import pickle

# Persist the t-SNE coordinates and the cluster labels as separate pickle
# files inside const.BOKEH_DIR.
for filename, payload in (
    ("X-embedding-tfidf.pkl", X_embedded_tsne),
    ("y-pred-tfidf.pkl", y_pred),
):
    with open(os.path.join(const.BOKEH_DIR, filename), "wb") as f:
        pickle.dump(payload, f)



In [None]:
# KMEANS HPO
# calculate distortion for various k values to determine the optimal number of clusters

# ks = []
# values = []
#
# for k in range(2, 300):
#     print(f"Clusters: {k}")
#     kmeans = KMeans(n_clusters=k, random_state=42, n_jobs=-1)
#     %time y_pred = kmeans.fit_predict(X)
#     v = kmeans.inertia_
#     ks.append(k)
#     values.append(v)
#     print(f"Distortion: {v}")

# plt.plot(ks, values)