Convert dataframe to input for doc2vec model for clustering

In [None]:
import pandas as pd
import src.constants as const
from os.path import join

# Relative path (two levels up) to the paragraph-vectors data folder that receives the exported corpus.
directory = join("..", "..", "paragraph-vectors", "data")

In [None]:
# Load the journals frame and flag the rows that actually carry a description.
df = pd.read_pickle(const.JOURNALS_DF)
has_description = df["dc:description"].notna()

# One lower-cased description per row, written without index column or header line.
df2 = pd.DataFrame(
    {"text": df.loc[has_description, "dc:description"].str.lower().values}
)
df2.to_csv(join(directory, "scopus-2.csv"), index=False, header=False)

Cluster the document vectors, then reduce their dimensionality with PCA and t-SNE

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from os.path import join
# Same data folder the corpus was exported to.
directory = join("..", "..", "paragraph-vectors", "data")

# Name of the vector CSV to load; kept in its own variable so the read call stays readable.
doc_vectors_file = "scopus-2_model.dbow_numnoisewords.2_vecdim.2000_batchsize.128_lr.0.001000_epoch.24_loss.1.087058.csv"
df = pd.read_csv(join(directory, doc_vectors_file))

In [None]:
from sklearn.cluster import KMeans
# Cluster every row of df into k groups; the fixed seed keeps the labelling repeatable.
k = 20
kmeans = KMeans(n_clusters=k, random_state=42).fit(df)
# labels_ holds the cluster index assigned to each fitted row.
y_pred = kmeans.labels_


In [None]:
from sklearn.decomposition import PCA

# Project the document vectors onto 1000 principal components.
pca = PCA(n_components=1000, random_state=42)
X_embedded_pca = pca.fit_transform(df)

# Report the per-component explained variance and its total.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print(sum(variance_ratios))

In [None]:
# t-SNE embedding used by the scatter plot below. Only the first 10,000 rows of df are
# embedded, and the raw vectors are used rather than the PCA output X_embedded_pca.
tsne = TSNE(random_state=42, perplexity=100) #
X_embedded_tsne = tsne.fit_transform(df[:10000]) # X_embedded_pca

In [None]:
import seaborn as sns
import os

# sns settings
# sns settings
sns.set(rc={'figure.figsize':(20,20)})

# plot
# hue=(list(y_pred))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
# (older releases accept the keyword form as well).
sns.scatterplot(x=X_embedded_tsne[:,0], y=X_embedded_tsne[:,1], legend='full', palette="bright")
plt.title("Social Work Literature Clustering")
plt.tight_layout()
# plt.savefig(os.path.join(const.ARTIFACTS_DIR, "lit-clustering.pdf"), dpi=300)
plt.show()

## Save for bokeh

In [None]:
import pickle
import src.constants as const

# Persist the t-SNE coordinates for the bokeh visualisation.
path = join(const.BOKEH_DIR, "X-embedding-doc2vec.pkl")
with open(path, "wb") as f:
    pickle.dump(X_embedded_tsne, f)

# t-SNE was fitted on a leading slice of the data (df[:10000]) while KMeans labelled
# every row of df. Trim the labels to the embedded rows so that both pickles line up
# one-to-one; this is a no-op when the two already have the same length.
path = join(const.BOKEH_DIR, "y-pred-doc2vec.pkl")
with open(path, "wb") as f:
    pickle.dump(y_pred[:len(X_embedded_tsne)], f)




# GenSim

doc2vec with gensim

https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html#sphx-glr-auto-examples-tutorials-run-doc2vec-lee-py

In [None]:
import gensim
from os.path import join

def read_corpus(fname, tokens_only=False):
    """Stream a text file as one tokenised document per line.

    Args:
        fname: Path of a UTF-8 encoded text file; each physical line is
            treated as one document.
        tokens_only: When True, yield the bare token list for each line.
            When False (default), yield a ``TaggedDocument`` whose single
            tag is the zero-based line number.

    Yields:
        A list of tokens, or a ``gensim.models.doc2vec.TaggedDocument``,
        for every line in the file.
    """
    # Read with an explicit encoding: the corpus file is produced by pandas'
    # to_csv (UTF-8 by default), so decoding must not depend on the platform locale.
    with open(fname, encoding="utf-8") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])



In [None]:
# Location of the exported corpus file (one document per line).
directory = join("..", "..", "paragraph-vectors", "data")
path = join(directory, "scopus-2.csv")

# Both corpora are read from the same file: train_corpus holds tagged documents,
# test_corpus holds the plain token lists of the very same lines.
train_corpus = list(read_corpus(path))
test_corpus = list(read_corpus(path, tokens_only=True))

In [None]:
# Sanity check: show the first two tagged documents.
print(train_corpus[:2])

In [None]:
import logging
# Use only built-in LogRecord attributes. Custom fields such as %(clientip)s or %(user)s
# would require an `extra` dict on every logging call, which the library's own log
# calls do not supply, so each message would fail to format.
FORMAT = '%(asctime)-15s %(levelname)-8s %(name)s %(message)s'
logging.basicConfig(format=FORMAT, level=logging.DEBUG)


In [None]:
# Doc2Vec hyper-parameters, one per line so individual settings are easy to tune.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=1000,
    min_count=2,
    epochs=100,
    workers=16,
    dbow_words=1,
    alpha=0.1,
    min_alpha=5e-7,
    seed=0,
    max_vocab_size=5000,
)

In [None]:
# Build the model's vocabulary from the tagged training documents before training.
model.build_vocab(train_corpus)

In [None]:
# Train on the tagged corpus, reusing the document count and epoch setting stored on the
# model; the %time magic reports how long the call takes.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs, queue_factor=10)

In [None]:
# Infer one vector per tokenised document; test_corpus was read from the same file
# as the training corpus.
vectors = [model.infer_vector(doc) for doc in test_corpus]

In [None]:
import numpy as np

# Convert the list of per-document vectors into a single NumPy array for the
# t-SNE and KMeans steps below.
vectors = np.array(vectors)


In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
# Optional PCA pre-reduction, currently disabled:
# pca = PCA(n_components=100, random_state=42)
# X_embedded_pca = pca.fit_transform(vectors)
#
# print(sum(pca.explained_variance_ratio_))

# t-SNE is fitted directly on all inferred vectors (not on a PCA projection).
tsne = TSNE(random_state=42, perplexity=100) #
X_embedded_tsne = tsne.fit_transform(vectors) # X_embedded_pca

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# sns settings
# sns settings
sns.set(rc={'figure.figsize':(20,20)})

# plot
# hue=(list(y_pred))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
# (older releases accept the keyword form as well).
sns.scatterplot(x=X_embedded_tsne[:,0], y=X_embedded_tsne[:,1], legend='full', palette="bright")
plt.title("Social Work Literature Clustering")
plt.tight_layout()
# plt.savefig(os.path.join(const.ARTIFACTS_DIR, "lit-clustering.pdf"), dpi=300)
plt.show()

In [None]:
############################################################################################

In [None]:
from sklearn.cluster import KMeans
k = 20
kmeans = KMeans(n_clusters=k, random_state=42, n_jobs=-1)
%time y_pred = kmeans.fit_predict(vectors)


In [None]:
# plot
# Colour each embedded document by its KMeans cluster label.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
# (older releases accept the keyword form as well).
sns.scatterplot(x=X_embedded_tsne[:,0], y=X_embedded_tsne[:,1], hue=list(y_pred), legend='full', palette="bright")
plt.title("Social Work Literature Clustering")
plt.tight_layout()
# plt.savefig(os.path.join(const.ARTIFACTS_DIR, "lit-clustering.pdf"), dpi=300)
plt.show()


In [None]:
import pickle
import os
import src.constants as const

# Pickle the t-SNE coordinates and the cluster labels into the bokeh directory,
# one file per object.
for filename, payload in (
    ("X-embedding-gensim-doc2vec.pkl", X_embedded_tsne),
    ("y-pred-gensim-doc2vec.pkl", y_pred),
):
    with open(os.path.join(const.BOKEH_DIR, filename), "wb") as f:
        pickle.dump(payload, f)
