In [1]:
import langid
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import OPTICS, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Relative path of the JSON file loaded with pd.read_json in the next cell.
# The notebook reads its 'wordbag' and 'netloc' columns.
DOCUMENTS_JSON = 'data/WordbagsBuckets.json'

In [2]:
# Load the documents and discard rows whose word bag is an empty string.
df = pd.read_json(DOCUMENTS_JSON)
df = df[df['wordbag'] != ''].copy()


def _is_not_english(wordbag, n_words=30):
    """Return True when langid labels the first `n_words` words as anything other than 'en'."""
    sample = ' '.join(wordbag.split()[:n_words])
    return langid.classify(sample)[0] != 'en'


# 'pl' is True for every document NOT classified as English (so it can also
# contain other non-English languages, not only Polish).
# Mapping over the column avoids building a full row Series per document via
# df.iloc[i]; astype(bool) keeps the mask boolean even when the frame is empty.
df['pl'] = df['wordbag'].map(_is_not_english).astype(bool)

# Word bags per language group, plus the site names aligned with corpus_pl.
corpus_pl = df.loc[df['pl'], 'wordbag'].tolist()
corpus_en = df.loc[~df['pl'], 'wordbag'].tolist()
sites_pl = df.loc[df['pl'], 'netloc'].values

In [3]:
# Number of clusters requested from KMeans further down.
n_clusters = 4

# Seed NumPy's global RNG; the estimators in this notebook are created
# without an explicit random_state.
np.random.seed(2137)

# Tf-idf representation of the non-English word bags.
vectorizer_pl = TfidfVectorizer(
    sublinear_tf=True,
    use_idf=True,
    min_df=2,
    max_df=0.5,
)
X_pl = vectorizer_pl.fit_transform(corpus_pl)

(27, 10996)


In [None]:
# Optional LSA step: truncated SVD followed by row normalisation.
svd = TruncatedSVD(100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
# Build the input from the fitted vectorizer instead of feeding X_pl back into
# itself: the cell then gives the same result on the first run and no longer
# fails on a re-run, when X_pl would already be the reduced matrix with fewer
# columns than the 100 requested components.
X_pl = lsa.fit_transform(vectorizer_pl.transform(corpus_pl))

In [4]:
# K-means on the document vectors, using n_clusters from the configuration cell.
km_pl = KMeans(
    n_clusters=n_clusters,
    init="k-means++",
    max_iter=100,
    n_init=10,
)
print('Clustering')
# perf_counter() is monotonic, so the measured interval cannot be distorted by
# system clock adjustments the way a time.time() difference can.
t0 = time.perf_counter()
km_pl.fit(X_pl)
print(f'done in {time.perf_counter() - t0}' )

Clustering
done in 0.08110952377319336


In [5]:
print(km_pl.labels_)
terms = vectorizer_pl.get_feature_names_out()

# Centroid columns must line up with `terms` before they are used as term
# indices. If KMeans was fitted on the LSA-reduced matrix, the centroids have
# one column per SVD component rather than per term, so project them back to
# term space first. When KMeans ran directly on the tf-idf matrix the widths
# already match and `svd` is never touched.
original_space_centroids = km_pl.cluster_centers_
if original_space_centroids.shape[1] != len(terms):
    original_space_centroids = svd.inverse_transform(original_space_centroids)

# Per cluster: term indices sorted by descending centroid weight.
order_centroids = original_space_centroids.argsort()[:, ::-1]

[2 2 2 1 1 2 1 2 2 1 2 0 3 0 1 1 2 1 2 1 3 0 2 1 2 1 2]


In [6]:
# For each cluster: list the member sites, then its ten highest-weighted terms.
for cluster_id in range(n_clusters):
    print(f'Cluster {cluster_id} ', end='')
    members = [
        site
        for position, site in enumerate(sites_pl)
        if km_pl.labels_[position] == cluster_id
    ]
    print(members, end=':\n ')
    top_terms = ''.join(f' {terms[term_idx]} ' for term_idx in order_centroids[cluster_id, :10])
    print(top_terms)

Cluster 0 ['drive.google.com', 'accounts.google.com', 'app.erasmusplusols.eu']:
  latviešu  hrvatski  magyar  slovenčina  slovenščina  български  ελληνικά  eesti  dansk  lietuvių 
Cluster 1 ['www.usosweb.uj.edu.pl', 'login.uj.edu.pl', 'pl.wikipedia.org', 'translate.google.pl', 'sjp.pl', 'im.uj.edu.pl', 'synonim.net', 'sjp.pwn.pl', 'api.stat.gov.pl', 'stat.gov.pl']:
  należeć  jagielloński  przedmiot  urząd  grupa  słowo  prosić  informatyka  studia  praca 
Cluster 2 ['www.youtube.com', 'www.linkedin.com', 'www.instagram.com', 'nofluffjobs.com', 'www.facebook.com', 'coub.com', 'docs.google.com', 'soundcloud.com', 'www.cloudskillsboost.google', 'pandas.pydata.org', 'bdl.stat.gov.pl', 'www.sokmarket.com.tr']:
  urząd  zablokować  wzgląd  cooki  adr  için  size  cookie  daha  plików 
Cluster 3 ['outlook.office.com', 'outlook.office365.com']:
  prywatność  utworzyć  cookie  uzyskać  ponownie  spróbować  warunki  logować  base  odświeżyć 


In [None]:
# OPTICS clustering configured with min_samples=5, xi=0.05, min_cluster_size=0.1.
clust3_pl = OPTICS(min_samples=5, xi=0.05, min_cluster_size=0.1)
# NOTE(review): `feature_matrix3_pl` is not defined in any cell visible in this
# notebook, so this line raises NameError on a fresh kernel. Presumably it is a
# sparse document-feature matrix (it is densified with .toarray()) - TODO confirm
# where it should come from and define it before this cell.
X3_pl = feature_matrix3_pl.toarray()
clust3_pl.fit(X3_pl)

In [None]:
# Reachability distances and labels rearranged into the OPTICS cluster ordering.
space = np.arange(X3_pl.shape[0])
reachability = clust3_pl.reachability_[clust3_pl.ordering_]
labels = clust3_pl.labels_[clust3_pl.ordering_]

# The two feature columns shown in the scatter panel. Clusters AND noise use
# the same pair; previously noise was drawn from columns 0 and 1 while the
# clusters used 9 and 63, mixing two coordinate systems on one axes.
feat_x, feat_y = 9, 63

# Marker styles are reused cyclically so clusters beyond the fifth are still
# drawn instead of being silently dropped by zip().
colors = ["g.", "r.", "b.", "y.", "m."]
cluster_ids = range(0, max(clust3_pl.labels_) + 1)

plt.figure(figsize=(18, 18))
G = gridspec.GridSpec(3, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1:3, :])

# Reachability plot
for klass in cluster_ids:
    color = colors[klass % len(colors)]
    in_cluster = labels == klass
    ax1.plot(space[in_cluster], reachability[in_cluster], color, alpha=1)
ax1.plot(space[labels == -1], reachability[labels == -1], "k.", alpha=1)
ax1.set_ylabel("Reachability (epsilon distance)")
ax1.set_title("Reachability Plot")

# OPTICS
for klass in cluster_ids:
    color = colors[klass % len(colors)]
    Xk = X3_pl[clust3_pl.labels_ == klass]
    ax2.plot(Xk[:, feat_x], Xk[:, feat_y], color, alpha=1)
# Label -1 marks samples OPTICS left unclustered (noise).
noise = clust3_pl.labels_ == -1
ax2.plot(X3_pl[noise, feat_x], X3_pl[noise, feat_y], "k+", alpha=1)
ax2.set_xlabel(f"feature {feat_x}")
ax2.set_ylabel(f"feature {feat_y}")
ax2.set_title("Automatic Clustering\nOPTICS")