In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell runs (presumably so edits to the local `clustering` module are picked up).
%load_ext autoreload
%autoreload 2

In [2]:
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow_text
import tensorflow_hub as hub
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import gensim
from sentence_transformers import SentenceTransformer
import cupy as cp

import clustering

### Reading tweet files

In [3]:
# Relative folder that holds the two pre-cleaned tweet CSVs.
folder = 'files/'
# Same tweets in two variants: stopwords kept vs. stopwords removed.
df_with_sw = pd.read_csv(f'{folder}clean_tweets_with_stopwords.csv')
df_without_sw = pd.read_csv(f'{folder}clean_tweets_without_stopwords.csv')

In [4]:
# Pull the 'tweet' text column out of each frame; the encoders below consume these.
tweets_with_sw, tweets_without_sw = df_with_sw['tweet'], df_without_sw['tweet']

### Getting encodings using Bag of Words

In [5]:
# Bag-of-Words term counts over the stopword-free tweets.
bow_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
# Fit, densify the sparse count matrix, and wrap it as a CuPy array.
embeddings_bow = cp.asarray(
    bow_vectorizer.fit_transform(tweets_without_sw).toarray()
)

### Getting encodings using TF-IDF

In [6]:
# TF-IDF weights over the stopword-free tweets (same vocabulary limits as Bag of Words).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=2,
    max_df=0.90,
)
# Fit, densify the sparse TF-IDF matrix, and wrap it as a CuPy array.
embeddings_tfidf = cp.asarray(
    tfidf_vectorizer.fit_transform(tweets_without_sw).toarray()
)

### Getting encodings using Word2Vec

In [7]:
# `clustering.word2vec` is a project-local helper whose body is not shown here;
# presumably it returns one vector per tweet — TODO confirm. Result is wrapped as a CuPy array.
embeddings_wordvec = cp.asarray(clustering.word2vec(tweets_without_sw))

### Getting encodings using Universal Sentence Encoder model

In [None]:
# Load Universal Sentence Encoder v4 from TF Hub (the log below shows it is cached
# under /tmp/tfhub_modules).
model_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# Encode the stopword-including tweets in a single call and wrap the result as a CuPy array.
# NOTE(review): the whole column is passed at once — confirm this fits in memory.
embeddings_use = cp.asarray(model_use(tweets_with_sw))

INFO:absl:Using /tmp/tfhub_modules to cache modules.


### Getting encodings using different Transformer Encoder models

In [8]:
# Three pretrained SentenceTransformer checkpoints, each used to encode the tweets
# in the next cell.
model_mpnet = SentenceTransformer('all-mpnet-base-v2')
model_minilm = SentenceTransformer('all-MiniLM-L6-v2')
model_distil = SentenceTransformer('all-distilroberta-v1')

In [9]:
# Encode the stopword-including tweets with each model and wrap the output as a CuPy
# array. Kept as three separate statements so a failure in one model does not discard
# the embeddings already computed by the previous ones.
embeddings_mpnet = cp.asarray(
    model_mpnet.encode(tweets_with_sw, show_progress_bar=True)
)
embeddings_minilm = cp.asarray(
    model_minilm.encode(tweets_with_sw, show_progress_bar=True)
)
embeddings_distil = cp.asarray(
    model_distil.encode(tweets_with_sw, show_progress_bar=True)
)

Batches:   0%|          | 0/382 [00:00<?, ?it/s]

Batches:   0%|          | 0/382 [00:00<?, ?it/s]

Batches:   0%|          | 0/382 [00:00<?, ?it/s]

### UMAP + HDBSCAN with different encoding techniques

#### Bag of Words:

In [None]:
# Cluster the Bag-of-Words encodings with the project-local helper, which is given
# UMAP-style (n_neighbors, n_components, random_state) and HDBSCAN-style
# (min_samples, min_cluster_size) settings.
# The embedding matrix `embeddings_bow` is passed rather than the fitted
# `bow_vectorizer` object, which holds no per-tweet data.
# NOTE(review): assumes `generate_clusters` takes the data to cluster as its first
# argument — confirm against clustering.py.
bow_clusters, score = clustering.generate_clusters(
    embeddings_bow,
    n_neighbors=30,
    n_components=2,
    random_state=42,
    min_samples=50,
    min_cluster_size=120,
)

In [None]:
# Project the input matrix to 2 dimensions with UMAP, using a fixed seed.
# NOTE(review): `embeddings` is not assigned in any cell visible in this notebook, and
# `umap` does not appear in the import cell — confirm which matrix and import are intended.
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Quick look at the raw 2-D projection before clustering.
# No `cmap` here: without a `c=` array there is nothing to colour-map, so matplotlib
# would ignore it and emit a warning.
# NOTE(review): `plt` is not in the notebook's import cell — confirm the import.
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=0.1)
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('UMAP projection');

In [None]:
# Density-based clustering of the 2-D projection; `labels` is the fit_predict output.
# NOTE(review): `hdbscan` is not in the notebook's import cell — confirm the import.
labels = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=500).fit_predict(
    clusterable_embedding
)

In [None]:
# Rows with a non-negative label are treated as cluster members.
clustered = labels >= 0
# Everything else is drawn first as a faint grey background ...
plt.scatter(
    clusterable_embedding[~clustered, 0], clusterable_embedding[~clustered, 1],
    s=0.1, alpha=0.5, color=(0.5, 0.5, 0.5),
)
# ... then the clustered points on top, coloured by their label.
plt.scatter(
    clusterable_embedding[clustered, 0], clusterable_embedding[clustered, 1],
    s=0.1, cmap='Spectral', c=labels[clustered],
);

#### Bag of Words + Clustering

In [None]:
# Reduce the Bag-of-Words matrix to 2 dimensions with UMAP (fixed seed) before clustering.
# Uses `embeddings_bow`, the matrix built in the Bag-of-Words cell; the bare name `bow`
# is not assigned in any visible cell. The CuPy array is copied to host memory first
# because CuPy does not allow implicit conversion to a NumPy array.
# NOTE(review): `umap` is not in the notebook's import cell — confirm the import.
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(cp.asnumpy(embeddings_bow))

In [None]:
# Quick look at the raw 2-D projection before clustering.
# No `cmap` here: without a `c=` array there is nothing to colour-map, so matplotlib
# would ignore it and emit a warning.
# NOTE(review): `plt` is not in the notebook's import cell — confirm the import.
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=0.1)
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('UMAP projection (Bag of Words)');

In [None]:
# Density-based clustering of the 2-D projection; `labels` is the fit_predict output.
# NOTE(review): `hdbscan` is not in the notebook's import cell — confirm the import.
labels = hdbscan.HDBSCAN(min_samples=50, min_cluster_size=120).fit_predict(
    clusterable_embedding
)

In [None]:
# Rows with a non-negative label are treated as cluster members.
clustered = labels >= 0
# Everything else is drawn first as a faint grey background ...
plt.scatter(
    clusterable_embedding[~clustered, 0], clusterable_embedding[~clustered, 1],
    s=0.1, alpha=0.5, color=(0.5, 0.5, 0.5),
)
# ... then the clustered points on top, coloured by their label.
plt.scatter(
    clusterable_embedding[clustered, 0], clusterable_embedding[clustered, 1],
    s=0.1, cmap='Spectral', c=labels[clustered],
);

In [None]:
# Store the current HDBSCAN labels as a new 'Cluster_BoW' column.
# NOTE(review): `df` is not assigned in any visible cell (only `df_with_sw` and
# `df_without_sw` are) — confirm which frame is intended.
df['Cluster_BoW'] = labels

#### TF-IDF + Clustering

In [None]:
# Reduce the TF-IDF matrix to 2 dimensions with UMAP (fixed seed) before clustering.
# Uses `embeddings_tfidf`, the matrix built in the TF-IDF cell; the bare name `tfidf`
# is not assigned in any visible cell. The CuPy array is copied to host memory first
# because CuPy does not allow implicit conversion to a NumPy array.
# NOTE(review): `umap` is not in the notebook's import cell — confirm the import.
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(cp.asnumpy(embeddings_tfidf))

In [None]:
# Quick look at the raw 2-D projection before clustering.
# No `cmap` here: without a `c=` array there is nothing to colour-map, so matplotlib
# would ignore it and emit a warning.
# NOTE(review): `plt` is not in the notebook's import cell — confirm the import.
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=0.1)
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('UMAP projection (TF-IDF)');

In [None]:
# Density-based clustering of the 2-D projection; `labels` is the fit_predict output.
# NOTE(review): `hdbscan` is not in the notebook's import cell — confirm the import.
labels = hdbscan.HDBSCAN(min_samples=50, min_cluster_size=200).fit_predict(
    clusterable_embedding
)

In [None]:
# Rows with a non-negative label are treated as cluster members.
clustered = labels >= 0
# Everything else is drawn first as a faint grey background ...
plt.scatter(
    clusterable_embedding[~clustered, 0], clusterable_embedding[~clustered, 1],
    s=0.1, alpha=0.5, color=(0.5, 0.5, 0.5),
)
# ... then the clustered points on top, coloured by their label.
plt.scatter(
    clusterable_embedding[clustered, 0], clusterable_embedding[clustered, 1],
    s=0.1, cmap='Spectral', c=labels[clustered],
);

#### Word2Vec + Clustering

In [None]:
# Reduce the Word2Vec matrix to 2 dimensions with UMAP (fixed seed) before clustering.
# Uses `embeddings_wordvec`, the matrix built in the Word2Vec cell; the bare name
# `wordvec` is not assigned in any visible cell. The CuPy array is copied to host
# memory first because CuPy does not allow implicit conversion to a NumPy array.
# This run uses n_neighbors=5, whereas the other UMAP cells use 30.
# NOTE(review): `umap` is not in the notebook's import cell — confirm the import.
clusterable_embedding = umap.UMAP(
    n_neighbors=5,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(cp.asnumpy(embeddings_wordvec))

In [None]:
# Quick look at the raw 2-D projection before clustering.
# No `cmap` here: without a `c=` array there is nothing to colour-map, so matplotlib
# would ignore it and emit a warning.
# NOTE(review): `plt` is not in the notebook's import cell — confirm the import.
plt.scatter(clusterable_embedding[:, 0], clusterable_embedding[:, 1], s=0.1)
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.title('UMAP projection (Word2Vec)');

In [None]:
# Density-based clustering of the 2-D projection; `labels` is the fit_predict output.
# NOTE(review): min_samples (70) exceeds min_cluster_size (30) here, unlike the other
# runs — confirm this is intended. `hdbscan` is also missing from the import cell.
labels = hdbscan.HDBSCAN(min_samples=70, min_cluster_size=30).fit_predict(
    clusterable_embedding
)

In [None]:
# Rows with a non-negative label are treated as cluster members.
clustered = labels >= 0
# Everything else is drawn first as a faint grey background ...
plt.scatter(
    clusterable_embedding[~clustered, 0], clusterable_embedding[~clustered, 1],
    s=0.1, alpha=0.5, color=(0.5, 0.5, 0.5),
)
# ... then the clustered points on top, coloured by their label.
plt.scatter(
    clusterable_embedding[clustered, 0], clusterable_embedding[clustered, 1],
    s=0.1, cmap='Spectral', c=labels[clustered],
);

In [None]:
# Re-runs the UMAP projection on `embeddings`, overwriting `clusterable_embedding`.
# `labels` is not recomputed after this cell in the visible notebook, so the column
# assigned in the next cell still comes from the preceding HDBSCAN run.
# NOTE(review): `embeddings` is not assigned in any visible cell and `umap` is not in
# the import cell — confirm whether this cell is still needed.
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(embeddings)

In [None]:
# Store the current HDBSCAN labels as a new 'Cluster_word2vec' column.
# NOTE(review): `df` is not assigned in any visible cell (only `df_with_sw` and
# `df_without_sw` are) — confirm which frame is intended.
df['Cluster_word2vec'] = labels

In [None]:
# Preview the first rows, including the cluster-label columns added above.
# NOTE(review): `df` is not assigned in any visible cell — confirm which frame is intended.
df.head()

In [None]:
def freq(tweets, top_n=10):
    """Return the most frequent whitespace-separated tokens across ``tweets``.

    Args:
        tweets: Iterable of tweet strings. Entries that are not ``str`` (for
            example the ``NaN`` pandas yields for an empty CSV cell) are skipped
            instead of raising on ``.split()``.
        top_n: Number of ``(token, count)`` pairs to return. Defaults to 10;
            ``None`` returns every token.

    Returns:
        A list of ``(token, count)`` tuples ordered by descending count. Tokens
        with equal counts keep the order in which they were first seen.
    """
    vocabulary = Counter()
    for tweet in tweets:
        # Guard against missing values so one empty tweet cannot abort the summary.
        if isinstance(tweet, str):
            vocabulary.update(tweet.split())
    return vocabulary.most_common(top_n)

In [None]:
# For every distinct Bag-of-Words cluster label, print its most frequent tokens.
# NOTE(review): `df` is not assigned in any visible cell — confirm which frame is intended.
for cluster in np.unique(df['Cluster_BoW']):
    print('Cluster: ' + str(cluster))
    cluster_text = df.loc[df['Cluster_BoW'] == cluster, 'tweet']
    dic = freq(cluster_text)
    print(dic)

In [None]:
# Number of tweets assigned to each Bag-of-Words cluster label.
# NOTE(review): `df` is not assigned in any visible cell — confirm which frame is intended.
df['Cluster_BoW'].value_counts()

In [None]:
# For every distinct Word2Vec cluster label, print its most frequent tokens.
# NOTE(review): `df` is not assigned in any visible cell — confirm which frame is intended.
for cluster in np.unique(df['Cluster_word2vec']):
    print('Cluster: ' + str(cluster))
    cluster_text = df.loc[df['Cluster_word2vec'] == cluster, 'tweet']
    dic = freq(cluster_text)
    print(dic)

In [None]:
# Persist the frame together with its cluster-label columns.
# NOTE(review): this is an absolute path (looks like a Colab Google Drive mount),
# unlike the relative 'files/' folder used for the inputs; `df` is also not assigned
# in any visible cell — confirm both before running.
df.to_csv('/content/drive/MyDrive/ML701-Project/clusterized_ver_0.csv')