## **Example of text clustering**

Mount your google drive to colab

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Installing some packages

In [3]:
!pip3 install tensorflow_text>=2.0.0rc0

In [4]:
!pip3 install emoji demoji bs4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 4.5 MB/s 
[?25hCollecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 664 kB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234927 sha256=731172d73d2be78577b20e71f9772525e970c6e171274551d404c033b84c38e9
  Stored in directory: /root/.cache/pip/wheels/f3/e3/f2/1de1c2e3ed742e1df73e0f15d58864e50c7e64f607b548d6cf
Successfully built emoji
Installing collected packages: emoji, demoji
Successfully installed demoji-1.1.0 emoji-2.2.0


In [5]:
!pip3 install xlsxwriter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xlsxwriter
  Downloading XlsxWriter-3.0.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 4.8 MB/s 
[?25hInstalling collected packages: xlsxwriter
Successfully installed xlsxwriter-3.0.3


In [7]:
import pandas as pd #tabular data manipulation
import numpy as np #numerical
import re #for regular expressions
import os #operational system (just in case)
import tensorflow as tf 
import tensorflow_hub as hub
import tensorflow_text

In [6]:
from bs4 import BeautifulSoup #to remove web characters
import emoji #to clean emojis
import demoji #to clean emojis
from tqdm.notebook import tqdm #progress bar
import pickle #to save serialized data
from sklearn.decomposition import PCA #PCA
from sklearn.cluster import KMeans, DBSCAN #clustering methods
demoji.download_codes() #downloading emojis

  


Let's read a file and take first 15k lines of it

In [8]:
os.getcwd()

'/content'

In [9]:
df_orig = pd.read_csv("/content/gdrive/MyDrive/Data_examples/Corona_NLP_train.csv",encoding='latin1')

In [12]:
df_orig['OriginalTweet'][0]

'@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/iFz9FAn2Pa and https://t.co/xX6ghGFzCC and https://t.co/I2NlzdxNo8'

In [11]:
df = df_orig[:15000].copy()

Here we define some preprocessing functions

In [13]:
def remove_weblinks(text):
    text = re.sub(r"((https?:\/\/)|(www\.))(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)", " ", text)
    text = re.sub(r"/forum/\S+"," ", text)
    return text.strip()

def remove_mentions(text):
    text = re.sub("(\S+)?@\S+", ' ', text)
    text = re.sub(r"^.+@", " ", text)
    return text.strip()

def replace_empty(text):
    if text == '' or str(text) == 'nan':
        return "nan"
    if re.match("\s+$", text):
        return "nan"
    return text

def remove_hashtags(text):
    text = re.sub('(#\w+)', '', text)
    return text.strip()

# Applying Beautiful Soup (bs4) package to remove web characters (html) + remove emojis
def get_emoji_regexp():
    # Sort emoji by length to make sure multi-character emojis are
    # matched first
    emojis = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    pattern = u'(' + u'|'.join(re.escape(u) for u in emojis) + u')'
    return re.compile(pattern)

def remove_emoji(text):
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
    cln_text = BeautifulSoup(text, 'lxml').get_text() # remove web characters
    cln_text = demoji.replace(cln_text, " ")
    emoji_pattern = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "]+", flags=re.UNICODE)

    cln_noemo = emoji_pattern.sub(r' ', cln_text)
    regex = get_emoji_regexp()
    cln_post = regex.sub(" ", cln_noemo)
    return (cln_post)

# Main function with preprocessing - please keep your eye on order of functions
def preprocess(text):
    text = str(text)
    text = remove_weblinks(text)
    text = remove_mentions(text)
    text = remove_hashtags(text)
    text = remove_emoji(text)
    text = replace_empty(text)
    text = re.sub("\s+"," ", text)
    text = re.sub("’", "'", text)
    text = re.sub("[“”·【】《》（）©£°¥«º»¿¤*~=%&€•@#…“”·、；！？【】《》（）©£¥«º»¿ðð¤_●–«» — — ´ ▽ ∀ ˊᗜ  ω  ▄ 〓█ ● ▂ █ ◥⊙ ▲ ⊙◤ ❶ ❷ ◍ ゜~ ← ▼ ω ♪ → ・｀ ღˇᴗˇ♡ ﾟ ]"," ",text)
    text = re.sub(r"(\bRT\b)"," ",text, flags = re.IGNORECASE|re.MULTILINE)
    text = re.sub(r"(\brt\b)"," ",text, flags = re.IGNORECASE|re.MULTILINE)
    text = re.sub(r"(\bretweet\b)"," ",text, flags = re.IGNORECASE|re.MULTILINE)
    text = re.sub("[-\/\(\)\\\[\]\|]", " ", text, flags = re.IGNORECASE|re.MULTILINE)
    text = re.sub(r"^\W+", "", text)
    text = re.sub(r"(^\W+)?(^from: \w+ on \w+)?",'',text,flags = re.IGNORECASE|re.MULTILINE)
    text = re.sub("\s+", " ", text)
    text = re.sub("⠀"," ", text)
    text = re.sub(r"^\W+", "", text)
    text = re.sub(r"\s+"," ", text)
    return text.strip()

Preprocessing our data - remove unnecessary strings and characters

In [14]:
tqdm.pandas() # loading a progress bar to pandas - to have an estimation of execution time (very important)

In [15]:
df['cleaned_text'] = df['OriginalTweet'].progress_apply(preprocess)

  0%|          | 0/15000 [00:00<?, ?it/s]

In [16]:
df['OriginalTweet']

0        @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1        advice Talk to your neighbours family to excha...
2        Coronavirus Australia: Woolworths to give elde...
3        My food stock is not the only one which is emp...
4        Me, ready to go at supermarket during the #COV...
                               ...                        
14995    For those who still donÂt get it\r\r\n https:...
14996    There have been chaotic scenes at some superma...
14997    The "Roaring 20's" has brought #Covid_19 illne...
14998    @POTUS you must follw the Danish and British e...
14999    @JaimeeKara Demand for certain goods, includin...
Name: OriginalTweet, Length: 15000, dtype: object

In [17]:
df['cleaned_text']

0                                                  and and
1        advice Talk to your neighbours family to excha...
2        Coronavirus Australia: Woolworths to give elde...
3        My food stock is not the only one which is emp...
4        Me, ready to go at supermarket during the outb...
                               ...                        
14995                    For those who still donÂ t get it
14996    There have been chaotic scenes at some superma...
14997    The "Roaring 20's" has brought illness, a stoc...
14998    you must follw the Danish and British economic...
14999    Demand for certain goods, including hand sanit...
Name: cleaned_text, Length: 15000, dtype: object

In [None]:
df['OriginalTweet']

0        @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1        advice Talk to your neighbours family to excha...
2        Coronavirus Australia: Woolworths to give elde...
3        My food stock is not the only one which is emp...
4        Me, ready to go at supermarket during the #COV...
                               ...                        
14995    For those who still donÂt get it\r\r\n https:...
14996    There have been chaotic scenes at some superma...
14997    The "Roaring 20's" has brought #Covid_19 illne...
14998    @POTUS you must follw the Danish and British e...
14999    @JaimeeKara Demand for certain goods, includin...
Name: OriginalTweet, Length: 15000, dtype: object

Adding more constraints on our data - counting number of characters per post and filter out short ones

In [19]:
df["# chars"] = df['cleaned_text'].astype(str).apply(lambda text: len(text))
df = df.drop_duplicates(['cleaned_text']).reset_index(drop=True)
df = df[df['# chars'] > 7].reset_index(drop=True)

In [20]:
df

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment,cleaned_text,# chars
0,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive,advice Talk to your neighbours family to excha...,237
1,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive,Coronavirus Australia: Woolworths to give elde...,107
2,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive,My food stock is not the only one which is emp...,172
3,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative,"Me, ready to go at supermarket during the outb...",194
4,3804,48756,"ÃT: 36.319708,-82.363649",16-03-2020,As news of the regionÂs first confirmed COVID...,Positive,As news of the regionÂ s first confirmed COVID...,214
...,...,...,...,...,...,...,...,...
14955,18794,63746,University of Surrey,22-03-2020,For those who still donÂt get it\r\r\n https:...,Neutral,For those who still donÂ t get it,33
14956,18795,63747,"Birmingham, England",22-03-2020,There have been chaotic scenes at some superma...,Extremely Negative,There have been chaotic scenes at some superma...,233
14957,18796,63748,"Tipperary, Ireland.",22-03-2020,"The ""Roaring 20's"" has brought #Covid_19 illne...",Negative,"The ""Roaring 20's"" has brought illness, a stoc...",195
14958,18797,63749,British,22-03-2020,@POTUS you must follw the Danish and British e...,Extremely Negative,you must follw the Danish and British economic...,247


First, let's try to load our vectorizer

In [21]:
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

Vectorization function - split by chuncks in order to follow the execution time

In [22]:
def use_vectorize(texts, chunk_size=1000) -> np.ndarray:
        embedder = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual/3')
        if len(texts) <= chunk_size:
            return embedder(texts).numpy().astype(np.float16)
            #return embedder(texts).numpy()
        res = []
        for i in tqdm(range(0, len(texts), chunk_size), total=round(len(texts)/chunk_size), desc="Vectorization"):
            cu_texts = texts[i:i+chunk_size]
            res.append(embedder(cu_texts).numpy().astype(np.float16))
        return np.concatenate(res)

Let's convert our text to embeddings (vectors) and save outputs.

In [23]:
embeddings = use_vectorize(df['cleaned_text'].astype(str).tolist())
pickle_out = open("embeddings.p","wb")
pickle.dump(embeddings, pickle_out)
pickle_out.close()

Vectorization:   0%|          | 0/15 [00:00<?, ?it/s]

In [24]:
embeddings.shape

(14960, 512)

Principle Component Analysis - PCA. We use it to reduce dimensions, from 512 to N (in our example it will be 50). You can also use variance explained value, e.g. PCA(0.6) will return you a number of components explaining 60% of total varience of your data

In [25]:
def pca(message_embeddings):
    pca = PCA(n_components = 50)
    newX = pca.fit_transform(message_embeddings)
    newdf = pd.DataFrame(newX)
    newdf.to_pickle('pca.p')
    return (newdf)

In [26]:
df_embed = pca(embeddings)

In [27]:
df_embed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.060646,0.027105,0.121205,0.054254,-0.218270,0.038037,-0.078493,0.206703,-0.166503,-0.034324,...,0.010919,-0.043482,0.086059,-0.137303,0.026032,0.013081,-0.002038,-0.068690,0.093252,-0.070377
1,0.090380,0.304331,0.096642,0.069359,-0.073211,0.216486,0.094874,0.002184,-0.040458,0.055426,...,0.139087,0.006276,-0.008276,0.035679,-0.051182,0.005542,0.135724,0.045229,-0.056655,0.006470
2,-0.213827,-0.200395,0.140440,-0.190376,0.220214,0.149407,-0.260717,0.007024,-0.003331,0.092508,...,-0.056782,0.015608,-0.046241,-0.052177,0.003867,0.064206,-0.094401,0.014805,-0.046000,-0.040481
3,-0.260122,-0.128980,-0.007137,-0.014044,0.172467,0.287648,0.007936,0.064725,0.023135,0.223248,...,-0.063341,-0.081033,-0.083607,0.044031,0.033512,-0.028094,0.008826,-0.008656,0.033404,-0.066264
4,-0.014590,0.207081,0.033275,-0.039106,-0.004005,0.153932,0.042149,0.029214,-0.256814,0.116528,...,-0.070558,0.065006,-0.061580,-0.006118,0.067873,-0.005743,-0.072930,-0.106287,-0.047098,0.107500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14955,0.144842,-0.106214,-0.129890,0.008623,-0.212194,0.084803,-0.128382,-0.056258,0.098361,-0.110678,...,-0.171709,0.214587,-0.115479,-0.203167,0.073306,0.029229,0.056965,-0.074376,-0.012782,-0.113524
14956,-0.260552,0.062662,0.146902,0.308467,0.000712,0.008315,0.115405,0.203969,0.057235,0.081164,...,-0.006316,-0.021147,-0.047777,-0.030510,0.087185,0.003061,-0.041102,0.008674,0.005146,0.044838
14957,0.065565,0.045213,0.183974,-0.022851,0.178102,0.056522,-0.043768,-0.174704,0.005985,0.198023,...,-0.007733,0.080384,-0.010486,0.012558,-0.029332,-0.024022,-0.083314,0.015403,0.015004,0.060976
14958,0.266998,-0.044547,0.029429,-0.167195,0.206740,-0.164190,-0.034389,-0.190962,0.117421,0.144733,...,0.017215,-0.032622,-0.092948,0.039533,0.042455,-0.032799,-0.065614,-0.016494,0.031613,0.040180


Now we try with the simplest clustering algorithm - KMeans.

In [31]:
clustering = KMeans(n_clusters=20, random_state=20).fit(df_embed.values.astype(float))

Adding cluster labels column to our dataframe and check the distribution

In [32]:
df['Cluster'] = clustering.labels_

In [33]:
df['Cluster'].value_counts()

9     1029
7      997
0      986
16     973
8      937
2      881
19     836
10     825
15     739
6      709
5      695
14     692
18     665
17     649
11     632
1      587
13     573
12     534
3      529
4      492
Name: Cluster, dtype: int64

Export our results to Excel workbook and save it

In [34]:
import xlrd
writer = pd.ExcelWriter('Corona_clustered.xlsx', engine='xlsxwriter', options={'strings_to_urls': False})
df.to_excel(writer, index = False)
writer.save()
writer.close()

  
  warn("Calling close() on already closed file.")
