In [None]:
# Colab-only: mount Google Drive at /content/drive; later cells read and write files under that path.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
!pip install sklearn

In [6]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [None]:
!pip install sentencepiece

In [None]:
from absl import logging

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import tensorflow_hub as hub
import sentencepiece as spm
import numpy as np
from absl import logging
import pandas as pd

class Use_model:
    '''
    Sentence embedder built on the lite version of the Universal Sentence Encoder
        ~ https://tfhub.dev/google/universal-sentence-encoder-lite/2

    Also see here: https://www.tensorflow.org/hub/tutorials/semantic_similarity_with_tf_hub_universal_encoder_lite

    The graph, the SentencePiece tokenizer and a single ``tf.Session`` are
    created once in ``__init__`` and reused by every ``text_vector`` call.
    Opening a new session and re-running the initializers per sentence (as
    this class used to do) repeats the expensive set-up for every text and
    adds new initializer ops to the graph on each call.
    '''

    def __init__(self):
        '''
        Build the encoder graph, start the session and load the SentencePiece
        model that ships with the hub module.
        '''
        module_url = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"

        module = hub.Module(module_url)

        self.input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])
        self.encodings = module(
            inputs=dict(
                values=self.input_placeholder.values,
                indices=self.input_placeholder.indices,
                dense_shape=self.input_placeholder.dense_shape))

        # Reduce logging output.
        logging.set_verbosity(logging.ERROR)

        # One long-lived session: variables and lookup tables are initialised
        # exactly once here instead of once per embedded sentence.
        self.session = tf.Session()
        spm_path = self.session.run(module(signature="spm_path"))
        self.session.run([tf.global_variables_initializer(), tf.tables_initializer()])

        self.sp = spm.SentencePieceProcessor()

        with tf.io.gfile.GFile(spm_path, mode="rb") as f:
            self.sp.LoadFromSerializedProto(f.read())

        print("SentencePiece model loaded at {}.".format(spm_path))

    def close(self):
        '''
        Release the TensorFlow session held by this instance. The instance
        cannot embed text afterwards.
        '''
        self.session.close()

    def process_to_IDs_in_sparse_format(self, sp, sentences):

        '''
         A utility method that processes sentences with the sentence piece processor
         'sp' and returns the results in tf.SparseTensor-similar format:
         (values, indices, dense_shape)

         An empty `sentences` list yields ([], [], (0, 0)) instead of raising.
        '''
        ids = [sp.EncodeAsIds(x) for x in sentences]
        # default=0 keeps an empty batch from raising "max() arg is an empty sequence".
        max_len = max((len(x) for x in ids), default=0)
        dense_shape = (len(ids), max_len)
        values = [item for sublist in ids for item in sublist]
        indices = [[row, col] for row, sublist in enumerate(ids) for col in range(len(sublist))]
        return (values, indices, dense_shape)

    def text_vector(self, sentence):
        '''
        Embed one sentence.

        The input is stored on `self.sentence` and the result on `self.vector`.
        Returns the encoder output converted to nested Python lists; because a
        batch of one message is fed, the outer list has a single entry.
        '''
        self.sentence = sentence
        messages = [self.sentence]

        values, indices, dense_shape = self.process_to_IDs_in_sparse_format(self.sp, messages)

        input_placeholder = self.input_placeholder

        message_embeddings = self.session.run(
            self.encodings,
            feed_dict={input_placeholder.values: values,
                       input_placeholder.indices: indices,
                       input_placeholder.dense_shape: dense_shape})

        self.vector = np.array(message_embeddings).tolist()

        return self.vector


In [None]:
# Single encoder instance; the cells below call `embedding.text_vector(...)` on it.
embedding = Use_model()

In [None]:
embedding.text_vector('I love this man')

In [None]:
# Load the article dump and drop the two columns the rest of the notebook does not read.
# Chaining `.drop(...)` (instead of `inplace=True`) keeps the cell idempotent on re-run.
data = pd.read_csv('/content/drive/MyDrive/medium_articles_drop.csv').drop(columns=['authors', 'timestamp'])

# `head` must be called: a bare `data.head` only echoes the bound-method repr.
data.head()

In [34]:
# NOTE: the names are crossed relative to the CSV columns:
# `title` holds the article body (column 'text'), `heading` holds the article title (column 'title').
tags, title, heading = data['tags'], data['text'], data['title']

In [35]:
# NumPy array of the tags column; indexed by row position when tags are grouped per cluster further down.
all_tags = np.array(tags)
all_tags.shape

(1927,)

In [None]:
import nltk

# NLTK resources fetched for the tokenizers, POS tagger, stop-word list and
# WordNet lemmatizer used in the cleaning cells.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [40]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk import SnowballStemmer, WordNetLemmatizer, pos_tag
from nltk.corpus import stopwords
from string import punctuation
import re
from pprint import pprint

In [41]:
# English stop-word list from NLTK's `stopwords` corpus; both cleaning functions filter against it.
stop_words = stopwords.words('english')

In [42]:
cleaned_tokens = []

# textcleaning
def long_text_cleaning(text):
    """Tokenize, lemmatize and stop-word-filter every document in ``text``.

    Each document is split into sentences. Every sentence is word-tokenized
    and POS-tagged; each token is stripped of punctuation and digits,
    lower-cased, lemmatized with WordNet (NN* tags as noun, VB* as verb,
    everything else as adjective) and dropped if it is a single character or
    a stop word.

    Args:
        text: iterable of document strings (e.g. a list or a pandas Series).

    Returns:
        A list with one entry per document. Each entry is a list holding one
        single-element list ``[cleaned_sentence]`` per sentence.

    Side effects:
        The returned entries are also appended to the module-level
        ``cleaned_tokens`` list (the behaviour existing callers rely on), so
        calling this twice without resetting that list duplicates its content.
    """
    # Same character set as the previous pattern, but as a raw string with the
    # hyphen and brackets escaped (no invalid-escape / nested-set warnings).
    junk_chars = re.compile(r"[!$%&'()*+,\-./:;<=>?@\[\]^_`{|} ~0-9]")
    # Built once instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stop_word_set = set(stop_words)

    cleaned_docs = []
    for sen in text:
        cleaned_sen = []

        for sent_token in sent_tokenize(sen):
            new_sen = []

            # POS-tag the original-case tokens; casing helps the tagger.
            for token, tag in pos_tag(word_tokenize(sent_token)):
                # Lower-case BEFORE lemmatizing: the WordNet lookup is
                # case-sensitive, so "Dogs" would otherwise stay "dogs".
                token = junk_chars.sub("", token).lower()

                # Empty and single-character tokens carry no signal.
                if len(token) <= 1:
                    continue

                if tag.startswith("NN"):
                    pos = 'n'
                elif tag.startswith('VB'):
                    pos = 'v'
                else:
                    pos = 'a'

                lemma = lemmatizer.lemmatize(token, pos)

                if lemma and lemma not in stop_word_set:
                    new_sen.append(lemma)

            cleaned_sen.append([' '.join(new_sen)])

        cleaned_docs.append(cleaned_sen)

    cleaned_tokens.extend(cleaned_docs)
    return cleaned_docs

long_text_cleaning(title)


In [43]:
# Collapse every document back into one string: each sentence entry is a
# single-element list, so take element 0 and join the sentences with spaces.
cleaned_text = []
for doc_sentences in cleaned_tokens:
    sentence_strings = [wrapped[0] for wrapped in doc_sentences]
    cleaned_text.append(' '.join(sentence_strings))

In [None]:
cleaned_text[0]

###### Transform text to vectors

In [None]:
import numpy as np

# Embed every cleaned article body. The index is printed purely as a progress
# indicator; `enumerate` replaces the hand-maintained counter.
use_title = []
for index, text_ in enumerate(cleaned_text):
    print(index)
    use_title.append(embedding.text_vector(text_))


###### It takes roughly 3.5 hours to compute the embeddings

In [None]:
use_title[0]

In [43]:
len(use_title)

1927

In [10]:
import json

In [47]:
# The context manager flushes and closes the file even if serialisation fails.
# The former fallback (`json.dumps(use_title, f)` inside a bare `except`) could
# never succeed: `dumps` accepts no file argument and only returns a string, so it
# merely masked the original error and left a truncated file behind.
with open('/content/drive/MyDrive/use_title.json', 'w') as f:
    json.dump(use_title, f)

Cleaning the headings

In [45]:

def heading_cleaning(title):
    """Normalise article headings into space-joined stemmed keywords.

    For every heading: word-tokenize, strip digits, drop tokens that consist
    only of punctuation, drop stop words, Snowball-stem what is left and keep
    stems longer than one character.

    Args:
        title: iterable of heading strings (e.g. a list, array or pandas Series).

    Returns:
        list[str]: one cleaned heading per input heading, in the same order.
    """
    stemmer = SnowballStemmer('english')
    stop_word_set = set(stop_words)
    punctuation_chars = set(punctuation)

    all_cleaned_title = []
    for tit in title:
        cleaned_title = []

        for word in word_tokenize(tit):
            word = re.sub('[0-9]', '', word)

            # `word in punctuation` is a substring test on a string, so tokens
            # such as `` or '' slipped through; test character by character.
            # (Also true for the empty string left by an all-digit token.)
            if all(ch in punctuation_chars for ch in word):
                continue

            # Filter stop words BEFORE stemming: stems such as "becaus" or
            # "veri" no longer match the stop-word list.
            if word.lower() in stop_word_set:
                continue

            stem = stemmer.stem(word)
            if len(stem) > 1:
                cleaned_title.append(stem)

        all_cleaned_title.append(' '.join(cleaned_title))

    return all_cleaned_title

all_cleaned_title = heading_cleaning(heading)

In [47]:
all_cleaned_title[0]

'mental note vol'

In [48]:
# NumPy array of the cleaned headings; the cluster plot further down indexes it by row.
cleaned_tit = np.array(all_cleaned_title)

cleaned_tit.shape

(1927,)

In [23]:
# Reload the cached embeddings; `with` closes the handle even if parsing fails.
with open('/content/drive/MyDrive/use_title.json', 'r') as f_:
    use_title = json.load(f_)

len(use_title)

1927

In [24]:
import numpy as np

# Flatten each stored embedding to one row. The row count comes from the data
# instead of the hard-coded 1927, and -1 (the documented "infer this axis"
# placeholder) replaces the accidental -512. Re-running the cell is a no-op.
use_title = np.array(use_title).reshape(len(use_title), -1)
use_title.dtype, use_title.shape


(dtype('float64'), (1927, 512))

In [26]:
use_title[0].shape

(512,)

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

On K-means

In [28]:
no_of_clusters = 600
# Fixed seed: KMeans initialisation is random, so without it every run yields
# different clusters and different per-cluster tags.
means = KMeans(n_clusters=no_of_clusters, random_state=42)
means.fit(use_title)

KMeans(n_clusters=600)

In [54]:
# Persist the fitted model; `with` closes the file even if pickling fails.
with open('prid_model.sav', 'wb') as f_model:
    pickle.dump(means, f_model)

In [30]:
# Reuse the labels of the model fitted (and pickled) above. `fit_predict` would
# re-fit from scratch, so the labels would no longer correspond to the saved model.
y_km = means.labels_

In [None]:
y = cleaned_tit

for i in range(no_of_clusters):
    # Select the members of cluster i by row mask. The previous version looked
    # each point up again by searching the whole matrix for its float value,
    # which is slow and can land on a different row that shares that value.
    member_mask = y_km == i
    y_plot = use_title[member_mask, 0]
    x_plot = y[member_mask]

    plt.scatter(x_plot, y_plot, s=5)

plt.show()


In [50]:
set_tags = {}

# Same character set as before, written as a raw string with the hyphen and
# brackets escaped so it raises no invalid-escape / nested-set warnings.
tag_junk = re.compile(r"[!$%&'()*+,\-./:;<=>?@\[\]^_`{|}~0-9]")

for i in range(no_of_clusters):

    # Tags of the articles assigned to cluster i, selected by row mask.
    # The former lookup `np.where(use_title == ind)[0][0]` returned the first
    # row sharing ANY single component with the vector, so it could attach
    # another article's tags to the cluster.
    topic = all_tags[y_km == i]

    noth_topic = []
    for w in topic:
        seno_topic = word_tokenize(tag_junk.sub('', w))
        # Keep only words longer than two characters.
        noth_topic.append([topc for topc in seno_topic if len(topc) > 2])

    set_tags['topic ' + str(i)] = noth_topic


In [51]:
set_tags['topic 3']

[['Neuroscience', 'Health', 'Science', 'Psychology'],
 ['Health', 'Covid', 'Body', 'Coronavirus', 'Science'],
 ['Technology', 'Health', 'Covid', 'Life', 'Science'],
 ['Selfawareness', 'Body', 'Safety', 'Health', 'Empowerment']]

In [53]:
import json

# Write the per-cluster tags; `with` closes the file even if serialisation fails.
with open('pridicted_tags.json', 'w') as f_prid_tags:
    json.dump(set_tags, f_prid_tags)