In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [0]:
# Location of the cleaned corpus (presumably a mounted Google Drive in Colab — verify before running).
clean_csv_path = '/content/drive/My Drive/Deep Learning Data/Covid-19 Text Mining/clean_df.csv'
df = pd.read_csv(clean_csv_path)

In [0]:
# Peek at the first five rows of the loaded frame.
df.head(n=5)

In [0]:
# Show the first ten entries of the pre-processed text column.
df['processed_text'].head(10)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
def vectorizer(text, max_features):
    """Fit a TF-IDF model on ``text`` and return the transformed matrix.

    Args:
        text: iterable of documents (assumed to be already cleaned).
        max_features: upper bound on the vocabulary size, i.e. the maximum
            number of columns in the returned matrix.

    Returns:
        The TF-IDF matrix produced by ``TfidfVectorizer.fit_transform``
        (one row per document). The fitted vectorizer itself is not returned.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    return tfidf.fit_transform(text)

In [0]:
# Turn the pre-processed documents into TF-IDF vectors with at most 2**9 = 512 features,
# giving a matrix of shape (no_of_documents x 512).
text = df['processed_text'].values
n_tfidf_features = 2 ** 9
X = vectorizer(text, n_tfidf_features)
X.shape

In [0]:
### PCA and Clustering

In [0]:
from sklearn.decomposition import PCA

In [0]:
# Fraction of the total variance the retained principal components must explain.
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep, random_state=42)
# PCA is fitted on the dense form of the TF-IDF matrix.
X_reduced = pca.fit_transform(X.toarray())
X_reduced.shape

So 341 principal components explain 95% of the variance in our dataset, which is a significant reduction from the original 512 TF-IDF features.

In [0]:
from sklearn.cluster import KMeans


In [0]:
# Cluster the PCA-reduced documents into k groups and store each document's label on the frame.
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit(X_reduced).labels_
df['y'] = y_pred

In [0]:
from sklearn.manifold import TSNE

# Embed the documents with t-SNE for plotting.
# Note: this runs on the dense TF-IDF matrix, not on the PCA-reduced X_reduced.
tsne = TSNE(perplexity=100, random_state=42, verbose=1)
X_embedded = tsne.fit_transform(X.toarray())

In [0]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors: one distinct hue per cluster label actually present in y_pred,
# so the palette always matches the number of hue levels
n_clusters = len(np.unique(y_pred))
palette = sns.hls_palette(n_clusters, l=.4, s=.9)

# plot the 2-D t-SNE embedding coloured by K-Means label.
# x/y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("improved_cluster_tsne.png")
plt.show()

In [0]:
!pip install transformers

We can see some clusters in the t-SNE plot, but they are not well separated.


In [0]:
## We will try to get title embedding using BERT
import torch
from transformers import BertTokenizer, BertModel, BertForMaskedLM

In [0]:
# Optional: enable INFO-level logging to get more detail about what the libraries are doing.
import logging
logging.basicConfig(level="INFO")

In [0]:

# Load the pre-trained tokenizer (vocabulary) for the 'bert-base-uncased' checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

In [0]:
# Take the first title and wrap it in BERT's special tokens.
title_example = df['title'][0]
print("Original title: ", title_example)
# Separate the special tokens from the title with spaces (same form as the later
# "[CLS] " + text + " [SEP]" cell) so they cannot fuse with the neighbouring words.
marked_text = "[CLS] " + title_example + " [SEP]"
print("Title after added special tokens: ", marked_text)

In [0]:
# Split the marked-up title into tokens with the loaded BERT tokenizer and show them.
tokenized_text = tokenizer.tokenize(marked_text)
print(tokenized_text)

In [0]:
# Use another title (row 10) as a second example.
# NOTE: this rebinds `text`, which earlier held the processed-text array.
text = df['title'][10]

# Wrap it in the special tokens, then tokenize.
marked_text = "[CLS] " + text + " [SEP]"
tokenized_text = tokenizer.tokenize(marked_text)

# Look up the vocabulary index of every token.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Print each token next to its vocabulary index.
for token, token_index in zip(tokenized_text, indexed_tokens):
    print('{:<12} {:>6,}'.format(token, token_index))

In [0]:
# Encode every title into fixed-length (200 token) BERT inputs.
#
# NOTE: the list keeps its historical name `token_type_id` because later cells
# read it, but it holds the encoded *input ids*. The original code collected
# `token_type_ids` (segment ids, all zeros for single sentences), so the model
# never saw the actual tokens.
token_type_id = []
attention_mask = []
# Iterate over the values directly instead of df['title'][i], which is a
# label lookup and breaks when the index is not 0..n-1.
for title in df['title']:
    encoded = tokenizer.encode_plus(
        title,
        add_special_tokens=True,
        max_length=200,
        padding='max_length',   # replaces the deprecated pad_to_max_length=True
        truncation=True,        # guarantees every row has exactly max_length tokens
        return_attention_mask=True,
        return_tensors='pt',
    )
    token_type_id.append(encoded['input_ids'])
    attention_mask.append(encoded['attention_mask'])


In [0]:
# Number of per-title tensors collected by the encoding loop (one list entry per title).
len(token_type_id)

In [0]:
# Concatenate the per-title tensors along the first axis into a single tensor.
id_tensor = torch.cat(token_type_id, dim=0)

In [0]:
# Concatenate the per-title attention masks along the first axis into a single tensor.
attention_tensor = torch.cat(attention_mask, dim=0)

In [0]:
from torch.utils.data import DataLoader
# Batch the ids and the attention masks with two loaders. Both keep the default
# shuffle=False, so iterating them in lockstep yields matching rows.
# Fixes: the class is `DataLoader` (the `Dataloader` spelling raised NameError), and the
# attention loader must wrap `attention_tensor`, not `id_tensor`.
id_generator = DataLoader(id_tensor, batch_size = 32, num_workers = 1)
attention_generator = DataLoader(attention_tensor, batch_size = 32, num_workers = 1)

In [0]:
## Use pretrained BERT as a frozen feature extractor.
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()  # inference mode: disables dropout
embeddings=[]
# No gradients are needed for feature extraction.
with torch.no_grad():
    for batch_ids, batch_mask in zip(id_generator, attention_generator):
        # The keyword is `attention_mask` (singular); `attention_masks` raises TypeError.
        outputs = model(input_ids=batch_ids, attention_mask=batch_mask)
        # Index 0 is the last hidden state for both tuple and ModelOutput returns;
        # tuple-unpacking a ModelOutput would yield its field names, not tensors.
        encoded_layers = outputs[0]
        embeddings.append(encoded_layers)

        