In [1]:
!pip install -U sentence-transformers
!pip install plotly==4.14.1
!pip install torch

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/6a/e2/84d6acfcee2d83164149778a33b6bdd1a74e1bcb59b2b2cd1b861359b339/sentence-transformers-0.4.1.2.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 10.0MB/s 
[?25hCollecting transformers<5.0.0,>=3.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 25.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/14/67/e42bd1181472c95c8cda79305df848264f2a7f62740995a46945d9797b67/sentencepiece-0.1.95-cp36-cp36m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 60.3MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-many

In [2]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sys

# Put the project folder on Drive on the module search path.
# Guarded so that re-running this cell does not append duplicate entries.
_PROJECT_DIR = '/content/drive/My Drive/data/icns_project'
if _PROJECT_DIR not in sys.path:
  sys.path.append(_PROJECT_DIR)

In [4]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from pathlib import Path

from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import euclidean, pdist, squareform
from sklearn import manifold          #use this for MDS computation

#visualization libs
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
% matplotlib inline

In [5]:
# Anchor on the absolute Drive mount point (the same '/content/drive' used by
# drive.mount and sys.path) so the paths do not depend on the working directory.
DATA_PATH = Path('/content/drive') / 'My Drive' / 'data' / 'icns_project'
# The model directory sits inside the project data folder.
MODEL_PATH = DATA_PATH / 'paraphrase-distilroberta-base-v1'

In [6]:
# Show up to 800 characters per column when displaying frames.
pd.set_option('max_colwidth', 800)

In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name only accepts CUDA devices, so query it only on GPU.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"
print('using device: ', device_name, flush=True)

using device:  Tesla T4


In [8]:
# Load the sentence-embedding model from the local directory at MODEL_PATH.
model = SentenceTransformer(str(MODEL_PATH))

In [9]:
# Read the BBC news CSV from the project data folder.
df = pd.read_csv(DATA_PATH / 'BBC_news_adjusted.csv', encoding='utf-8')

In [12]:
# Sanity check: (rows, columns) of the freshly loaded frame.
df.shape

(1490, 2)

In [13]:
# Normalise the column names to lower case.
df = df.rename(columns={'Text': 'text', 'Category': 'category'})

In [14]:
# Class balance: number of rows per category.
df['category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: category, dtype: int64

In [15]:
# Drop rows whose 'text' value is missing.
df = df.dropna(subset=['text'])

In [16]:
# Re-check the shape to see whether any rows were dropped.
df.shape

(1490, 2)

In [17]:
# Plain Python list of the texts, in row order.
texts = df['text'].tolist()

In [18]:
%%time
# Encode every text with the sentence-transformer model; %%time reports the cost.
embs = model.encode(texts)

CPU times: user 5.25 s, sys: 2.53 s, total: 7.78 s
Wall time: 13.6 s


In [38]:
def plt_dists(dists, df, dims=2, title=""):
  '''
  Plot a precomputed distance matrix as a 2D/3D scatter plot using MDS.

  One trace is drawn per value of df['category']; hovering over a point shows
  the matching df['text'].

  dists: precomputed square distance matrix with one row/column per row of df,
         in the same order as the rows of df
  df: dataframe with columns 'category' and 'text'. NOTE: it is modified in
      place - the MDS coordinates are written to columns 'x', 'y' (and 'z'
      for 3D plots)
  dims: 2/3 for 2 or 3 dimensional plot, defaults to 2 for any other value passed
  title: title for the plot (no title is set when empty)

  Raises ValueError if dists and df do not have the same number of rows.
  '''
  #https://community.plotly.com/t/plotly-colours-list/11730/6
  colorscale = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A',
                '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

  # honour the documented contract: anything other than 3 means a 2D plot
  if dims != 3:
    dims = 2

  if len(dists) != len(df):
    raise ValueError(
        'dists has %d rows but df has %d rows' % (len(dists), len(df)))

  #calculate MDS with number of dims passed
  mds = manifold.MDS(n_components=dims, dissimilarity="precomputed",
                     random_state=60, max_iter=90000)
  #get coordinates for each point
  coords = mds.fit(dists).embedding_

  # Assign the raw arrays so values are matched to rows by position. Wrapping
  # them in pd.Series would align on index labels and scramble the points (or
  # insert NaN) whenever df does not have a default 0..n-1 index.
  for pos, axis in enumerate(('x', 'y', 'z')[:dims]):
    df[axis] = coords[:, pos]

  fig = go.Figure()
  # Series.items() replaces iteritems(), which was removed in pandas 2.0
  for idx, (cat, _count) in enumerate(df['category'].value_counts().items()):
    mask = df['category'] == cat
    # settings shared by the 2D and 3D traces; only the marker size differs
    common = dict(
        mode='markers', textposition="top center",
        marker=dict(
            size=10 if dims == 3 else 12,
            # cycle through the palette when there are more than 10 categories
            color=colorscale[idx % len(colorscale)],
            opacity=0.8,
        ),
        text=df.loc[mask, 'text'], hoverinfo='text',
        name=cat)
    if dims == 3:
      trace = go.Scatter3d(x=df.loc[mask, 'x'], y=df.loc[mask, 'y'],
                           z=df.loc[mask, 'z'], **common)
    else:
      trace = go.Scatter(x=df.loc[mask, 'x'], y=df.loc[mask, 'y'], **common)
    fig.add_trace(trace)

  fig.update_layout(template="plotly_dark")
  if title:
    fig.update_layout(title_text=title)
  fig.show()

In [39]:
def eval_vecs(dists, df, category='', viz_dims=2,
              title='Comparison of embeddings from news dataset'):
  '''
  Visualise a precomputed distance matrix by handing it to plt_dists.

  No distances are computed here; the caller supplies them.

  inputs:
  dists: precomputed pairwise distance matrix, passed through unchanged
  df: dataframe forwarded to plt_dists (which reads 'category' and 'text')
  category: currently unused; kept so existing calls keep working
  viz_dims: 2/3 for 2D or 3D plot
  title: title for the plot; defaults to the previously hard-coded heading
  '''
  plt_dists(dists, df, dims=viz_dims, title=title)

In [31]:
# Pairwise cosine distances between all embeddings.
distances = cosine_distances(embs)

In [40]:
# Draw the 2D plot of the distance matrix, one trace per category.
eval_vecs(distances, df, category='', viz_dims=2)