In [None]:
# Install the third-party packages imported further down in this notebook.
# NOTE(review): only plotly is pinned; sentence-transformers and torch are
# installed at whatever version pip resolves, so re-runs may differ over time.
!pip install -U sentence-transformers
!pip install plotly==4.14.1
!pip install torch

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import sys
sys.path.append('/content/drive/My Drive/data/icns_project')

In [4]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from pathlib import Path

from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import euclidean, pdist, squareform
from sklearn import manifold          #use this for MDS computation

#visualization libs
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# no space between '%' and the magic name: newer IPython versions reject '% matplotlib'
%matplotlib inline

In [5]:
# Anchor on the absolute mount point passed to drive.mount() so the paths do
# not depend on the kernel's current working directory.
DATA_PATH = Path('/content/drive') / 'My Drive' / 'data' / 'icns_project'
# The model directory lives inside the project data directory.
MODEL_PATH = DATA_PATH / 'paraphrase-distilroberta-base-v1'

In [6]:
pd.set_option('max_colwidth', 800)

In [7]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name() only accepts CUDA devices, so query it only
# when a GPU was actually selected; otherwise the CPU fallback would raise.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "CPU"
print('using device: ', device_name, flush=True)

using device:  Tesla T4


In [10]:
model = SentenceTransformer(str(MODEL_PATH))

In [11]:
df = pd.read_csv(DATA_PATH / 'jokes_combined.csv', encoding='utf-8')

In [12]:
news_df = pd.read_csv(DATA_PATH / 'BBC_news_adjusted.csv', encoding='utf-8')

In [13]:
news_df.shape

(1490, 2)

In [14]:
news_df = news_df.rename({'Text': 'text', 'Category': 'category'}, axis=1)

In [15]:
news_df['category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: category, dtype: int64

In [16]:
df['category'].value_counts()

Other / Misc       2302
Men / Women         924
One Liners          923
Miscellaneous       701
Children            670
Animal              656
Yo Momma            600
Blond               598
Religious           516
Insults             462
Puns                457
At Work             288
Medical             284
News / Politics     279
Redneck             267
Gross               253
Men                 190
Sports              171
Knock-Knock         167
Lawyer              157
Bar                 154
Tech                151
Yo Mama             144
Women               143
College             131
Light Bulbs         119
Political           112
Blonde Jokes        111
Lightbulb           110
Heaven and Hell      85
Family, Parents      80
Money                77
Animals              74
Bar Jokes            66
Computers            64
Police Jokes         55
Sex                  54
Lawyers              50
Love & Romance       48
Military             47
Crazy Jokes          43
Business        

In [None]:
# Build a sample made of every joke in two categories ('Sports' and 'Religious'),
# stacked one after the other (no row-count limit is applied here).
sample_a = df[df['category'] == 'Sports']
sample_b = df[df['category'] == 'Religious']
sample = pd.concat([sample_a, sample_b])

In [None]:
sample

Unnamed: 0,text,score,category
191905,"Two men are approaching each other on a sidewalk. Both are dragging their right foot as they walk. As they meet, one man looks at the other knowingly, points to his foot and says, ""Vietnam, 1969."" The other points his thumb behind him and says, ""Dog crap, 20 feet back.""",4.25,Sports
192073,"A man went to visit his 90 year old grandfather in a very secluded rural area of the state he lived in.After spending the night, his grandfather prepared breakfast for him consisting of eggs and bacon. He noticed a film like substance on his plate and he questioned his grandfather, ""are these plates clean?""His grandfather replied, ""Those plates are as clean as cold water can get them, so go on and finish your meal"".That afternoon, while eating the hamburgers his grandfather made for lunch, he noticed tiny specks around the edge of his plate and a substance that looked like dried egg yokes, so he ask again, ""Are you sure these plates are clean""?Without looking up from his hamburger, the grandfather says, ""I told you before; those dishes are as clean as cold water can get them. Now don't...",3.67,Sports
192088,"A man is walking down the street when he sees a sign in the window of a travel agency that says CRUISES - $100. He goes into the agency and hands the guy $100. The travel agent then whacks him over the head with a baseball bat and throws him in the river.Another man is walking down the street a half hour later, sees the sign and pays the guy $100. The travel agent then whacks him with the baseball bat and throws him in the river.Sometime later, the two men are floating down the river together and the first man asks, ""Do you think they'll serve any food on this cruise?""The second man says, ""I don't think so. They didn't do it last year.""",3.00,Sports
192111,"""How was your golf game, dear?"" asked Jack's wife Tracy.""Well, I was hitting pretty well, but my eyesight's gotten so bad Icouldn't see where the ball went.""""But you're seventy-five years old, Jack!"" admonished his wife,""Why don't you take my brother Scott along?""""But he's eighty-five and doesn't even play golf anymore,""protested Jack.""But he's got perfect eyesight. He could watch your ball,""Tracy pointed out.The next day Jack teed off with Scott looking on. Jack swung,and the ball disappeared down the middle of the fairway.""Do you see it?"" asked Jack.""Yup,"" Scott answered.""Well, where is it?"" yelled Jack, peering off into the distance.""I forgot.""",2.83,Sports
192155,"It is the Olympic men's figure skating. Out comes the Russian competitor, he skates around to some classical music in a slightly dull costume, performs some excellent leaps but without any great artistic feel for the music. The Judges' scores read: Britain 5.8: Russia 5.9: United States 5.5: Ireland 6.0 Next comes the American competitor in a sparkling stars and stripes costume, skating to some rock and roll music. He gets the crowd clapping, but is not technically as good as the Russian. He slightly misses landing a triple Salchow and loses the center during a spin. But, artistically, it is a more satisfying performance. The Judges' scores read: Britain 5.8: Russia 5.5: United States 5.9: Ireland 6.0 Finally out comes the Irish competitor wearing a tatty old donkey jacket, wit...",1.50,Sports
...,...,...,...
204481,"Mik:Darn it! There's only 2 chips in my bowl.Damn you,chips! Mak: Aargh! you made me so angry I am gonna punch them! Mak punches the chips. Mik: WHOAH! you made 2 big chips into 20 small ones! Mak: I AM JESUS OF THE DORITOS!!",,Religious
204489,"Little Katie was at Sunday school one day. The teacher asked the class ""Who is someone in your life that worships God by always speaking His name?"" Little Katie raised her hand and said ""The fifth grade teacher at my school! Every time we pass by her room on the way to art I hear her say ""I swear to God I have the worst behaved class in the world!""",,Religious
204491,"Billy was walking in a shopping center with his mom, and suddenly she stopped to pick up a penny. When she reached out for it, he saw armpit hair. Frightened, he said, ""You're not my mom! I'm calling the police."" The man pulled off his mask and said, ""Okay, you got me. But tell me one thing. How did you know I wasn't your mom?"" ""Because my mom's not Jewish.""",,Religious
204494,"So I asked a religious truck driver what his CB handle is. His answer: ""My handle's 'Messiah'."" (Get it? Sounds like ""Handel's Messiah."")",,Religious


In [None]:
sample.text.to_list()

['Yo mama so dumb she burnt down the house using a cd burner',
 '10 Yo Mama got so fat when she jump in the air, she got stuck.9 Yo Mama is soo  fat when she trip and fall she made the Grand Caryon.8 Yo Mama is soo fat when she step on the scale said "Out of Order."7 Yo Mama is soo fat when she wore a red rain jacket, everyone yelled "Hey Kool-Ade!"6 Yo Mama is soo fat when she bungee jump she broke the bridge in half!5 Yo Mama is soo fat she wears a V.C.R. as a pager.4 Yo Mama is soo fat that the city gave her own zip code.3 Yo Mama is soo fat everyone at the baseball sadtium sat on her.2 Yo Mama is soo fat when she drop you off at school, she got a ticket for littering.1 Yo Mama is soo fat takes you a five mile walk around her.',
 'A man went to a doctor, and said he wanted to be able to get a job at the local Post Office, but unfortunately he was too smart. The doctor asked him his IQ, and when he gave a three-digit reply, the doctor told him that the procedure would have to involve

In [None]:
sample_embs = model.encode(sample.text.to_list())

In [None]:
sample_embs

array([[ 0.12699974, -0.00737525, -0.05100042, ...,  0.02139805,
        -0.16290238, -0.11126881],
       [-0.36798954, -0.37642103,  0.12126566, ...,  0.23382455,
        -0.05552831,  0.24960634],
       [-0.07327329,  0.15853071,  0.08581558, ..., -0.22744384,
        -0.11506341,  0.05415363],
       ...,
       [ 0.03978144,  0.30997607, -0.06981759, ...,  0.16887568,
        -0.26433602,  0.05298478],
       [-0.2028626 ,  0.17286932, -0.3297061 , ...,  0.01709026,
         0.10450532,  0.15073754],
       [-0.28460032,  0.19522893, -0.02801261, ...,  0.18380442,
        -0.11625353, -0.09927955]], dtype=float32)

In [None]:
sample_texts = sample.text.to_list()

In [None]:
len(sample_embs[0])

768

In [None]:
# Pairwise cosine similarity between all sample embeddings
cosine_scores = util.pytorch_cos_sim(sample_embs, sample_embs)

# Collect every unordered pair (i < j) together with its similarity score
n_sents = len(cosine_scores)
pairs = [
    {'index': [i, j], 'score': cosine_scores[i][j]}
    for i in range(n_sents - 1)
    for j in range(i + 1, n_sents)
]

# Most similar pairs first
pairs.sort(key=lambda entry: entry['score'], reverse=True)

# Show the ten most similar pairs (first 30 characters of each text)
for pair in pairs[:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sample_texts[i][:30], sample_texts[j][:30], pair['score']))

One Sunday a pastor asked his  		 One Sunday a pastor asked his  		 Score: 0.9983
A pious man, who had reached t 		 A pious man, who had reached t 		 Score: 0.9888
One morning a man came into th 		 One morning a man came into th 		 Score: 0.9887
Two priests died at the same t 		 Two priests died at the same t 		 Score: 0.9846
One day Jesus was out for a wa 		 One day Jesus was out for a wa 		 Score: 0.9842
John Smith was the only Protes 		 John Smith was the only Protes 		 Score: 0.9820
Three Pastors from the south w 		 Three Pastors in the south wer 		 Score: 0.9818
It is the Olympic men's figure 		 It is the Olympic men's figure 		 Score: 0.9771
A blonde was telling a priest  		 A blonde was telling her pries 		 Score: 0.9740
After a few days on the new Ea 		 After a few days, the Lord cal 		 Score: 0.9679


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204542 entries, 0 to 204541
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   text      204541 non-null  object 
 1   score     194531 non-null  float64
 2   category  13133 non-null   object 
dtypes: float64(1), object(2)
memory usage: 4.7+ MB


In [None]:
df = df.dropna(subset=['text'])

In [None]:
df.shape

(204541, 3)

In [17]:
jokes_sample = df[df['category'] == 'Sports']
news_sample = news_df[news_df['category'] == 'sport']

In [25]:
jokes_sample = jokes_sample.assign(source='joke')
news_sample = news_sample.assign(source='news')

In [28]:
jokes_sample.head(3)

Unnamed: 0,text,score,category,source
191905,"Two men are approaching each other on a sidewalk. Both are dragging their right foot as they walk. As they meet, one man looks at the other knowingly, points to his foot and says, ""Vietnam, 1969."" The other points his thumb behind him and says, ""Dog crap, 20 feet back.""",4.25,Sports,joke
192073,"A man went to visit his 90 year old grandfather in a very secluded rural area of the state he lived in.After spending the night, his grandfather prepared breakfast for him consisting of eggs and bacon. He noticed a film like substance on his plate and he questioned his grandfather, ""are these plates clean?""His grandfather replied, ""Those plates are as clean as cold water can get them, so go on and finish your meal"".That afternoon, while eating the hamburgers his grandfather made for lunch, he noticed tiny specks around the edge of his plate and a substance that looked like dried egg yokes, so he ask again, ""Are you sure these plates are clean""?Without looking up from his hamburger, the grandfather says, ""I told you before; those dishes are as clean as cold water can get them. Now don't...",3.67,Sports,joke
192088,"A man is walking down the street when he sees a sign in the window of a travel agency that says CRUISES - $100. He goes into the agency and hands the guy $100. The travel agent then whacks him over the head with a baseball bat and throws him in the river.Another man is walking down the street a half hour later, sees the sign and pays the guy $100. The travel agent then whacks him with the baseball bat and throws him in the river.Sometime later, the two men are floating down the river together and the first man asks, ""Do you think they'll serve any food on this cruise?""The second man says, ""I don't think so. They didn't do it last year.""",3.0,Sports,joke


In [27]:
news_sample.head(3)

Unnamed: 0,text,category,source
6,wales silent on grand slam talk rhys williams says wales are still not thinking of winning the grand slam despite a third six nations win. that s the last thing on our minds at the moment said williams a second- half replacement in saturday s 24-18 win over france in paris. we all realise how difficult a task it is to go up to scotland and beat them.,sport,news
14,ireland 21-19 argentina an injury-time dropped goal by ronan o gara stole victory for ireland from underneath the noses of argentina at lansdowne road on saturday. o gara kicked all of ireland s points with two dropped goals and five penalties to give the home side a 100% record in their autumn internationals.,sport,news
15,wenger signs new deal arsenal manager arsene wenger has signed a new contract to stay at the club until may 2008. wenger has ended speculation about his future by agreeing a long-term contract that takes him beyond the opening of arsenal s new stadium in two years.,sport,news


In [29]:
combined = pd.concat([jokes_sample.drop('score', axis=1), news_sample])

In [30]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 517 entries, 191905 to 1481
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      517 non-null    object
 1   category  517 non-null    object
 2   source    517 non-null    object
dtypes: object(3)
memory usage: 16.2+ KB


In [31]:
texts = combined.text.to_list()

In [32]:
%%time
embs = model.encode(texts)

CPU times: user 909 ms, sys: 547 ms, total: 1.46 s
Wall time: 1.39 s


In [46]:
def plt_dists(dists, labels, texts, dims=2, title=""):
  '''
  Project a precomputed distance matrix with MDS and show it as a 2D/3D
  plotly scatter plot, one marker per point, colored by label.

  dists: precomputed square distance matrix (handed to MDS with
         dissimilarity="precomputed")
  labels: iterable with one label per point; each distinct label gets its
          own color (at most 10 distinct labels are supported)
  texts: one text per point, shown when hovering over the marker
  dims: 3 for a 3 dimensional plot; any other value gives a 2 dimensional plot
  title: title for the plot (no title is set when empty)

  Returns None. When there are more than 10 distinct labels nothing is
  plotted and a UserWarning is emitted instead.
  '''
  import warnings

  #one discrete color per label
  #https://community.plotly.com/t/plotly-colours-list/11730/6
  palette = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']

  #materialize once so any iterable (Series, list, generator) can be passed
  labels = list(labels)
  #dict.fromkeys keeps first-seen order, so the label -> index mapping is stable
  labels_dict = {label: idx for idx, label in enumerate(dict.fromkeys(labels))}

  # TODO: increase limit and add new colors if needed
  if len(labels_dict) > len(palette):
    #tell the caller why no figure appears instead of returning silently
    warnings.warn(
        f"plt_dists supports at most {len(palette)} distinct labels, "
        f"got {len(labels_dict)}; nothing was plotted",
        stacklevel=2,
    )
    return

  #pass explicit color strings: feeding integer indices through a colorscale
  #would stretch/interpolate them over the whole palette
  color = [palette[labels_dict[label]] for label in labels]

  #anything other than 3 falls back to 2, as documented; this also keeps MDS
  #from computing components that would never be plotted
  n_dims = 3 if dims == 3 else 2

  #dists is precomputed using cosine similarity and passed
  #calculate MDS with the resolved number of dims
  mds = manifold.MDS(n_components=n_dims, dissimilarity="precomputed", random_state=60, max_iter=90000)

  #get coordinates for each point
  coords = mds.fit(dists).embedding_

  #options shared by the 2D and 3D traces
  common = dict(
      mode='markers',
      textposition="top center",
      text=texts,
      hoverinfo='text',
  )

  #plot
  if n_dims == 3:
    trace = go.Scatter3d(
        x=coords[:, 0],
        y=coords[:, 1],
        z=coords[:, 2],
        marker=dict(size=10, color=color, opacity=0.8),
        **common
    )
  else:
    trace = go.Scatter(
        x=coords[:, 0],
        y=coords[:, 1],
        marker=dict(size=12, color=color, opacity=0.8),
        **common
    )

  fig = go.Figure(data=[trace])
  fig.update_layout(template="plotly_dark")
  if title != "":
    fig.update_layout(title_text=title)
  fig.show()

In [47]:
def eval_vecs(input_vectors, sent_labels, sent_texts, category='', viz_dims=2):
  '''
  function to calculate cosine distance between each pair of input sentence vectors
  and then pass it to the visualization function (plt_dists)

  inputs:
  input_vectors: array of sentence vectors, passed to cosine_distances
                 (which treats each row as one sample)
  sent_labels: one label per sentence, forwarded to plt_dists as `labels`
  sent_texts: one text per sentence, forwarded to plt_dists as `texts`
  category: category name inserted into the plot title; left out of the
            title when empty
  viz_dims: 2/3 for 2D or 3D plot
  '''

  #avoid a dangling "from  category" in the title when no category is given
  if category:
    title = f'Comparison of embeddings from {category} category'
  else:
    title = 'Comparison of embeddings'

  #call the plot function on the cosine distance matrix
  dists = cosine_distances(input_vectors)
  plt_dists(dists, labels=sent_labels, texts=sent_texts, dims=viz_dims, title=title)

In [48]:
eval_vecs(embs, combined['source'], combined['text'], category='Sports', viz_dims=3)