In [None]:
!pip install -U sentence-transformers
!pip install plotly==4.14.1
!pip install torch

In [2]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem at /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys

# Put the Drive project folder on the module search path.
project_dir = '/content/drive/My Drive/data/icns_project'
sys.path.append(project_dir)

In [4]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from pathlib import Path

from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from scipy.spatial.distance import euclidean, pdist, squareform
from sklearn import manifold          #use this for MDS computation

#visualization libs
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
% matplotlib inline

In [5]:
# Project folder on the mounted Google Drive. Anchored at the absolute mount
# point ('/content/drive', the same location passed to drive.mount and used for
# sys.path) so the paths keep working if the working directory changes.
DATA_PATH = Path('/content/drive') / 'My Drive' / 'data' / 'icns_project'
# The sentence-transformers model directory lives inside the project folder.
MODEL_PATH = DATA_PATH / 'paraphrase-distilroberta-base-v1'

In [6]:
# Show up to 800 characters per cell so long texts are readable when frames are displayed.
pd.options.display.max_colwidth = 800

In [7]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch.cuda.get_device_name() only accepts CUDA devices, so it must not be
# called on the CPU fallback (it would raise there).
device_label = torch.cuda.get_device_name(device) if device.type == "cuda" else "cpu"
print('using device: ', device_label, flush=True)

using device:  Tesla T4


In [8]:
# Load the sentence-embedding model from the local directory on Drive.
model = SentenceTransformer(model_name_or_path=str(MODEL_PATH))

In [9]:
# Load the jokes corpus.
jokes_csv = DATA_PATH / 'jokes_stupid_wocka_relevant.csv'
df = pd.read_csv(jokes_csv, encoding='utf-8')

In [10]:
# Load the BBC news corpus.
news_csv = DATA_PATH / 'BBC_news_adjusted.csv'
news_df = pd.read_csv(news_csv, encoding='utf-8')

In [11]:
# Dimensions of the freshly loaded news frame as (rows, columns).
news_df.shape

(1490, 2)

In [12]:
# Lower-case the news column names so they match the 'text' / 'category' columns of the jokes frame.
news_df = news_df.rename(columns={'Text': 'text', 'Category': 'category'})

In [13]:
# Number of news articles per category.
news_df['category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: category, dtype: int64

In [14]:
# Number of jokes per category, using the labels as they come from the jokes CSV.
df['category'].value_counts()

At Work            264
News / Politics    258
Sports             166
Tech               126
Political          109
Money               68
Computers           55
Business            40
Office Jokes        17
Name: category, dtype: int64

In [15]:
# Collapse the joke categories onto the labels used by the news frame
# (business / politics / sport / tech). Labels not listed here are left as-is.
joke_to_news_category = {
    'At Work': 'business',
    'News / Politics': 'politics',
    'Sports': 'sport',
    'Tech': 'tech',
    'Political': 'politics',
    'Money': 'business',
    'Computers': 'tech',
    'Business': 'business',
    'Office Jokes': 'business',
}
df['category'] = df['category'].replace(joke_to_news_category)

In [16]:
# Joke counts per category after the relabelling step.
df['category'].value_counts()

business    389
politics    367
tech        181
sport       166
Name: category, dtype: int64

In [17]:
# Jokes frame dimensions before rows with missing text are dropped.
df.shape

(1103, 5)

In [18]:
# Keep only the jokes that actually have text.
has_text = df['text'].notna()
df = df[has_text]

In [19]:
# Jokes frame dimensions after the drop; compare with the shape shown before it.
df.shape

(1103, 5)

In [20]:
# News frame dimensions before rows with missing text are dropped.
news_df.shape

(1490, 2)

In [21]:
# Keep only the news articles that actually have text.
has_text = news_df['text'].notna()
news_df = news_df[has_text]

In [22]:
# News frame dimensions after the drop; compare with the shape shown before it.
news_df.shape

(1490, 2)

In [23]:
# Tag every row with the corpus it came from so the rows stay distinguishable
# once both frames are concatenated.
# NOTE(review): the jokes frame seems to already carry a 'source' column (its
# column count does not grow here), in which case this overwrites it — confirm.
JOKE_SOURCE, NEWS_SOURCE = 'joke', 'news'
df = df.assign(source=JOKE_SOURCE)
news_df = news_df.assign(source=NEWS_SOURCE)

In [24]:
# Preview the first two joke rows.
df.head(2)

Unnamed: 0,text,category,source,score,length
0,"A brunette, a blonde, and a redhead all worked in the same office with the same female boss. Every day, they noticed their boss left work early. One day, the girls decided that when the boss left, they'd leave right behind her. After all, she never called in or came back to the office when she left early, so how was she to know? The next day, they all three left the office right after the boss left. The brunette was thrilled to be home early. She did a little gardening and went to bed early. The redhead was elated to be able to get in a quick workout at the health club before meeting her dinner date. The blonde was happy, happy, happy to be home, but when she got to the bedroom she heard a muffled noise from inside. Slowly, quietly, she cracked open the door and was mortified to se...",business,joke,3.73,260
1,"Bill and Hillary Clinton went out to dinner and when the waiter came to take their order, he asked Bill how he wanted his steak, she replied, ""medium."" Then the waiter said, ""how about your vegetable?"" Bill replied, ""Oh, she can order for herself.""",politics,joke,3.5,59


In [25]:
# Preview the first two news rows.
news_df.head(2)

Unnamed: 0,text,category,source
0,worldcom ex-boss launches defence lawyers defending former worldcom chief bernie ebbers against a battery of fraud charges have called a company whistleblower as their first witness. cynthia cooper worldcom s ex-head of internal accounting alerted directors to irregular accounting practices at the us telecoms giant in 2002.,business,news
1,german business confidence slides german business confidence fell in february knocking hopes of a speedy recovery in europe s largest economy. munich-based research institute ifo said that its confidence index fell to 95.5 in february from 97.5 in january its first decline in three months.,business,news


In [26]:
# Reduce the jokes frame to the three columns the news frame also has.
shared_columns = ['text', 'category', 'source']
df = df.loc[:, shared_columns]

In [27]:
# Stack jokes on top of news and renumber the rows from 0.
frames = (df, news_df)
combined = pd.concat(frames, axis=0, ignore_index=True)

In [28]:
# Column dtypes and non-null counts of the merged frame.
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2593 entries, 0 to 2592
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      2593 non-null   object
 1   category  2593 non-null   object
 2   source    2593 non-null   object
dtypes: object(3)
memory usage: 60.9+ KB


In [29]:
# Number of rows each source contributes to the merged frame.
combined['source'].value_counts()

news    1490
joke    1103
Name: source, dtype: int64

In [33]:
# Report the sequence-length limit the loaded model is currently configured with.
print(f"Max Sequence Length: {model.max_seq_length}")

Max Sequence Length: 128


In [34]:
# tests with preprocessing and model max seq length

In [35]:
# Normalise casing: lower-case every text in the merged frame.
combined['text'] = combined['text'].str.lower()

In [36]:
# Remove every '!', '?' and '"' character from the texts.
# The pattern is a regex character class, so regex=True must be explicit:
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the texts unchanged (older versions emit a FutureWarning).
combined['text'] = combined['text'].str.replace(r'[!?"]', '', regex=True)

In [37]:
# Subset: jokes labelled 'sport' (optionally down-sample with .sample(5)).
is_sport_joke = combined['source'].eq('joke') & combined['category'].eq('sport')
tmp_jokes = combined.loc[is_sport_joke]

In [38]:
# Display the sport-jokes subset.
tmp_jokes

Unnamed: 0,text,category,source
6,"two men are approaching each other on a sidewalk. both are dragging their right foot as they walk. as they meet, one man looks at the other knowingly, points to his foot and says, vietnam, 1969. the other points his thumb behind him and says, dog crap, 20 feet back.",sport,joke
14,"a man went to visit his 90 year old grandfather in a very secluded rural area of the state he lived in.after spending the night, his grandfather prepared breakfast for him consisting of eggs and bacon. he noticed a film like substance on his plate and he questioned his grandfather, are these plates cleanhis grandfather replied, those plates are as clean as cold water can get them, so go on and finish your meal.that afternoon, while eating the hamburgers his grandfather made for lunch, he noticed tiny specks around the edge of his plate and a substance that looked like dried egg yokes, so he ask again, are you sure these plates are cleanwithout looking up from his hamburger, the grandfather says, i told you before; those dishes are as clean as cold water can get them. now don't ask me a...",sport,joke
17,"a man is walking down the street when he sees a sign in the window of a travel agency that says cruises - $100. he goes into the agency and hands the guy $100. the travel agent then whacks him over the head with a baseball bat and throws him in the river.another man is walking down the street a half hour later, sees the sign and pays the guy $100. the travel agent then whacks him with the baseball bat and throws him in the river.sometime later, the two men are floating down the river together and the first man asks, do you think they'll serve any food on this cruisethe second man says, i don't think so. they didn't do it last year.",sport,joke
18,"how was your golf game, dear asked jack's wife tracy.well, i was hitting pretty well, but my eyesight's gotten so bad icouldn't see where the ball went.but you're seventy-five years old, jack admonished his wife,why don't you take my brother scott alongbut he's eighty-five and doesn't even play golf anymore,protested jack.but he's got perfect eyesight. he could watch your ball,tracy pointed out.the next day jack teed off with scott looking on. jack swung,and the ball disappeared down the middle of the fairway.do you see it asked jack.yup, scott answered.well, where is it yelled jack, peering off into the distance.i forgot.",sport,joke
23,"it is the olympic men's figure skating. out comes the russian competitor, he skates around to some classical music in a slightly dull costume, performs some excellent leaps but without any great artistic feel for the music. the judges' scores read: britain 5.8: russia 5.9: united states 5.5: ireland 6.0 next comes the american competitor in a sparkling stars and stripes costume, skating to some rock and roll music. he gets the crowd clapping, but is not technically as good as the russian. he slightly misses landing a triple salchow and loses the center during a spin. but, artistically, it is a more satisfying performance. the judges' scores read: britain 5.8: russia 5.5: united states 5.9: ireland 6.0 finally out comes the irish competitor wearing a tatty old donkey jacket, wit...",sport,joke
...,...,...,...
1068,"top ten signs you're not watching a real baseball team from late show with david letterman; monday, february 20, 1995 you recognize batter as the kid who sold you a hot dog a couple minutes earlier. every time a player slides into second, he busts his hip. they keep shouting do over when umpire yells, strike 3 batter looks at him as if the dude's speaking french. try as they might, they just can't scratch themselves like professionals. first base: siskel. second base: ebert. game stops when some lady in a house near the stadium shouts dinner time players constantly adjusting each other's cups. you overheard the coach yelling, run, forrest, run they play like the mets.",sport,joke
1087,"two mates calles mik and mak went to the australia vs england game. it was mik's birthday, and mik gave mak a ride and mak took mik's birthday present with him.he went to get it while mik wached the 4th over. mak said to mik i've got some bad news for you buddy. your car was actually a car bomb and blew up...but mik interupted by saying well makky, mate i've got some bad news for you. what mak asked. ricky ponting's out,he replied",sport,joke
1088,basketball hoops are like misdemeanor. the more i miss it da meanor i get.,sport,joke
1091,"i'm not even sure this is funny (although it was at the time), but it was just a random happening: i was texting my friend and playing temple run at the same time, and something occurred to me. i said, if i were a temple run character, no one would buy me... i cannot slide on my back, jump over tall objects, run fast (most definitely not), or pick up coins by running through them. i would die in the first 30 seconds. he replied, i would buy u, just so i could get the achievement of sexy seven.",sport,joke


In [39]:
# Subset: news articles labelled 'sport' (optionally down-sample with .sample(5)).
is_sport_news = combined['source'].eq('news') & combined['category'].eq('sport')
tmp_news = combined.loc[is_sport_news]

In [40]:
# Display the sport-news subset.
tmp_news

Unnamed: 0,text,category,source
1109,wales silent on grand slam talk rhys williams says wales are still not thinking of winning the grand slam despite a third six nations win. that s the last thing on our minds at the moment said williams a second- half replacement in saturday s 24-18 win over france in paris. we all realise how difficult a task it is to go up to scotland and beat them.,sport,news
1117,ireland 21-19 argentina an injury-time dropped goal by ronan o gara stole victory for ireland from underneath the noses of argentina at lansdowne road on saturday. o gara kicked all of ireland s points with two dropped goals and five penalties to give the home side a 100% record in their autumn internationals.,sport,news
1118,wenger signs new deal arsenal manager arsene wenger has signed a new contract to stay at the club until may 2008. wenger has ended speculation about his future by agreeing a long-term contract that takes him beyond the opening of arsenal s new stadium in two years.,sport,news
1120,hantuchova in dubai last eight daniela hantuchova moved into the quarter-finals of the dubai open after beating elene likhotseva of russia 7-5 6-4 and now faces serena williams. australian open champion williams survived an early scare to beat russia s elena bovina 1-6 6-1 6-4.,sport,news
1121,melzer shocks agassi in san jose second seed andre agassi suffered a comprehensive defeat by jurgen melzer in the quarter-finals of the sap open. agassi was often bamboozled by the austrian s drop shots in san jose losing 6-3 6-1.,sport,news
...,...,...,...
2570,charvis set to lose fitness bid flanker colin charvis is unlikely to play any part in wales final two games of the six nations. charvis has missed all three of wales victories with an ankle injury and his recovery has been slower than expected.,sport,news
2571,preview: ireland v england (sun) lansdowne road dublin sunday 27 february 1500 gmt bbc1 radio 4 lw and this website ireland are going for their first grand slam since 1948 after two opening wins and england represent their sternest test of the championship so far. england were sloppy and leaderless in the defeats against wales and france and another loss would be unthinkable.,sport,news
2574,ferrero eyes return to top form former world number one juan carlos ferrero insists he can get back to his best despite a tough start to 2005. the 2003 french open champion has slipped to 64 in the world after a year of illness and injuries in 2004 but is confident that his form will return.,sport,news
2576,dallaglio eyeing lions tour place former england captain lawrence dallaglio still harbours hopes of a place on the british and irish lions tour to new zealand. lions coach sir clive woodward has made it clear he will pick his squad to tour next summer based on form shown in the six nations championship.,sport,news


In [41]:
# Sport jokes first, then sport news, with a fresh 0..n-1 index.
sport_frames = (tmp_jokes, tmp_news)
tmp = pd.concat(sport_frames, axis=0, ignore_index=True)

In [43]:
# Set the model's maximum sequence length to 512.
# NOTE(review): presumably this is so long news texts are not cut short when
# encoded — confirm 512 is within what this model supports.
model.max_seq_length = 512

In [44]:
# Read the attribute back to confirm the assignment took effect.
model.max_seq_length

512

In [45]:
# Encode every text of the combined frame into an embedding vector,
# displaying a progress bar while the batches are processed.
embs = model.encode(
    tmp['text'].tolist(),
    show_progress_bar=True,
)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=16.0, style=ProgressStyle(description_width…




In [46]:
# Wrap the embedding matrix in a DataFrame: one row per text, one column per
# embedding dimension.
embs_df = pd.DataFrame(data=embs)

In [47]:
# Preview the embedding table (pandas elides the middle rows and columns).
embs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,728,729,730,731,732,733,734,735,736,737,738,739,740,741,742,743,744,745,746,747,748,749,750,751,752,753,754,755,756,757,758,759,760,761,762,763,764,765,766,767
0,0.124801,-0.007502,-0.008061,-0.005189,-0.357371,0.523320,-0.022404,-0.018851,0.123623,-0.041797,-0.042617,-0.261050,0.052231,-0.694182,0.028858,0.258708,0.603574,-0.114871,-0.185400,0.050888,0.065316,-0.452855,-0.034172,0.136222,0.061083,-0.004258,-0.085004,0.248412,-0.239900,0.113040,0.202940,-0.103075,0.234498,0.076903,0.099550,-0.067800,-0.089362,-0.207321,0.372120,0.002016,...,0.025600,0.498334,0.046252,0.164974,0.427859,0.269142,0.190710,-0.443522,-0.034914,0.211937,0.122616,-0.010568,-0.095072,-0.137024,-0.169129,-0.603798,-0.181073,-0.000143,0.103652,-0.028710,-0.209232,0.236913,0.099972,0.017351,-0.304927,-0.087690,0.169769,-0.172516,0.360386,0.164597,0.275574,-0.195388,-0.030630,0.156924,-0.378563,-0.053482,-0.125059,0.014793,-0.162782,-0.146230
1,-0.440565,-0.388667,0.120421,0.146600,-0.107786,0.482386,0.148933,-0.182777,-0.134912,0.145673,0.426800,-0.299215,0.246872,-0.341138,0.108592,0.133444,0.386603,-0.189184,-0.012221,0.101361,0.074140,-0.601051,0.317717,-0.503686,-0.301281,0.323193,-0.014825,0.301043,-0.266526,-0.053858,0.041876,-0.033287,-0.214337,0.091766,0.127729,0.129340,-0.112310,0.063693,0.165862,-0.353857,...,0.230978,-0.012204,0.056761,-0.128374,0.354661,0.058618,0.280752,0.436821,-0.216821,0.296208,-0.124122,-0.195952,-0.337281,-0.170968,0.088699,-0.179560,-0.095593,0.284447,-0.057676,-0.048946,-0.017151,-0.192518,0.035687,-0.201521,-0.204375,-0.094253,-0.155097,-0.045881,0.478202,-0.040711,-0.079108,-0.148914,0.172370,0.081551,0.159946,0.046242,-0.133694,-0.009447,-0.004923,0.225714
2,-0.108351,0.102725,0.099066,0.327774,-0.253966,0.276951,-0.058987,0.206664,0.012456,-0.143818,0.027132,-0.151205,0.192661,-0.430688,-0.229284,0.133078,0.322186,0.019446,-0.178153,-0.191965,-0.120992,-0.317578,0.576165,0.397094,0.051321,-0.037175,-0.036071,0.304217,-0.123776,0.192836,-0.245184,-0.021134,0.230507,0.067431,-0.519733,0.263417,-0.172570,-0.181054,0.060265,-0.045414,...,0.190236,0.195125,0.206841,0.077960,0.063285,-0.057929,-0.051617,0.025408,-0.029202,-0.108091,-0.297006,-0.184294,-0.048830,0.172152,0.076167,-0.157908,-0.404812,0.040477,-0.262699,-0.343525,0.269857,-0.237136,-0.347432,-0.041823,-0.062414,-0.199282,-0.174428,-0.360430,-0.303079,0.218031,0.124127,-0.042666,-0.151603,-0.047960,-0.077064,0.088717,0.316170,-0.266200,-0.135109,0.066520
3,-0.172755,-0.024257,0.287079,0.354603,0.036258,-0.236290,0.123815,0.368460,-0.078089,0.515048,0.027041,-0.140275,-0.009657,0.172755,-0.216334,-0.359980,0.156651,0.121532,0.375928,0.422618,-0.026608,-0.471621,0.631019,-0.025203,-0.001070,0.268262,-0.007469,0.194734,-0.049154,-0.551102,-0.140005,0.038793,-0.379330,-0.016869,0.847081,-0.357709,-0.168592,-0.032210,0.308380,0.048992,...,0.041419,0.165998,-0.054800,0.019107,0.116942,-0.488891,0.291450,0.327640,0.128516,-0.383425,0.226028,-0.401835,-0.299775,-0.368451,-0.003701,-0.258262,-0.198315,-0.196581,0.035381,-0.166843,-0.284522,-0.681257,-0.216721,-0.062588,0.120291,-0.060421,0.132651,0.302597,0.305927,-0.148068,-0.164079,-0.078832,-0.367222,0.300387,-0.355860,0.083487,0.132766,0.118764,0.137366,-0.441846
4,0.309381,0.213125,0.318156,0.078030,-0.084411,0.074532,0.065993,0.227206,0.105908,-0.226032,-0.118648,-0.098966,-0.057197,0.041466,0.512825,0.054719,0.321883,-0.260506,-0.020826,-0.078433,-0.080534,-0.240825,0.284356,0.222655,-0.357929,0.054241,-0.017163,-0.106364,-0.077471,-0.049230,-0.138492,0.116872,-0.317832,0.018145,-0.195839,-0.046130,0.338885,0.223355,0.062173,-0.187660,...,0.247024,0.171568,-0.058484,0.096137,0.210807,0.359044,0.198724,-0.148763,0.261447,0.040798,0.081029,-0.169521,-0.309695,-0.433509,0.545614,-0.337076,0.202597,-0.183630,0.019440,-0.400993,0.230056,0.029754,-0.604228,-0.096231,-0.087210,0.379543,0.648215,-0.363844,0.282004,0.177389,0.121627,0.267294,0.364765,0.014373,-0.477921,-0.094073,0.018072,0.582554,-0.065838,0.175315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,-0.198167,0.389495,-0.101282,0.220076,-0.303699,-0.390218,0.073446,-0.131409,0.044336,-0.242407,0.046671,-0.371622,-0.172594,0.732657,0.317925,-0.540529,-0.152290,-0.286036,0.268282,0.093302,-0.115122,0.477024,0.428735,-0.092490,0.436026,0.166775,-0.693877,-0.350021,0.204513,-0.160191,-0.175171,-0.064008,0.037380,-0.154765,0.216498,-0.283215,0.127400,-0.193132,0.426154,-0.235556,...,0.495332,0.207917,-0.278018,-0.175865,-0.086045,0.123492,0.143609,0.271348,-0.373139,0.345128,-0.176815,0.153275,-0.166390,-0.054213,-0.162802,-0.617000,-0.291937,-0.186838,-0.262345,-0.522011,-0.347532,-0.479049,-0.029852,-0.145652,-0.104742,0.246259,0.127707,-0.085481,-0.003911,-0.239018,0.030033,0.222370,0.305763,0.036015,-0.320582,0.035946,-0.364166,0.244561,0.052587,0.086219
508,-0.052870,-0.400481,0.170439,-0.253575,-0.410392,-0.206247,0.126420,-0.116612,0.039525,-0.193780,0.469608,-0.146522,0.043630,-0.523031,0.029374,-0.520540,0.060730,0.022359,-0.046274,0.354304,-0.068663,-0.374783,0.032943,-0.353538,-0.032698,0.697654,0.410033,0.395290,-0.398201,0.553951,-0.090680,-0.129989,0.203005,0.319741,0.043481,-0.028274,0.159324,-0.077760,-0.193125,-0.115662,...,0.255125,-0.056020,-0.178286,-0.031201,0.407961,-0.422663,-0.070974,0.372594,-0.320501,-0.142573,0.086966,0.081651,-0.541936,-0.666276,0.241011,-0.060173,-0.103157,-0.083246,0.067030,0.063587,0.131124,-0.141572,0.675592,-0.032808,-1.010945,0.231512,-0.141748,0.052923,0.497961,0.183917,0.269029,-0.114965,0.155257,-0.062848,-0.114285,-0.032311,1.369927,0.521076,0.058775,0.181030
509,0.134341,0.265149,0.014344,0.529898,0.045667,-0.064748,-0.055231,-0.410955,-0.285620,0.250723,0.086133,0.496785,-0.278302,0.489882,0.124491,-0.643204,-0.293128,-0.530235,0.206458,-0.012263,0.042225,-0.033050,0.040753,-0.240929,-0.252036,-0.131820,-0.403079,-0.283484,-0.379318,-0.443203,-0.363671,0.224364,-0.051014,0.156524,0.351271,0.280429,0.183261,0.206516,0.747979,0.014633,...,0.021678,-0.170801,-0.273187,0.225643,-0.237772,0.026836,0.127288,0.178917,0.030977,0.016662,0.229207,-0.048417,-0.013072,0.183511,0.586009,-0.233221,0.195050,-0.109295,0.144775,-0.630719,-0.094746,-0.064163,0.226737,-0.391962,-0.462657,0.325900,0.500112,0.341363,0.696041,-0.305453,0.281935,0.184695,0.119246,0.444629,-0.148543,0.095334,0.073687,0.608310,0.364397,0.318513
510,-0.077360,-0.361675,0.158683,-0.157273,0.025273,0.381501,-0.073097,0.100928,0.167695,-0.307525,-0.247060,-0.269866,-0.204390,-0.324320,0.150118,-0.323263,0.238254,0.013296,0.256989,-0.324901,-0.257310,-0.119650,-0.296093,-0.222045,0.398313,0.234841,-0.495227,0.005305,-0.287070,-0.795549,-0.444139,-0.125061,0.128861,0.175435,0.145226,-0.000618,-0.135524,-0.121268,-0.372514,-0.199991,...,-0.334150,-0.038862,-0.009598,-0.122730,0.193428,-0.206508,-0.032371,-0.504760,-0.272184,0.289407,0.053660,0.132050,-0.393050,0.388793,0.216239,-0.154611,-0.123631,0.100034,-0.095259,0.155988,-0.181777,-0.385479,0.815749,0.294217,0.089790,0.099643,0.088826,-0.142816,-0.182864,-0.193798,0.179495,-0.325640,0.112675,0.097286,-0.413164,0.266195,0.369565,0.424384,-0.197999,0.249904


In [48]:
# Summary statistics per text: after transposing, each column is one text, so
# describe() summarises that text's values across all embedding dimensions.
embs_df.T.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,...,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,0.008383,0.007813,0.008609,0.010324,0.011249,0.011247,0.01234,0.010199,0.010649,0.015192,0.013112,0.010884,0.011032,0.013306,0.010773,0.010857,0.007912,0.011249,0.009327,0.014063,0.009165,0.010134,0.010724,0.013132,0.007196,0.010975,0.011154,0.008783,0.010885,0.012326,0.010982,0.011375,0.012669,0.01126,0.012851,0.012012,0.009859,0.007775,0.009168,0.012856,...,0.019997,0.021188,0.015279,0.013249,0.017938,0.012024,0.014459,0.01859,0.016335,0.014673,0.019311,0.017823,0.013994,0.020489,0.012504,0.01964,0.016341,0.01748,0.011842,0.017007,0.017657,0.01915,0.014841,0.02301,0.016807,0.017835,0.016461,0.018141,0.015468,0.015401,0.019617,0.016269,0.015152,0.02078,0.018314,0.015465,0.012598,0.016476,0.022709,0.017338
std,0.232771,0.248371,0.233986,0.277749,0.262441,0.262,0.263876,0.248191,0.263226,0.269756,0.28751,0.310758,0.273775,0.284643,0.253349,0.26696,0.265993,0.264407,0.268207,0.28142,0.245942,0.251048,0.25906,0.27634,0.274549,0.270554,0.274915,0.255753,0.260993,0.266715,0.283019,0.295548,0.274309,0.262529,0.280776,0.331685,0.226129,0.261728,0.272893,0.261782,...,0.237058,0.259547,0.25614,0.2918,0.268534,0.264883,0.264002,0.261186,0.308803,0.271706,0.250001,0.306463,0.313663,0.260659,0.307223,0.295719,0.277377,0.248139,0.293337,0.310214,0.31165,0.256593,0.307579,0.288128,0.277176,0.273036,0.285904,0.304133,0.234853,0.296453,0.330398,0.262534,0.253156,0.275252,0.266536,0.27711,0.291007,0.275589,0.299514,0.27282
min,-0.838565,-0.892408,-0.6902,-0.883562,-0.716802,-0.803782,-0.922871,-1.097413,-0.926055,-1.101339,-0.894965,-1.183661,-0.995697,-0.896388,-0.853154,-0.902948,-0.850177,-0.713111,-1.029004,-0.99381,-0.776286,-0.902375,-0.835059,-0.948397,-0.803132,-0.959821,-0.846076,-0.981429,-1.094031,-1.128427,-1.04306,-1.018773,-1.127726,-0.737719,-1.022741,-1.192447,-0.81084,-0.858715,-0.938284,-0.778956,...,-0.799493,-0.901638,-0.752174,-1.026862,-0.81483,-1.183698,-0.92872,-0.89164,-1.155786,-1.142387,-0.860511,-1.045708,-1.171755,-0.768229,-0.897481,-0.926903,-0.908037,-0.812749,-0.877755,-1.00563,-0.945595,-0.797889,-1.004676,-0.759678,-1.015326,-0.824135,-1.098868,-1.192093,-0.849988,-0.838658,-1.215055,-0.840273,-0.822131,-1.061406,-1.334195,-0.984684,-1.010945,-0.810901,-0.893988,-0.901392
25%,-0.133815,-0.143101,-0.148938,-0.164643,-0.163799,-0.162756,-0.156714,-0.149981,-0.155065,-0.157303,-0.165535,-0.182648,-0.154853,-0.179658,-0.149028,-0.156646,-0.180336,-0.161895,-0.162278,-0.163554,-0.143528,-0.150479,-0.147539,-0.155006,-0.170764,-0.160897,-0.149981,-0.152164,-0.155873,-0.157014,-0.179413,-0.192924,-0.16908,-0.157247,-0.172595,-0.218068,-0.137139,-0.173856,-0.173182,-0.158397,...,-0.123131,-0.15703,-0.147196,-0.171683,-0.160745,-0.149859,-0.153828,-0.152332,-0.164804,-0.144821,-0.143736,-0.188257,-0.186671,-0.158489,-0.189275,-0.192167,-0.1598,-0.13818,-0.181314,-0.16238,-0.194637,-0.137634,-0.182513,-0.181304,-0.158865,-0.174374,-0.15184,-0.167721,-0.130229,-0.182153,-0.203569,-0.16104,-0.147507,-0.154663,-0.166461,-0.158116,-0.169293,-0.172051,-0.175102,-0.148371
50%,0.014372,0.018018,0.017016,0.001531,0.008232,0.013246,0.017135,0.020733,0.005004,0.014387,0.008323,0.020005,0.004738,0.004406,0.010491,0.015706,0.002547,0.00751,0.006392,0.011395,0.007706,0.020497,0.006242,0.010314,-0.000647,0.00703,0.004703,0.013986,0.010338,0.006925,0.007804,0.014467,0.01735,0.009188,0.007079,0.024731,0.015404,0.014402,0.018,0.022195,...,0.008706,0.013561,0.011787,0.016303,0.009564,0.022654,0.005894,0.012181,0.02308,0.028277,0.013281,0.017414,0.030653,0.01338,0.021498,0.007434,0.015145,0.021646,0.013131,0.018167,0.00953,0.016864,0.019701,0.018178,0.008049,0.024765,0.018261,0.013996,0.019196,0.019571,0.03561,0.023898,0.020538,0.023801,0.031851,0.027791,0.009467,0.015132,0.017862,0.016411
75%,0.156991,0.161702,0.163284,0.196294,0.185113,0.18922,0.177423,0.153539,0.171371,0.182456,0.200014,0.204426,0.191491,0.201652,0.150657,0.176232,0.192203,0.172784,0.17586,0.201502,0.163179,0.172856,0.165185,0.191098,0.200679,0.181078,0.173895,0.169107,0.172669,0.165055,0.199577,0.204144,0.19199,0.190908,0.168097,0.231184,0.147287,0.170004,0.197831,0.186169,...,0.167757,0.191348,0.174899,0.196367,0.202406,0.182623,0.19089,0.195177,0.203029,0.185077,0.178432,0.212726,0.199419,0.187677,0.215414,0.210453,0.199797,0.176234,0.210676,0.203914,0.205175,0.174937,0.193777,0.215142,0.199346,0.201839,0.205853,0.20519,0.147564,0.212938,0.224876,0.183334,0.186169,0.200156,0.192706,0.185645,0.193278,0.203299,0.207174,0.198639
max,1.399194,1.170214,1.097452,1.2515,0.870108,1.068048,1.155036,0.840353,1.028286,0.889148,1.165609,1.088437,1.038882,0.936099,0.981591,1.085008,1.12361,0.994845,0.943147,0.918591,1.024003,0.987804,0.986416,1.09309,1.220678,0.985563,0.993099,1.107453,1.044017,1.134751,0.946356,0.895012,0.928112,0.867032,0.989214,1.109627,1.097376,1.274283,1.14761,0.949189,...,1.296128,1.132553,1.044726,0.964481,0.974978,1.017805,0.943543,0.834916,1.095744,0.905143,0.979538,1.094037,1.162161,0.992582,0.962129,0.852817,1.101009,0.888043,1.120335,1.098889,1.202008,0.890002,1.112836,0.972834,0.90823,1.143485,0.961545,1.074229,0.789147,1.144609,1.076807,0.802916,0.911115,1.028652,0.899809,1.044124,1.369927,0.891894,1.117719,0.819032


In [49]:
# Pairwise cosine distances between all embeddings (square matrix, zero on the
# diagonal); this is the dissimilarity input for MDS below.
dists = cosine_distances(X=embs)

In [50]:
# Show the distance matrix as a table for a quick visual sanity check.
pd.DataFrame(data=dists)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504,505,506,507,508,509,510,511
0,0.000000,0.804008,0.611524,0.838090,0.887655,0.734218,0.861425,0.892721,0.772563,0.999438,0.913523,0.839673,0.866924,0.825452,0.833746,0.884100,0.794782,0.896601,0.966519,0.945126,0.856114,0.933680,0.859546,0.855024,0.728006,0.926368,0.795188,0.789157,0.807942,0.974090,0.896360,0.883247,0.949232,0.895341,0.882195,0.977561,0.875331,0.688975,0.865159,0.876261,...,1.031101,1.042575,0.997013,1.027095,1.014575,0.946267,0.864189,0.861165,0.862951,0.990310,0.963783,1.074816,1.019518,0.994144,0.906049,1.080042,0.954287,0.974923,0.971294,1.119280,1.017251,0.966174,0.976404,0.946353,0.958478,0.980651,0.999502,0.960962,0.957858,0.996294,1.033428,0.991139,1.033241,1.015346,1.065712,1.056708,0.864915,1.063638,0.985495,1.046800
1,0.804008,0.000000,0.705750,0.702487,0.871664,0.879395,0.825131,0.689362,0.684797,0.918848,0.927515,0.801486,0.632830,0.804877,0.757593,0.711485,0.719815,0.775116,0.749641,0.863183,0.722957,0.844545,0.716303,0.817512,0.717082,0.906866,0.828081,0.726282,0.706683,0.617707,0.865801,1.022371,0.866825,0.888074,0.841587,0.954046,0.990288,0.739778,0.754672,1.023376,...,0.902390,0.973715,1.059232,0.878857,0.874972,0.897511,0.868826,0.825871,0.910299,0.989915,0.930460,0.958925,0.872596,1.043495,0.842037,0.975234,0.961161,0.952678,0.854519,1.006568,0.978177,0.981524,0.905619,0.973790,0.929908,0.937164,0.974279,0.945981,0.893058,0.949921,0.993972,0.852436,0.915044,0.893539,1.029099,0.966783,0.709911,0.923666,0.943887,0.870621
2,0.611524,0.705750,0.000000,0.854874,0.821748,0.743286,0.874793,0.820606,0.734203,0.986632,0.887754,0.839805,0.759378,0.780478,0.727906,0.862578,0.729899,0.779063,0.945049,0.967628,0.804758,0.980227,0.805347,0.850630,0.597880,0.898007,0.866702,0.712278,0.802920,0.888006,0.719544,0.853115,0.858888,0.809344,0.773476,0.872643,0.947146,0.615057,0.863523,1.081201,...,0.983065,0.954720,1.009807,0.917222,0.881296,0.972018,0.852194,0.897535,0.786831,1.063934,0.996300,0.918212,0.824414,0.946697,0.747951,0.948998,0.934056,0.874045,0.916521,1.051670,1.058907,0.996250,0.984431,1.008515,1.007003,0.963119,0.975802,1.093715,0.844179,0.982408,1.015083,0.960557,0.993176,1.016215,0.999719,0.951576,0.817206,1.077836,1.018535,0.986985
3,0.838090,0.702487,0.854874,0.000000,0.938336,0.834913,0.752231,0.532144,0.470034,0.796702,0.835286,0.670121,0.666155,0.857423,0.669896,0.736693,0.507556,0.750770,0.441199,0.848857,0.722967,0.729343,0.761054,0.759082,0.761628,0.764127,0.519943,0.501762,0.475712,0.724413,0.791198,0.858146,0.758242,0.957851,0.804960,0.966481,0.746157,0.722782,0.656732,0.858173,...,0.820190,0.882238,0.887064,0.850762,0.856749,0.783776,0.738372,0.796734,0.778928,0.820466,0.861335,0.891230,0.906901,0.937895,0.755003,0.879718,0.927549,0.930069,0.834542,0.819137,0.933172,0.827760,0.963099,0.795931,0.848982,0.840689,0.986087,0.832652,0.852038,0.846785,0.955916,0.903102,0.913754,0.911584,0.974292,0.804234,0.807362,0.876867,0.890554,0.903949
4,0.887655,0.871664,0.821748,0.938336,0.000000,0.609533,0.484572,0.786578,0.846929,0.859225,0.775213,0.875360,0.849999,0.712810,0.761626,0.828350,0.894125,0.807915,0.845512,0.653676,0.786551,0.896701,0.883713,0.846502,0.655267,0.870480,0.940360,0.840596,0.831832,0.979372,0.836044,0.845236,0.507673,0.010949,0.836810,0.662378,0.693068,0.643807,0.786616,0.878280,...,1.016072,0.824252,0.907082,0.765730,1.049024,0.759292,0.834032,0.822257,0.840201,0.793574,0.972149,0.742850,0.729106,0.994235,0.798808,0.703592,0.926507,0.821045,0.788185,0.692156,0.815022,0.971237,0.809258,0.891048,0.862404,0.993086,0.879272,0.879105,0.977453,0.690468,0.792409,0.932174,0.907707,0.923183,0.847516,0.806224,0.726590,0.815923,0.991971,0.927439
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,1.056708,0.966783,0.951576,0.804234,0.806224,0.776985,0.831370,0.674098,0.832858,1.011300,0.847189,0.890803,0.942178,0.849087,0.939640,0.892876,0.891160,0.898027,0.744397,0.866025,0.920495,0.916747,0.830997,0.825609,0.870466,0.941835,0.933615,0.848246,0.854557,1.034757,0.857614,0.812018,0.726440,0.806621,0.963160,0.760195,0.886335,0.834046,0.821318,0.962661,...,0.773296,0.681021,0.752743,0.729744,0.787808,0.815891,0.771286,0.718262,0.704442,0.698773,0.837515,0.731957,0.746725,0.963633,0.752700,0.794372,0.917461,0.828344,0.699254,0.671313,0.818722,0.839216,0.771687,0.674888,0.742681,0.843537,0.807808,0.716331,0.783413,0.814389,0.764786,0.875740,0.658955,0.787991,0.897405,0.000000,0.845173,0.691417,0.747412,0.789338
508,0.864915,0.709911,0.817206,0.807362,0.726590,0.709510,0.651250,0.677588,0.736723,0.867462,0.846215,0.796869,0.848146,0.666095,0.764408,0.846658,0.711040,0.887964,0.785884,0.878692,0.842042,0.909674,0.858906,0.799089,0.670085,0.847927,0.710714,0.807368,0.769758,0.943760,0.851370,0.930579,0.544395,0.737277,0.818197,0.725974,0.794309,0.712289,0.828227,0.927709,...,0.826801,0.817216,0.727009,0.359348,0.797955,0.669687,0.725762,0.660535,0.675974,0.750459,0.760473,0.607547,0.609977,0.932607,0.752105,0.551084,0.843281,0.782522,0.588261,0.803901,0.812669,0.916289,0.722127,0.643916,0.869380,0.813251,0.794903,0.703522,0.875671,0.739390,0.702562,0.762874,0.869274,0.801850,0.977109,0.845173,0.000000,0.877220,0.717246,0.769753
509,1.063638,0.923666,1.077836,0.876867,0.815923,0.866689,0.887015,0.777142,0.858790,1.005216,0.964766,0.950006,0.969863,0.934641,0.923132,0.867529,0.960918,0.951890,0.775172,0.900895,0.945000,0.934139,0.888883,0.911521,1.004816,1.019308,1.018612,0.985282,0.904811,0.908416,0.978903,0.938568,0.821820,0.820273,1.014755,0.832790,0.869040,0.979610,0.985303,0.987423,...,0.828977,0.710994,0.762423,0.838105,0.903007,0.837138,0.833003,0.835189,0.878027,0.771150,0.905528,0.760728,0.809194,0.908835,0.869654,0.766628,0.920079,0.899587,0.780349,0.645524,0.844580,0.864697,0.770545,0.786416,0.740170,0.883720,0.683262,0.724423,0.772063,0.733330,0.734668,0.839524,0.753946,0.646267,0.861538,0.691417,0.877220,0.000000,0.709085,0.833581
510,0.985495,0.943887,1.018535,0.890554,0.991971,0.778793,0.946099,0.895995,0.905163,0.989052,1.022676,0.879383,1.116553,0.905076,0.913101,0.936073,0.926400,0.903110,0.950760,0.969036,1.012266,0.987324,0.939675,0.986808,0.998330,0.934814,0.961661,1.058380,0.977122,0.936880,0.997397,0.901961,0.803953,0.992446,1.024383,0.935374,0.935652,0.951720,0.976552,1.025627,...,0.798299,0.648285,0.824343,0.748862,0.703742,0.829952,0.759979,0.808743,0.769516,0.774910,0.817483,0.730190,0.786890,0.844897,0.848952,0.750336,1.034177,0.836801,0.741042,0.744617,0.870602,0.763299,0.755068,0.666767,0.755626,0.714404,0.601024,0.808638,0.780323,0.869908,0.729747,0.699714,0.768078,0.762586,0.879987,0.747412,0.717246,0.709085,0.000000,0.601914


In [51]:
# Project the precomputed cosine-distance matrix down to two dimensions.
mds = manifold.MDS(
    n_components=2,
    dissimilarity="precomputed",  # `dists` already holds pairwise distances
    max_iter=1000,
    random_state=60,              # fixed seed keeps the layout reproducible
)
results = mds.fit(dists)

In [52]:
# Show the fitted estimator together with the parameters it was built with.
results

MDS(dissimilarity='precomputed', eps=0.001, max_iter=1000, metric=True,
    n_components=2, n_init=4, n_jobs=None, random_state=60, verbose=0)

In [53]:
# Fitted low-dimensional positions from MDS: one row per input text, one
# column per component (2 here, from n_components=2).
coords = results.embedding_

In [54]:
# Inspect the MDS coordinates array.
coords

array([[ 0.19516389, -0.80269268],
       [ 0.4745114 , -0.57505371],
       [ 0.63576998, -0.44124774],
       ...,
       [-0.32863639,  0.59158803],
       [ 0.02017577,  0.62558399],
       [ 0.22786369,  0.49439486]])

In [55]:
# Attach the two MDS coordinates to the combined frame as plotting columns.
# The Series carry a default 0..n-1 index, which lines up with tmp's index.
tmp['x'], tmp['y'] = pd.Series(coords[:, 0]), pd.Series(coords[:, 1])

In [56]:
# Number of rows per source label, most frequent first.
label_counts = tmp.loc[:, 'source'].value_counts()

In [57]:
# Display how many texts come from each source.
label_counts

news    346
joke    166
Name: source, dtype: int64

In [58]:
# Boolean row selector that is True for the rows whose source is 'news'.
mask = tmp['source'].eq('news')

In [59]:
# Show only the rows selected by the mask.
tmp.loc[mask]

Unnamed: 0,text,category,source,x,y
166,wales silent on grand slam talk rhys williams says wales are still not thinking of winning the grand slam despite a third six nations win. that s the last thing on our minds at the moment said williams a second- half replacement in saturday s 24-18 win over france in paris. we all realise how difficult a task it is to go up to scotland and beat them.,sport,news,-0.245072,-0.117579
167,ireland 21-19 argentina an injury-time dropped goal by ronan o gara stole victory for ireland from underneath the noses of argentina at lansdowne road on saturday. o gara kicked all of ireland s points with two dropped goals and five penalties to give the home side a 100% record in their autumn internationals.,sport,news,-0.221561,-0.254971
168,wenger signs new deal arsenal manager arsene wenger has signed a new contract to stay at the club until may 2008. wenger has ended speculation about his future by agreeing a long-term contract that takes him beyond the opening of arsenal s new stadium in two years.,sport,news,0.500514,0.544391
169,hantuchova in dubai last eight daniela hantuchova moved into the quarter-finals of the dubai open after beating elene likhotseva of russia 7-5 6-4 and now faces serena williams. australian open champion williams survived an early scare to beat russia s elena bovina 1-6 6-1 6-4.,sport,news,-0.512856,0.127257
170,melzer shocks agassi in san jose second seed andre agassi suffered a comprehensive defeat by jurgen melzer in the quarter-finals of the sap open. agassi was often bamboozled by the austrian s drop shots in san jose losing 6-3 6-1.,sport,news,-0.575804,-0.230858
...,...,...,...,...,...
507,charvis set to lose fitness bid flanker colin charvis is unlikely to play any part in wales final two games of the six nations. charvis has missed all three of wales victories with an ankle injury and his recovery has been slower than expected.,sport,news,-0.244339,0.356218
508,preview: ireland v england (sun) lansdowne road dublin sunday 27 february 1500 gmt bbc1 radio 4 lw and this website ireland are going for their first grand slam since 1948 after two opening wins and england represent their sternest test of the championship so far. england were sloppy and leaderless in the defeats against wales and france and another loss would be unthinkable.,sport,news,-0.124969,-0.217596
509,ferrero eyes return to top form former world number one juan carlos ferrero insists he can get back to his best despite a tough start to 2005. the 2003 french open champion has slipped to 64 in the world after a year of illness and injuries in 2004 but is confident that his form will return.,sport,news,-0.328636,0.591588
510,dallaglio eyeing lions tour place former england captain lawrence dallaglio still harbours hopes of a place on the british and irish lions tour to new zealand. lions coach sir clive woodward has made it clear he will pick his squad to tour next summer based on form shown in the six nations championship.,sport,news,0.020176,0.625584


In [60]:
# x coordinates of the masked rows, selected in a single .loc lookup.
tmp.loc[mask, 'x']

166   -0.245072
167   -0.221561
168    0.500514
169   -0.512856
170   -0.575804
         ...   
507   -0.244339
508   -0.124969
509   -0.328636
510    0.020176
511    0.227864
Name: x, Length: 346, dtype: float64

In [61]:
# Marker colours for the scatter traces; the plotting loop picks one entry
# per trace by position.
colorscale = [
    '#00CC96',
    '#AB63FA',
    '#FFA15A',
    '#19D3F3',
    '#FF6692',
    '#B6E880',
    '#FF97FF',
    '#FECB52',
]

In [62]:
# Scatter plot of the 2-D MDS layout: one trace (and one colour) per source
# label, with each point's text shown on hover.
fig = go.Figure()
# enumerate() replaces the hand-maintained counter, and Series.items() replaces
# Series.iteritems(), which is deprecated and removed in pandas 2.0.
for idx, (cat, _count) in enumerate(label_counts.items()):
  # Loop-local name so the notebook-level `mask` from an earlier cell is not
  # silently overwritten by the last category.
  cat_mask = tmp['source'] == cat
  fig.add_trace(go.Scatter(
      x=tmp.loc[cat_mask, 'x'],
      y=tmp.loc[cat_mask, 'y'],
      mode='markers', textposition="top center",
      marker=dict(
          size=10,
          # Wrap around the palette so having more labels than colours
          # cannot raise an IndexError.
          color=colorscale[idx % len(colorscale)],
          opacity=0.8,
      ),
      text=tmp.loc[cat_mask, 'text'], hoverinfo='text',
      name=cat))

fig.update_layout(
    template="plotly_dark",
    title_text='Comparison of embeddings from sport category',
)
fig.show()
# Build the output path with the Path "/" operator. Plain string concatenation
# dropped the separator and wrote ".../icns_projectnews_jokes_sports.html"
# beside the data folder instead of inside it.
fig.write_html(str(DATA_PATH / "news_jokes_sports.html"))

In [63]:
# Install chart_studio, which the following cells import to publish the figure.
# NOTE(review): the version is not pinned — consider pinning for reproducibility.
!pip install chart_studio



In [66]:
import chart_studio.tools as tls
import chart_studio.plotly as py

In [68]:
# Publish the figure to Chart Studio under the name 'news_jokes_sports'; the
# call returns the URL of the hosted plot, which becomes the cell output.
# NOTE(review): this presumably needs Chart Studio credentials configured
# beforehand — no credential setup is visible in this part of the notebook.
py.plot(fig, filename='news_jokes_sports')

'https://plotly.com/~karmenkk/1/'