In [24]:
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from sklearn.preprocessing import StandardScaler
import umap
import plotly.express as px

In [21]:
### read the movie table (url, title and plot text per film) from parquet
movie_plots_path = "../artifacts/movie_plots.parquet"
df = pd.read_parquet(movie_plots_path)
df.head()

Unnamed: 0,index,url,title,plot
0,0,https://en.wikipedia.org/wiki/White_Noise_(200...,White Noise (2005 film),Jonathan Rivers is an architect and lives with...
1,1,https://en.wikipedia.org/wiki/Coach_Carter,Coach Carter,"Ken Carter lives in Richmond, California. He b..."
2,2,https://en.wikipedia.org/wiki/Elektra_(2005_film),Elektra (2005 film),"After being killed,[a] Elektra Natchios is rev..."
3,3,https://en.wikipedia.org/wiki/Racing_Stripes,Racing Stripes,"During a thunderstorm, a traveling circus, Cir..."
4,4,https://en.wikipedia.org/wiki/Tom_and_Jerry:_B...,Tom and Jerry: Blast Off to Mars,Tom chases Jerry as usual from their house and...


In [17]:
### load the SBERT embeddings (384 columns) into a pandas dataframe
### presumably one row per movie, in the same order as `df` -- TODO confirm
sbert_table = pq.read_table("../artifacts/sbert_embeddings.parquet")
sbert_df = sbert_table.to_pandas()
display(sbert_df)
sbert_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.079473,0.039867,0.015422,0.001454,0.054188,0.008344,0.062162,-0.038170,0.074984,-0.059262,...,0.056672,-0.064081,0.031193,-0.075870,-0.092477,0.007825,0.087413,0.014493,-0.010109,-0.036263
1,-0.035826,0.058588,0.009567,-0.051754,0.007164,0.005096,-0.008980,0.017756,0.112654,0.055845,...,0.048709,-0.004139,-0.097728,0.080659,-0.040395,-0.077078,-0.038374,0.003045,-0.081532,-0.024123
2,-0.020834,0.029291,-0.042002,0.009461,0.119430,0.054744,0.072913,0.014281,0.034877,0.022201,...,0.098046,-0.113098,0.045841,0.025968,0.004102,0.055712,-0.002792,-0.017434,0.004939,0.009428
3,-0.108421,-0.000945,-0.026790,-0.022783,-0.035257,0.043165,0.061273,-0.021300,0.010107,0.010341,...,-0.038871,-0.124378,-0.070355,-0.013237,-0.046645,0.080196,-0.029096,-0.024358,0.092425,-0.078799
4,-0.033221,-0.006662,0.015374,-0.041513,-0.044280,-0.049768,0.103815,-0.051591,0.030272,-0.020047,...,0.053476,-0.027529,-0.069866,-0.017386,0.063754,0.045215,-0.016984,0.100490,-0.020218,0.019145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503,-0.015937,-0.045859,0.038560,-0.013979,-0.032743,0.017974,-0.004209,0.095838,0.023471,-0.081796,...,0.055380,-0.042650,-0.016786,-0.007887,0.001550,0.039736,0.050041,-0.079494,0.075386,-0.027209
3504,-0.054327,-0.025792,-0.047029,0.000116,0.023716,0.022768,0.042176,-0.066350,0.013970,-0.062164,...,0.037129,-0.081040,-0.019769,0.058770,-0.087794,-0.036639,0.066967,0.043139,-0.066197,0.008521
3505,0.022267,0.009472,-0.030729,0.055254,0.060820,0.053154,0.076183,-0.080574,-0.029180,0.079510,...,0.049057,-0.044715,-0.041976,0.066816,0.021295,0.083398,-0.038968,-0.065160,-0.033944,-0.057705
3506,-0.001472,0.071103,-0.013785,0.059384,0.011841,0.023274,0.001098,0.076087,0.026722,-0.069878,...,0.098609,-0.040856,-0.016767,0.053141,-0.027726,0.066312,0.048097,0.025599,0.053474,-0.048871


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
count,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,...,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0
mean,-0.052602,0.001245,-0.009321,-0.00695,0.021326,0.016827,0.050092,-0.002859,0.020367,-0.003336,...,0.022278,-0.037888,-0.01188,0.011876,-0.015746,0.031458,0.01185,-0.000369,-0.00263,-0.022682
std,0.043498,0.047356,0.048859,0.045816,0.047389,0.039319,0.044576,0.042875,0.044025,0.043642,...,0.041044,0.046127,0.043728,0.048763,0.045514,0.044579,0.048056,0.045885,0.045312,0.040497
min,-0.216054,-0.163409,-0.191732,-0.168607,-0.143745,-0.122684,-0.105055,-0.155209,-0.134346,-0.159154,...,-0.114981,-0.184974,-0.170972,-0.15075,-0.176166,-0.193908,-0.150471,-0.159669,-0.17311,-0.17204
25%,-0.082772,-0.030132,-0.042253,-0.038853,-0.011057,-0.008522,0.019731,-0.033163,-0.009005,-0.033208,...,-0.005954,-0.06945,-0.041437,-0.020732,-0.046255,0.001374,-0.020893,-0.030383,-0.033115,-0.049428
50%,-0.053875,0.000594,-0.00885,-0.006843,0.022063,0.016299,0.049625,-0.002896,0.021141,-0.003596,...,0.021173,-0.038047,-0.010866,0.011568,-0.016521,0.031913,0.011975,0.000343,-0.002752,-0.022482
75%,-0.024745,0.033273,0.02267,0.022508,0.054135,0.043549,0.08033,0.025747,0.050402,0.026839,...,0.04983,-0.00658,0.016576,0.044743,0.014928,0.062033,0.04373,0.03082,0.027673,0.004708
max,0.103364,0.172785,0.194742,0.142543,0.169512,0.13377,0.215654,0.152991,0.166111,0.14515,...,0.181038,0.12153,0.151541,0.179724,0.161818,0.187832,0.196703,0.160332,0.163457,0.125698


In [18]:
## Standardize every embedding column (zero mean, unit variance) before UMAP.
## Author's rationale: this is meant to help UMAP converge quickly and
## produce better output.
scaler = StandardScaler()
standardized_values = scaler.fit_transform(sbert_df)
# NOTE: rebinds `sbert_df`; the raw embeddings are no longer reachable by name.
sbert_df = pd.DataFrame(standardized_values)
display(sbert_df)
sbert_df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.617834,0.815688,0.506467,0.183449,0.693559,-0.215762,0.270800,-0.823685,1.240774,-1.281656,...,0.838096,-0.567933,0.985159,-1.799695,-1.686111,-0.530218,1.572613,0.323934,-0.165095,-0.335397
1,0.385727,1.211074,0.386633,-0.978073,-0.298883,-0.298384,-1.325401,0.480899,2.096538,1.356262,...,0.644062,0.731764,-1.963519,1.410763,-0.541640,-2.435040,-1.045270,0.074403,-1.741570,-0.035579
2,0.730447,0.592333,-0.668983,0.358244,2.070475,0.964460,0.512018,0.399822,0.329636,0.585234,...,1.846261,-1.630735,1.320184,0.289042,0.436134,0.544147,-0.304732,-0.371977,0.167057,0.793002
3,-1.283431,-0.046256,-0.357591,-0.345645,-1.194185,0.669946,0.250851,-0.430170,-0.233091,0.313434,...,-1.490035,-1.875309,-1.337448,-0.515060,-0.678977,1.093445,-0.852178,-0.522889,2.098096,-1.385889
4,0.445629,-0.166985,0.505500,-0.754510,-1.384604,-1.693921,1.205368,-1.136757,0.225008,-0.382963,...,0.760228,0.224605,-1.326252,-0.600157,1.746937,0.308650,-0.600087,2.198411,-0.388214,1.032982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503,0.843043,-0.994821,0.980110,-0.153448,-1.141113,0.029176,-1.218361,2.302298,0.070511,-1.798067,...,0.806612,-0.103251,-0.112212,-0.405329,0.380072,0.185725,0.794822,-1.724687,1.722010,-0.111802
3504,-0.039664,-0.571009,-0.771880,0.154241,0.050453,0.151129,-0.177624,-1.481036,-0.145344,-1.348159,...,0.361893,-0.935641,-0.180432,0.961823,-1.583203,-1.527789,1.147083,0.948332,-1.403100,0.770593
3505,1.721466,0.173765,-0.438217,1.357879,0.833527,0.924028,0.585382,-1.812853,-1.125607,1.898584,...,0.652548,-0.148020,-0.688353,1.126848,0.813950,1.165296,-1.057633,-1.412247,-0.691185,-0.864958
3506,1.175618,1.475390,-0.091391,1.448048,-0.200174,0.164000,-1.099278,1.841565,0.144362,-1.524945,...,1.859994,-0.064367,-0.111771,0.846375,-0.263253,0.781960,0.754367,0.566009,1.238363,-0.646776


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
count,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,...,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0,3508.0
mean,0.0,-1.196171e-08,-1.631142e-08,-6.524568e-09,4.349712e-09,9.786851e-09,-2.174856e-09,-5.43714e-09,7.611995e-09,9.514995e-09,...,1.087428e-09,7.611995e-09,4.349712e-09,-9.786851e-09,4.349712e-09,1.413656e-08,0.0,-7.611995e-09,-1.196171e-08,1.087428e-09
std,1.000142,1.000143,1.000143,1.000142,1.000143,1.000142,1.000143,1.000143,1.000143,1.000143,...,1.000142,1.000142,1.000142,1.000143,1.000142,1.000143,1.000143,1.000142,1.000142,1.000142
min,-3.758239,-3.477446,-3.733917,-3.528928,-3.483813,-3.548645,-3.481007,-3.553848,-3.514739,-3.570884,...,-3.344651,-3.189195,-3.638741,-3.335511,-3.525113,-5.056159,-3.378225,-3.472254,-3.762935,-3.688629
25%,-0.693696,-0.6626715,-0.6741078,-0.6964464,-0.683433,-0.6447824,-0.681208,-0.7068924,-0.6672844,-0.6845802,...,-0.6879396,-0.6843538,-0.6760252,-0.6687991,-0.6704162,-0.6749451,-0.681452,-0.6542157,-0.6728861,-0.6605299
50%,-0.02928,-0.01374131,0.009642307,0.002323699,0.01556211,-0.01342402,-0.01048982,-0.0008521736,0.01757461,-0.005948866,...,-0.02693117,-0.003459756,0.02319398,-0.006301623,-0.01702899,0.01021326,0.00259,0.01551113,-0.002702133,0.004932989
75%,0.640507,0.6764278,0.6548421,0.6430498,0.6924463,0.679706,0.6784283,0.6673007,0.6823183,0.6915259,...,0.671366,0.6788187,0.6508397,0.6741135,0.6740451,0.6859659,0.663471,0.6798087,0.6688541,0.6764295
max,3.586113,3.62287,4.177119,3.263376,3.127473,2.974591,3.714683,3.635495,3.310963,3.402849,...,3.868584,3.456567,3.737751,3.442619,3.901832,3.508296,3.847153,3.502776,3.665956,3.664475


In [19]:
### use UMAP to reduce the SBERT embedding dimension to 3
### (the random seed is pinned so repeated runs use the same random state)
umap_kwargs = {"n_components": 3, "random_state": 42}
reducer = umap.UMAP(**umap_kwargs)
reduced = reducer.fit_transform(sbert_df)

In [22]:
### attach movie metadata (url, title) to the 3 UMAP components
# Rows of `reduced` are paired with rows of `df` purely by position.
# pd.concat(axis=1) aligns on index labels, so fail fast on a row-count
# mismatch and reset the metadata index to 0..n-1; otherwise a non-default
# index on `df` would silently produce NaN / misaligned rows.
if len(df) != len(reduced):
    raise ValueError(
        f"row count mismatch: {len(df)} movies vs {len(reduced)} embeddings"
    )
# NOTE: rebinds `sbert_df` from the embedding frame to the plotting frame.
sbert_df = pd.DataFrame(data=reduced, columns=["comp_1", "comp_2", "comp_3"])
sbert_df = pd.concat(
    [df[["url", "title"]].reset_index(drop=True), sbert_df],
    axis=1,
)
sbert_df

Unnamed: 0,url,title,comp_1,comp_2,comp_3
0,https://en.wikipedia.org/wiki/White_Noise_(200...,White Noise (2005 film),20.074690,3.325428,3.243009
1,https://en.wikipedia.org/wiki/Coach_Carter,Coach Carter,21.222124,0.752786,3.025838
2,https://en.wikipedia.org/wiki/Elektra_(2005_film),Elektra (2005 film),19.412094,2.943435,5.049709
3,https://en.wikipedia.org/wiki/Racing_Stripes,Racing Stripes,18.475985,2.965726,2.590855
4,https://en.wikipedia.org/wiki/Tom_and_Jerry:_B...,Tom and Jerry: Blast Off to Mars,18.585512,1.549996,3.592942
...,...,...,...,...,...
3503,https://en.wikipedia.org/wiki/Whitney_Houston:...,Whitney Houston: I Wanna Dance with Somebody,21.451591,3.065056,2.276859
3504,https://en.wikipedia.org/wiki/The_Pale_Blue_Eye,The Pale Blue Eye,20.120535,2.130331,4.630341
3505,https://en.wikipedia.org/wiki/Women_Talking_(f...,Women Talking (film),19.670748,3.323276,3.577909
3506,https://en.wikipedia.org/wiki/A_Man_Called_Otto,A Man Called Otto,19.268095,3.371453,4.191731


In [23]:
### draw the UMAP components as an interactive 3-D scatter plot;
### hovering a point shows the movie title and its url
axis_columns = {"x": "comp_1", "y": "comp_2", "z": "comp_3"}
fig = px.scatter_3d(
    data_frame=sbert_df,
    **axis_columns,
    hover_name="title",
    hover_data=["url"],
    width=1200,
    height=1000,
)
# zero margin on all four sides of the figure
fig.update_layout(margin={"l": 0, "r": 0, "b": 0, "t": 0})
fig.show()