In [3]:
!pip install -q sentence-transformers
!pip install -q textwrap3
!pip install -q umap-learn
!pip install -q pyyaml
!pip install -q "ipywidgets>=7,<8"

In [4]:
import pandas as pd
from sentence_transformers import SentenceTransformer

In [14]:
# CSV with one row per movie, resolved relative to the working directory.
movie_plots_file = "movie_plots.csv"
df = pd.read_csv(movie_plots_file)

# Display (rows, columns) of the freshly loaded frame.
df.shape

(3192, 4)

In [None]:
# Sentence encoder used to turn free text into embedding vectors.
sentence_encoder_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [15]:
# Remove rows whose 'url' occurs more than once. keep=False discards every
# copy of a duplicated URL, not only the later occurrences. The result is
# rebound to `df` rather than mutating the frame with inplace=True.
df = df.drop_duplicates(subset='url', keep=False)
df.shape

(3178, 4)

In [16]:
# Pull the 'plot' column out as a plain Python list, one entry per row.
movie_plots = df['plot'].to_list()
print(len(movie_plots))

3178


In [17]:
# Encode every plot with sentence_encoder_model, displaying a progress bar.
embeddings = sentence_encoder_model.encode(
    movie_plots,
    show_progress_bar=True,
)

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

In [18]:
# Preview the first three rows.
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,title,url,plot
0,0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,\nJonathan Rivers is an architect and lives wi...
1,1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"\nKen Carter lives in Richmond, California. He..."
2,2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"\nAfter being killed in Daredevil, Elektra Nat..."


In [19]:
# Store each row's embedding vector as a Python list in a new 'embeddings' column.
df['embeddings'] = embeddings.tolist()
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,title,url,plot,embeddings
0,0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,\nJonathan Rivers is an architect and lives wi...,"[-0.07947266846895218, 0.03986681252717972, 0...."
1,1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"\nKen Carter lives in Richmond, California. He...","[-0.03712575137615204, 0.05965740978717804, 0...."
2,2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"\nAfter being killed in Daredevil, Elektra Nat...","[-0.019044550135731697, 0.00964392814785242, -..."


In [21]:
import umap
import numpy as np

# Project the plot embeddings down to 2 dimensions for plotting. UMAP is
# stochastic, so random_state is fixed to make the layout reproducible across
# kernel restarts and re-runs.
umap_embeddings_2dim = umap.UMAP(
    n_neighbors=5,
    n_components=2,
    min_dist=0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

# Keep each 2-D point as a list next to the row it belongs to.
df['umap_embeddings_2dim'] = umap_embeddings_2dim.tolist()
df.head(3)

Unnamed: 0.1,Unnamed: 0,title,url,plot,embeddings,umap_embeddings_2dim
0,0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,\nJonathan Rivers is an architect and lives wi...,"[-0.07947266846895218, 0.03986681252717972, 0....","[8.770747184753418, 5.308734893798828]"
1,1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"\nKen Carter lives in Richmond, California. He...","[-0.03712575137615204, 0.05965740978717804, 0....","[11.768545150756836, 3.8604795932769775]"
2,2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"\nAfter being killed in Daredevil, Elektra Nat...","[-0.019044550135731697, 0.00964392814785242, -...","[9.273338317871094, 7.270703315734863]"


In [29]:
import plotly.express as px

# Column holding the 2-D points, plus the fields shown when hovering a marker.
col = 'umap_embeddings_2dim'
hoverdata = ['url']
hovername = 'title'

# Rearrange the per-row 2-element lists into a (2, n_rows) array:
# row 0 is used as the x coordinates and row 1 as the y coordinates.
plot_values = np.array(df[col].tolist()).T
x, y = plot_values

fig = px.scatter(
    df,
    x=x,
    y=y,
    hover_data=hoverdata,
    hover_name=hovername,
    title='Visualize Movie Plots',
)
fig.show()

In [30]:
# Label every row of this frame with the constant type 'movie'.
df['type'] = 'movie'
df.head(n=3)

Unnamed: 0.1,Unnamed: 0,title,url,plot,embeddings,umap_embeddings_2dim,type
0,0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,\nJonathan Rivers is an architect and lives wi...,"[-0.07947266846895218, 0.03986681252717972, 0....","[8.770747184753418, 5.308734893798828]",movie
1,1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"\nKen Carter lives in Richmond, California. He...","[-0.03712575137615204, 0.05965740978717804, 0....","[11.768545150756836, 3.8604795932769775]",movie
2,2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"\nAfter being killed in Daredevil, Elektra Nat...","[-0.019044550135731697, 0.00964392814785242, -...","[9.273338317871094, 7.270703315734863]",movie


In [32]:
# CSV with one row per director, resolved relative to the working directory.
movie_directors_file = "movie_directors.csv"
df_directors = pd.read_csv(movie_directors_file)

# Preview the first three rows.
df_directors.head(n=3)

Unnamed: 0,selection1_name,selection1_url,selection1_selection2
0,David Lynch,https://www.imdb.com/name/nm0000186?ref_=nmls_hd,Born in precisely the kind of small-town Ameri...
1,Stanley Kubrick,https://www.imdb.com/name/nm0000040?ref_=nmls_hd,"Stanley Kubrick was born in Manhattan, New Yor..."
2,Robert Bresson,https://www.imdb.com/name/nm0000975?ref_=nmls_hd,Robert Bresson trained as a painter before mov...


In [34]:
# Encode the text in 'selection1_selection2' with sentence_encoder_model and
# store each resulting vector as a Python list in an 'embeddings' column.
director_bios = df_directors['selection1_selection2'].to_list()
embeddings_directors = sentence_encoder_model.encode(
    director_bios,
    show_progress_bar=True,
)
df_directors['embeddings'] = embeddings_directors.tolist()
df_directors.head(n=3)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,selection1_name,selection1_url,selection1_selection2,embeddings
0,David Lynch,https://www.imdb.com/name/nm0000186?ref_=nmls_hd,Born in precisely the kind of small-town Ameri...,"[-0.030740655958652496, -0.07743874937295914, ..."
1,Stanley Kubrick,https://www.imdb.com/name/nm0000040?ref_=nmls_hd,"Stanley Kubrick was born in Manhattan, New Yor...","[0.04756513983011246, -0.0028173087630420923, ..."
2,Robert Bresson,https://www.imdb.com/name/nm0000975?ref_=nmls_hd,Robert Bresson trained as a painter before mov...,"[-0.0322096012532711, -0.046513646841049194, -..."


In [35]:
# Bring the directors frame to the column layout title / url / embeddings / type:
# add a constant 'type' label, rename the name and URL columns, and keep only
# those four columns in that order.
df_directors = (
    df_directors
    .assign(type='director')
    .rename(columns={'selection1_name': 'title', 'selection1_url': 'url'})
    .loc[:, ['title', 'url', 'embeddings', 'type']]
)

df_directors.head(n=3)

Unnamed: 0,title,url,embeddings,type
0,David Lynch,https://www.imdb.com/name/nm0000186?ref_=nmls_hd,"[-0.030740655958652496, -0.07743874937295914, ...",director
1,Stanley Kubrick,https://www.imdb.com/name/nm0000040?ref_=nmls_hd,"[0.04756513983011246, -0.0028173087630420923, ...",director
2,Robert Bresson,https://www.imdb.com/name/nm0000975?ref_=nmls_hd,"[-0.0322096012532711, -0.046513646841049194, -...",director


In [37]:
# Keep only the title / url / embeddings / type columns of the movie frame.
df_movies = df.loc[:, ['title', 'url', 'embeddings', 'type']]

df_movies.head(n=3)

Unnamed: 0,title,url,embeddings,type
0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,"[-0.07947266846895218, 0.03986681252717972, 0....",movie
1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"[-0.03712575137615204, 0.05965740978717804, 0....",movie
2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"[-0.019044550135731697, 0.00964392814785242, -...",movie


In [45]:
# Stack movies and directors into one frame; ignore_index=True gives the
# result a fresh 0..n-1 index instead of repeating the source indexes.
df_combined = pd.concat([df_movies, df_directors], ignore_index=True)

# Seeded so the three preview rows are the same on every run.
df_combined.sample(3, random_state=42)

Unnamed: 0,title,url,embeddings,type
2981,Mainstream,https://en.wikipedia.org/wiki/Mainstream_(film),"[-0.0427854061126709, -0.10656245797872543, 0....",movie
2923,Penguin Bloom,https://en.wikipedia.org/wiki/Penguin_Bloom,"[-0.07983534783124924, -0.04206538200378418, 0...",movie
1988,La La Land,https://en.wikipedia.org/wiki/La_La_Land,"[0.011585831642150879, -0.09833086282014847, 0...",movie


In [48]:
# Rebuild a 2-D array (one row per item) from the per-row embedding lists.
embeddings_combined = np.array(df_combined['embeddings'].tolist())

# Project all embeddings down to 2 dimensions in a single fit. UMAP is
# stochastic, so random_state is fixed to make the layout reproducible across
# kernel restarts and re-runs.
umap_embeddings_2dim_combined = umap.UMAP(
    n_neighbors=5,
    n_components=2,
    min_dist=0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings_combined)

# Keep each 2-D point as a list next to the row it belongs to.
df_combined['umap_embeddings_2dim'] = umap_embeddings_2dim_combined.tolist()
df_combined.head(3)

Unnamed: 0,title,url,embeddings,type,umap_embeddings_2dim
0,White Noise,https://en.wikipedia.org/wiki/White_Noise_(200...,"[-0.07947266846895218, 0.03986681252717972, 0....",movie,"[8.922362327575684, 5.541539669036865]"
1,Coach Carter,https://en.wikipedia.org/wiki/Coach_Carter,"[-0.03712575137615204, 0.05965740978717804, 0....",movie,"[11.882466316223145, 4.2011003494262695]"
2,Elektra,https://en.wikipedia.org/wiki/Elektra_(2005_film),"[-0.019044550135731697, 0.00964392814785242, -...",movie,"[10.393428802490234, 8.02609920501709]"


In [50]:
# Column holding the 2-D points, the fields shown on hover, and the column
# that determines marker colour.
col = 'umap_embeddings_2dim'
hoverdata = ['url']
hovername = 'title'
color = 'type'

# Rearrange the per-row 2-element lists into a (2, n_rows) array:
# row 0 is used as the x coordinates and row 1 as the y coordinates.
plot_values = np.array(df_combined[col].tolist()).T
x, y = plot_values

fig = px.scatter(
    df_combined,
    color=color,
    x=x,
    y=y,
    hover_data=hoverdata,
    hover_name=hovername,
    title='Visualize Movie Plots and Directors',
)
fig.show()