## Generate TF-IDF & Sentence Transformers embeddings

In [7]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

In [59]:
### read the movie plot corpus from its parquet artifact into a dataframe
### (recorded output shows the columns: index, url, title, plot)
plots_path = "../artifacts/movie_plots.parquet"
df = pd.read_parquet(plots_path)
### trailing expression renders the frame for a quick visual check
df

Unnamed: 0,index,url,title,plot
0,0,https://en.wikipedia.org/wiki/White_Noise_(200...,White Noise (2005 film),Jonathan Rivers is an architect and lives with...
1,1,https://en.wikipedia.org/wiki/Coach_Carter,Coach Carter,"Ken Carter lives in Richmond, California. He b..."
2,2,https://en.wikipedia.org/wiki/Elektra_(2005_film),Elektra (2005 film),"After being killed,[a] Elektra Natchios is rev..."
3,3,https://en.wikipedia.org/wiki/Racing_Stripes,Racing Stripes,"During a thunderstorm, a traveling circus, Cir..."
4,4,https://en.wikipedia.org/wiki/Tom_and_Jerry:_B...,Tom and Jerry: Blast Off to Mars,Tom chases Jerry as usual from their house and...
...,...,...,...,...
3503,4038,https://en.wikipedia.org/wiki/Whitney_Houston:...,Whitney Houston: I Wanna Dance with Somebody,"In 1983, 19-year-old Whitney Houston is being ..."
3504,4039,https://en.wikipedia.org/wiki/The_Pale_Blue_Eye,The Pale Blue Eye,"In October 1830, alcoholic retired detective A..."
3505,4040,https://en.wikipedia.org/wiki/Women_Talking_(f...,Women Talking (film),"A young woman sleeps alone, in bed. There are ..."
3506,4041,https://en.wikipedia.org/wiki/A_Man_Called_Otto,A Man Called Otto,"Otto Anderson is a 63-year-old widower, living..."


### TF-IDF embeddings

In [60]:
### fit a TF-IDF vectorizer on the plot column; fit_transform hands back a
### sparse document-term matrix
vec = TfidfVectorizer()
sparse_tfidf = vec.fit_transform(df["plot"])
### expand the sparse matrix into a dense 2-D numpy array
### NOTE(review): on the recorded run this is 3508 x 48128, i.e. more than a
### gigabyte if the values are float64 -- confirm this fits in memory
X = sparse_tfidf.toarray()
### show the dense matrix first, then its (documents, vocabulary terms) shape
display(X)
X.shape

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

(3508, 48128)

In [61]:
### wrap the dense TF-IDF matrix in a dataframe: one row per plot and one
### integer-labelled column per vocabulary term
### NOTE: this rebinds `df`, so the plot corpus is no longer reachable under that name
df = pd.DataFrame(data=X)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48118,48119,48120,48121,48122,48123,48124,48125,48126,48127
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
### persist the TF-IDF dataframe: build an Arrow table from it, then write
### that table as a parquet file (relative path, so it lands in the current
### working directory)
table = pa.Table.from_pandas(df)
pq.write_table(table, where='tfidf_embeddings.parquet')

In [63]:
### round-trip check: load the TF-IDF parquet file back through pyarrow and
### turn it into a pandas dataframe for inspection
reloaded = pq.read_table("tfidf_embeddings.parquet")
df = reloaded.to_pandas()
df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48118,48119,48120,48121,48122,48123,48124,48125,48126,48127
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3506,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
### convert the reloaded TF-IDF dataframe to a 2-D numpy array; each row
### represents the embedding of one plot (recorded shape: 3508 x 48128)
tfidf = df.to_numpy()
tfidf.shape

(3508, 48128)

### Sentence Transformers embeddings

In [65]:
### `df` currently holds the TF-IDF frame, so read the movie plot corpus from
### its parquet artifact again before computing sentence embeddings
plots_path = "../artifacts/movie_plots.parquet"
df = pd.read_parquet(plots_path)
### trailing expression renders the frame for a quick visual check
df

Unnamed: 0,index,url,title,plot
0,0,https://en.wikipedia.org/wiki/White_Noise_(200...,White Noise (2005 film),Jonathan Rivers is an architect and lives with...
1,1,https://en.wikipedia.org/wiki/Coach_Carter,Coach Carter,"Ken Carter lives in Richmond, California. He b..."
2,2,https://en.wikipedia.org/wiki/Elektra_(2005_film),Elektra (2005 film),"After being killed,[a] Elektra Natchios is rev..."
3,3,https://en.wikipedia.org/wiki/Racing_Stripes,Racing Stripes,"During a thunderstorm, a traveling circus, Cir..."
4,4,https://en.wikipedia.org/wiki/Tom_and_Jerry:_B...,Tom and Jerry: Blast Off to Mars,Tom chases Jerry as usual from their house and...
...,...,...,...,...
3503,4038,https://en.wikipedia.org/wiki/Whitney_Houston:...,Whitney Houston: I Wanna Dance with Somebody,"In 1983, 19-year-old Whitney Houston is being ..."
3504,4039,https://en.wikipedia.org/wiki/The_Pale_Blue_Eye,The Pale Blue Eye,"In October 1830, alcoholic retired detective A..."
3505,4040,https://en.wikipedia.org/wiki/Women_Talking_(f...,Women Talking (film),"A young woman sleeps alone, in bed. There are ..."
3506,4041,https://en.wikipedia.org/wiki/A_Man_Called_Otto,A Man Called Otto,"Otto Anderson is a 63-year-old widower, living..."


In [66]:
### load the pre-trained all-MiniLM-L6-v2 sentence-transformers model
model = SentenceTransformer('all-MiniLM-L6-v2')
### encode() is documented to take a str or a list of str, so pass a plain
### Python list rather than the pandas Series. A Series is indexed by label,
### not by position, so handing it over directly is only safe while the
### dataframe keeps its default 0..n-1 index; a list keeps row i of the result
### aligned with row i of `df` regardless of the index.
### NOTE(review): the model presumably truncates inputs longer than its maximum
### sequence length, so long plots may be only partially embedded -- confirm.
sentence_embeddings = model.encode(df["plot"].tolist())
### display the resulting 2-D float array (one row per plot)
sentence_embeddings

array([[-0.07947264,  0.03986682,  0.01542153, ...,  0.01449287,
        -0.01010946, -0.03626271],
       [-0.03582615,  0.05858802,  0.00956734, ...,  0.00304488,
        -0.08153194, -0.0241226 ],
       [-0.02083369,  0.02929116, -0.04200213, ..., -0.01743418,
         0.00493872,  0.00942789],
       ...,
       [ 0.02226734,  0.00947231, -0.03072867, ..., -0.06515969,
        -0.03394407, -0.05770543],
       [-0.00147249,  0.07110316, -0.01378531, ...,  0.02559882,
         0.05347443, -0.04887092],
       [-0.03115228, -0.01405397,  0.04924991, ...,  0.03488774,
        -0.03904548, -0.08766331]], dtype=float32)

In [69]:
### sanity check: dimensions of the sentence-embedding matrix
### (recorded output: 3508 rows x 384 embedding dimensions)
sentence_embeddings.shape

(3508, 384)

In [70]:
### wrap the sentence-embedding matrix in a dataframe: one row per plot and
### one integer-labelled column per embedding dimension
### NOTE: this rebinds `df`, so the plot corpus is no longer reachable under that name
df = pd.DataFrame(data=sentence_embeddings)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.079473,0.039867,0.015422,0.001454,0.054188,0.008344,0.062162,-0.038170,0.074984,-0.059262,...,0.056672,-0.064081,0.031193,-0.075870,-0.092477,0.007825,0.087413,0.014493,-0.010109,-0.036263
1,-0.035826,0.058588,0.009567,-0.051754,0.007164,0.005096,-0.008980,0.017756,0.112654,0.055845,...,0.048709,-0.004139,-0.097728,0.080659,-0.040395,-0.077078,-0.038374,0.003045,-0.081532,-0.024123
2,-0.020834,0.029291,-0.042002,0.009461,0.119430,0.054744,0.072913,0.014281,0.034877,0.022201,...,0.098046,-0.113098,0.045841,0.025968,0.004102,0.055712,-0.002792,-0.017434,0.004939,0.009428
3,-0.108421,-0.000945,-0.026790,-0.022783,-0.035257,0.043165,0.061273,-0.021300,0.010107,0.010341,...,-0.038871,-0.124378,-0.070355,-0.013237,-0.046645,0.080196,-0.029096,-0.024358,0.092425,-0.078799
4,-0.033221,-0.006662,0.015374,-0.041513,-0.044280,-0.049768,0.103815,-0.051591,0.030272,-0.020047,...,0.053476,-0.027529,-0.069866,-0.017386,0.063754,0.045215,-0.016984,0.100490,-0.020218,0.019145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503,-0.015937,-0.045859,0.038560,-0.013979,-0.032743,0.017974,-0.004209,0.095838,0.023471,-0.081796,...,0.055380,-0.042650,-0.016786,-0.007887,0.001550,0.039736,0.050041,-0.079494,0.075386,-0.027209
3504,-0.054327,-0.025792,-0.047029,0.000116,0.023716,0.022768,0.042176,-0.066350,0.013970,-0.062164,...,0.037129,-0.081040,-0.019769,0.058770,-0.087794,-0.036639,0.066967,0.043139,-0.066197,0.008521
3505,0.022267,0.009472,-0.030729,0.055254,0.060820,0.053154,0.076183,-0.080574,-0.029180,0.079510,...,0.049057,-0.044715,-0.041976,0.066816,0.021295,0.083398,-0.038968,-0.065160,-0.033944,-0.057705
3506,-0.001472,0.071103,-0.013785,0.059384,0.011841,0.023274,0.001098,0.076087,0.026722,-0.069878,...,0.098609,-0.040856,-0.016767,0.053141,-0.027726,0.066312,0.048097,0.025599,0.053474,-0.048871


In [71]:
### persist the sentence-embedding dataframe: build an Arrow table from it,
### then write that table as a parquet file (relative path, so it lands in
### the current working directory)
table = pa.Table.from_pandas(df)
pq.write_table(table, where='sbert_embeddings.parquet')

In [72]:
### round-trip check: load the sentence-embedding parquet file back through
### pyarrow and turn it into a pandas dataframe for inspection
reloaded = pq.read_table("sbert_embeddings.parquet")
df = reloaded.to_pandas()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.079473,0.039867,0.015422,0.001454,0.054188,0.008344,0.062162,-0.038170,0.074984,-0.059262,...,0.056672,-0.064081,0.031193,-0.075870,-0.092477,0.007825,0.087413,0.014493,-0.010109,-0.036263
1,-0.035826,0.058588,0.009567,-0.051754,0.007164,0.005096,-0.008980,0.017756,0.112654,0.055845,...,0.048709,-0.004139,-0.097728,0.080659,-0.040395,-0.077078,-0.038374,0.003045,-0.081532,-0.024123
2,-0.020834,0.029291,-0.042002,0.009461,0.119430,0.054744,0.072913,0.014281,0.034877,0.022201,...,0.098046,-0.113098,0.045841,0.025968,0.004102,0.055712,-0.002792,-0.017434,0.004939,0.009428
3,-0.108421,-0.000945,-0.026790,-0.022783,-0.035257,0.043165,0.061273,-0.021300,0.010107,0.010341,...,-0.038871,-0.124378,-0.070355,-0.013237,-0.046645,0.080196,-0.029096,-0.024358,0.092425,-0.078799
4,-0.033221,-0.006662,0.015374,-0.041513,-0.044280,-0.049768,0.103815,-0.051591,0.030272,-0.020047,...,0.053476,-0.027529,-0.069866,-0.017386,0.063754,0.045215,-0.016984,0.100490,-0.020218,0.019145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3503,-0.015937,-0.045859,0.038560,-0.013979,-0.032743,0.017974,-0.004209,0.095838,0.023471,-0.081796,...,0.055380,-0.042650,-0.016786,-0.007887,0.001550,0.039736,0.050041,-0.079494,0.075386,-0.027209
3504,-0.054327,-0.025792,-0.047029,0.000116,0.023716,0.022768,0.042176,-0.066350,0.013970,-0.062164,...,0.037129,-0.081040,-0.019769,0.058770,-0.087794,-0.036639,0.066967,0.043139,-0.066197,0.008521
3505,0.022267,0.009472,-0.030729,0.055254,0.060820,0.053154,0.076183,-0.080574,-0.029180,0.079510,...,0.049057,-0.044715,-0.041976,0.066816,0.021295,0.083398,-0.038968,-0.065160,-0.033944,-0.057705
3506,-0.001472,0.071103,-0.013785,0.059384,0.011841,0.023274,0.001098,0.076087,0.026722,-0.069878,...,0.098609,-0.040856,-0.016767,0.053141,-0.027726,0.066312,0.048097,0.025599,0.053474,-0.048871


In [73]:
### convert the reloaded sentence-embedding dataframe to a 2-D numpy array;
### each row represents the embedding of one plot
### The array gets its own name: binding it to `tfidf` (as the TF-IDF section
### does for its matrix) would silently replace the TF-IDF embeddings with
### these sentence-transformer embeddings.
sbert = df.to_numpy()
sbert.shape

(3508, 384)