In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer

In [2]:
# Read the video transcript table from disk, then preview the first five rows.
df = pl.read_csv(source='data/video-transcripts.csv')
df.head(n=5)

video_id,datetime,title,transcript
str,str,str,str
"""wwSzpaTHyS8""","""2024-01-30T15:00:03.000000""","""Did The Future Already Happen?…","""Do your past, present and futu…"
"""dFCbJmgeHmA""","""2021-06-15T14:09:03.000000""","""The Day the Dinosaurs Died – M…","""one of the greatest Illusions …"
"""1AElONvi9WQ""","""2020-03-01T13:30:00.000000""","""Why Blue Whales Don't Get Canc…","""Cancer is a creepy and mysteri…"
"""Hug0rfFC_L8""","""2014-12-18T14:02:27.000000""","""The Ultimate Conspiracy Debunk…","""The Internet is like a breedin…"
"""lXfEK8G8CUI""","""2021-08-10T13:59:41.000000""","""How The Immune System ACTUALLY…","""the human immune system is the…"


In [3]:
# Name of the sentence-transformers model passed to SentenceTransformer below.
model_name = "multi-qa-mpnet-base-dot-v1"

# Text columns of the dataframe that get an embedding each.
column_name_list = [
    "title",
    "transcript",
]

In [4]:
# Load the embedding model named in the configuration cell.
model = SentenceTransformer(model_name)

# Keep this cell safe to re-run: an earlier run leaves '<column>_embedding-<i>'
# columns in `df`, and a second horizontal concat would then fail on the
# duplicate column names. Drop any such leftover columns before recomputing.
embedding_prefixes = tuple(column_name + '_embedding-' for column_name in column_name_list)
stale_columns = [name for name in df.columns if name.startswith(embedding_prefixes)]
if stale_columns:
    df = df.drop(stale_columns)

embedding_frames = []
for column_name in column_name_list:

    # encode every value of the text column; the result is indexed as a
    # 2-D array below (shape[1] = number of embedding dimensions)
    embedding_arr = model.encode(df[column_name].to_list())

    # store embeddings in a dataframe, one float column per embedding dimension;
    # orient='row' states explicitly that the first axis holds the rows
    # instead of leaving the orientation to inference
    schema_dict = {column_name+'_embedding-'+str(i): float for i in range(embedding_arr.shape[1])}
    embedding_frames.append(pl.DataFrame(embedding_arr, schema=schema_dict, orient='row'))

# append embeddings to video index in a single concat rather than growing
# `df` once per loop iteration
df = pl.concat([df, *embedding_frames], how='horizontal')

In [5]:
# (rows, columns) of the dataframe after the embedding columns were appended
(df.height, df.width)

(217, 1540)

In [6]:
# Preview the first five rows, now including the embedding columns.
df.head(n=5)

video_id,datetime,title,transcript,title_embedding-0,title_embedding-1,title_embedding-2,title_embedding-3,title_embedding-4,title_embedding-5,title_embedding-6,title_embedding-7,title_embedding-8,title_embedding-9,title_embedding-10,title_embedding-11,title_embedding-12,title_embedding-13,title_embedding-14,title_embedding-15,title_embedding-16,title_embedding-17,title_embedding-18,title_embedding-19,title_embedding-20,title_embedding-21,title_embedding-22,title_embedding-23,title_embedding-24,title_embedding-25,title_embedding-26,title_embedding-27,title_embedding-28,title_embedding-29,title_embedding-30,title_embedding-31,title_embedding-32,…,transcript_embedding-731,transcript_embedding-732,transcript_embedding-733,transcript_embedding-734,transcript_embedding-735,transcript_embedding-736,transcript_embedding-737,transcript_embedding-738,transcript_embedding-739,transcript_embedding-740,transcript_embedding-741,transcript_embedding-742,transcript_embedding-743,transcript_embedding-744,transcript_embedding-745,transcript_embedding-746,transcript_embedding-747,transcript_embedding-748,transcript_embedding-749,transcript_embedding-750,transcript_embedding-751,transcript_embedding-752,transcript_embedding-753,transcript_embedding-754,transcript_embedding-755,transcript_embedding-756,transcript_embedding-757,transcript_embedding-758,transcript_embedding-759,transcript_embedding-760,transcript_embedding-761,transcript_embedding-762,transcript_embedding-763,transcript_embedding-764,transcript_embedding-765,transcript_embedding-766,transcript_embedding-767
str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""wwSzpaTHyS8""","""2024-01-30T15:00:03.000000""","""Did The Future Already Happen?…","""Do your past, present and futu…",-0.025905,-0.057205,-0.239068,-0.030249,-0.324251,-0.406952,0.154386,0.147527,-0.05657,0.183445,0.355817,0.253671,0.009071,-0.130822,0.093072,0.157849,0.044513,0.148476,-0.16883,-0.038468,-0.175704,0.007907,-0.160255,0.088843,-0.135349,0.215504,-0.403005,0.187053,-0.177082,0.280815,-0.029954,0.067943,-0.037771,…,-0.083062,-0.095702,0.167,0.02029,0.004926,-0.181677,-0.062654,0.028706,-0.135353,0.282397,0.413036,0.090605,-0.086156,0.247052,-0.266838,-0.22705,0.301867,0.34981,0.212191,0.017764,0.253068,0.262885,0.103541,-0.428383,-0.229856,0.319932,-0.280512,0.345449,0.038927,0.218713,0.20518,0.106285,-0.297685,0.180136,-0.116905,0.060121,0.044395
"""dFCbJmgeHmA""","""2021-06-15T14:09:03.000000""","""The Day the Dinosaurs Died – M…","""one of the greatest Illusions …",-0.051391,-0.076532,-0.327907,-0.016098,0.019872,-0.121818,0.036152,0.383335,-0.545619,0.152098,0.175513,0.31849,-0.420546,-0.272664,-0.073903,-0.403492,0.154533,0.028713,-0.153936,0.03519,-0.185023,-0.004032,-0.077323,-0.11822,-0.298985,0.060209,0.063826,0.016899,0.080266,-0.329317,-0.13258,-0.029919,0.215298,…,0.198497,-0.272788,0.093927,0.290756,0.136055,-0.020901,-0.362898,0.058989,-0.143124,0.065848,-0.132789,0.04787,-0.026864,0.354131,-0.226404,-0.226108,0.707069,0.404196,-0.134853,0.146131,-0.147008,-0.012603,-0.051642,-0.375915,-0.156023,0.174291,-0.108638,-0.102903,0.048374,-0.089792,-0.06719,0.003697,0.485208,0.053736,0.003127,-0.127867,-0.2931
"""1AElONvi9WQ""","""2020-03-01T13:30:00.000000""","""Why Blue Whales Don't Get Canc…","""Cancer is a creepy and mysteri…",0.331041,0.059231,-0.334223,-0.198434,0.177767,0.069817,-0.034318,0.516261,-0.077923,0.319862,0.007129,0.281118,0.029754,0.168317,-0.058006,-0.07772,0.194316,0.172918,0.449949,-0.565341,-0.279291,-0.183681,0.140431,-0.180643,0.318531,0.124176,0.039142,0.264325,-0.38974,-0.023645,-0.200744,-0.152801,-0.282532,…,-0.113357,-0.218527,0.293003,0.330546,-0.117394,0.007646,0.277167,-0.19762,-0.480463,0.161434,-0.479221,0.304498,-0.103126,0.25277,0.066134,-0.294965,0.174575,0.133719,-0.122697,-0.259427,-0.067257,0.427672,0.356506,-0.254509,-0.133694,0.007723,0.117205,-0.129653,-0.116634,-0.439639,-0.13738,-0.209195,0.303494,-0.202466,0.262715,0.278752,0.055127
"""Hug0rfFC_L8""","""2014-12-18T14:02:27.000000""","""The Ultimate Conspiracy Debunk…","""The Internet is like a breedin…",-0.208302,-0.127359,-0.214014,0.079502,-0.416126,-0.196103,0.262214,-0.022504,-0.201781,0.188749,0.036593,0.179495,-0.269735,-0.047,-0.048672,0.456909,0.245806,0.083118,0.106665,-0.196385,-0.130073,0.14085,-0.054648,-0.015026,-0.1206,0.099799,0.297291,-0.052437,-0.120966,-0.055295,-0.089876,0.182102,-0.082569,…,0.071869,-0.465544,-0.098535,0.172604,-0.189845,-0.036173,-0.287569,-0.061934,-0.058097,0.352294,-0.111119,0.00324,0.07724,0.081679,-0.082045,-0.156627,0.390386,0.187721,-0.161565,-0.265613,0.089716,0.123836,-0.066241,-0.187347,-0.073191,0.083775,0.100054,-0.124404,-0.107779,0.15533,0.086105,-0.152376,-0.181196,0.006073,0.400234,-0.168515,-0.017901
"""lXfEK8G8CUI""","""2021-08-10T13:59:41.000000""","""How The Immune System ACTUALLY…","""the human immune system is the…",0.3749,-0.359026,-0.237382,-0.044617,0.135443,-0.185199,0.302108,-0.259655,0.153136,0.294967,-0.199202,0.082187,0.125903,-0.129214,0.054529,0.231106,-0.167716,0.022192,-0.141061,-0.336385,-0.079946,0.06669,-0.42007,0.109387,0.55032,0.007437,-0.026803,0.146167,-0.115751,-0.088627,-0.020723,-0.184874,-0.053338,…,0.208923,-0.341126,0.101539,-0.032865,0.129658,-0.054247,0.096417,0.031902,-0.453055,0.346321,-0.382605,0.088905,0.070048,0.315211,-0.241491,-0.099406,0.40849,0.211493,-0.051736,-0.154101,0.118067,-0.222451,0.074716,0.006628,-0.352337,0.072058,-0.114752,-0.166804,0.038688,0.305608,-0.038484,-0.265214,-0.079144,0.419046,0.039504,-0.211684,-0.232104


In [8]:
# Persist the dataframe (original columns plus embedding columns) as CSV.
df.write_csv(file='data/video-index.csv')