In [78]:
# !pip install sentence_transformers
# !pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
# !pip3 install black[jupyter]


In [79]:
import pandas as pd
import os
# import nltk
import re
from transformers import DistilBertModel
from sentence_transformers import SentenceTransformer
import torch
# import torchvision
import numpy as np


In [80]:
# Modified version of Document where tokens are not included 
class Document:
  def __init__(self, doc_no, doc_text, vector):
    self.doc_no = doc_no
    self.doc_text = doc_text
    # self.tokens = tokens
    self.vector = vector

  def __str__(self):
    # return 'Document Number: ' + self.doc_no + '\nDocument Text: ' + self.doc_text + '\nTokens: ' + str(self.tokens) + '\n'
    return 'Document Number: ' + self.doc_no + '\nDocument Text: ' + self.doc_text  + '\n Vectors: ' + str(self.vector) + '\n'


  def to_dict(self):
    # return {'docno': self.doc_no, 'doctext': self.doc_text, 'tokens': self.tokens, 'text': ' '.join(self.tokens)}
    return {'docno': self.doc_no, 'doctext': self.doc_text, 'vector': self.vector.tolist()}


In [81]:
# Modified version of preprocess where tokenizing is removed 
def preprocess(file):
    with open(file, "r") as f:
        content = f.read()
    documents = re.findall(r'<DOC>(.*?)</DOC>', content, re.DOTALL)
    preprocessed_documents = []
    for document in documents:
        # Get the document number and text
        raw_no = re.search(r'<DOCNO>(.*?)</DOCNO>', document, re.DOTALL)
        doc_no = raw_no.group(1) if raw_no else ''
        raw_text = re.search(r'<TEXT>(.*?)</TEXT>', document, re.DOTALL)
        doc_text = raw_text.group(1) if raw_text else ''
        doc = Document(doc_no, doc_text, None)
        preprocessed_documents.append(doc)
    return preprocessed_documents


In [82]:
# main function to preprocess a directory of text files
def preprocess_directory(directory, num_files=-1):
  preprocessed_documents = [] 
  ctr = 0
  for filename in os.listdir(directory):
    print('Preprocessing file: ', filename)
    file = os.path.join(directory, filename)
    preprocessed_documents.extend(preprocess(file))
    ctr += 1
    if ctr == num_files and num_files != -1:
      break
  return preprocessed_documents


In [83]:

# Grabs the queries?
def extract_topics(file):
    with open(file, "r") as f:
        topic_content = f.read()
    all_topics = []
    topics = re.findall(r'<top>(.*?)</top>', topic_content, re.DOTALL)
    for topic in topics:
        raw_title = re.search(r'<title>(.*?)\n\n', topic, re.DOTALL)
        title = raw_title.group(1) if raw_title else ''
        all_topics.append(title)
    return all_topics


In [84]:
extracted_documents = []
extracted_documents = preprocess_directory('AP_collection\coll')

Preprocessing file:  AP880212
Preprocessing file:  AP880213
Preprocessing file:  AP880214
Preprocessing file:  AP880215
Preprocessing file:  AP880216
Preprocessing file:  AP880217
Preprocessing file:  AP880218
Preprocessing file:  AP880219
Preprocessing file:  AP880220
Preprocessing file:  AP880221
Preprocessing file:  AP880222
Preprocessing file:  AP880223
Preprocessing file:  AP880224
Preprocessing file:  AP880225
Preprocessing file:  AP880226
Preprocessing file:  AP880227
Preprocessing file:  AP880228
Preprocessing file:  AP880229
Preprocessing file:  AP880301
Preprocessing file:  AP880302
Preprocessing file:  AP880303
Preprocessing file:  AP880304
Preprocessing file:  AP880307
Preprocessing file:  AP880308
Preprocessing file:  AP880309
Preprocessing file:  AP880310
Preprocessing file:  AP880311
Preprocessing file:  AP880312
Preprocessing file:  AP880313
Preprocessing file:  AP880314
Preprocessing file:  AP880315
Preprocessing file:  AP880316
Preprocessing file:  AP880317
Preprocess

In [85]:
# print(extracted_documents[0].doc_no + "\n")
# print(extracted_documents[0].doc_text) 

In [86]:
topics = extract_topics("topics1-50.txt")


# DistillBERT Setup (with Cuda)


In [87]:
model = SentenceTransformer('all-distilroberta-v1', device="cuda:0")


In [88]:
torch.cuda.is_available()

True

In [89]:
torch.cuda.current_device()

0

In [90]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Ti Laptop GPU'

In [91]:
import datetime
import pytz

# Set the timezone to Eastern Standard Time (EST)
tz = pytz.timezone('US/Eastern')

def print_time_est():
    # Get the current time in EST
    est_time = datetime.datetime.now(tz)

    # Print the current EST time
    print("Current EST time:", est_time)


In [92]:
print_time_est()

Current EST time: 2023-04-02 14:58:56.263983-04:00


In [93]:
# for doc in extracted_documents: 
#     doc.vector = model.encode([doc.doc_text])
#     print(doc.doc_no + " is done.")

In [94]:
print_time_est()

Current EST time: 2023-04-02 14:58:56.371582-04:00


In [95]:
len(extracted_documents)

79923

In [96]:
extracted_documents[79922].vector

In [97]:
from sklearn.metrics.pairwise import cosine_similarity


In [98]:
extracted_documents[0].vector = model.encode(extracted_documents[0].doc_text)

# Output to File 


In [63]:
# LEGACY CODE 
# import json 

# with open("embedding_saves/distilroberta.json", "w") as outfile:
#     # for doc in extracted_documents:
#     #     json.dump(doc.to_dict(), outfile)
#     json.dump(extracted_documents[0].to_dict(),outfile)

### NOT compressed (too big for Github)

#### Possible csv write *(not compressed)


In [64]:
import csv

# assuming you have a list of Document objects called documents
# and assuming you have already populated the vector attribute of each Document object

# define the headers for your CSV file
headers = ['doc_no', 'doc_text', 'vector']

# open the CSV file in 'w' mode and write the headers
with open("embedding_saves/distilroberta.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(headers)

    # loop through each Document object and write its attributes to the CSV file
    for document in extracted_documents:
        writer.writerow([document.doc_no, document.doc_text, document.vector.tolist() if document.vector is not None else None])


#### Possible csv read *(not compressed)


In [65]:
# read the CSV file and create new Document objects
extracted_documents = []
# --------------------------------------------------------------
# CHANGE THIS TO extracted_documents later 
# --------------------------------------------------------------
with open("embedding_saves/distilroberta.csv", mode='r') as file:
    reader = csv.reader(file)
    headers = next(reader) # skip the header row

    for row in reader:
        doc_no = row[0]
        doc_text = row[1]
        vector = np.array(row[2], dtype=float)
        document = Document(doc_no, doc_text, vector)
        extracted_documents.append(document)


ValueError: could not convert string to float: '[-0.010395441204309464, -0.08126188069581985, 0.0030520400032401085, -0.051090486347675323, 0.059318866580724716, 0.08064533770084381, -0.0001100039662560448, -0.008711610920727253, -0.014506327919661999, -0.004800998605787754, -0.03369065374135971, 0.0008512154454365373, -0.036835089325904846, -0.04161018505692482, -0.009351484477519989, -0.05484415218234062, 0.013274725526571274, -0.04045410826802254, 0.023280872032046318, 0.043757013976573944, -0.00033808808075264096, -0.03674377501010895, -0.012431818060576916, -0.001356494496576488, -0.03808555379509926, -0.0033934894017875195, 0.002981443190947175, 0.021589577198028564, 0.001361951231956482, 0.04464486241340637, -0.006879636086523533, 0.015462028793990612, -0.004351004026830196, 0.03017631359398365, -0.023280281573534012, 0.052706409245729446, 0.061508309096097946, -0.021434402093291283, 0.03548242524266243, 0.0006167553947307169, 0.019133636727929115, 0.07385644316673279, 0.06488525122404099, -0.015542103908956051, 0.029474250972270966, 0.050524260848760605, -0.05446946993470192, 0.043112657964229584, -0.018399741500616074, 0.0067999535240232944, -0.0119393989443779, 0.031548116356134415, -0.01822241209447384, 0.09563520550727844, 0.023923898115754128, 0.0041021015495061874, -0.0008029509335756302, -0.057666875422000885, -0.034062277525663376, 0.0040086302906274796, 0.0160569678992033, -0.09846959263086319, -0.01299780048429966, -0.013835836201906204, -0.04046562314033508, -0.000939497200306505, -0.034739743918180466, -0.029895160347223282, 0.02186059020459652, -0.03416147828102112, -0.029537996277213097, 0.05073009803891182, 0.02713531069457531, -0.09692453593015671, -0.030576031655073166, -0.0320887416601181, -0.005824755411595106, 0.006061210762709379, -0.0027896712999790907, 0.006402927450835705, 0.025270914658904076, 0.03831854462623596, 0.033601947128772736, 0.03540806472301483, 0.061814822256565094, 0.02115263231098652, 0.0381525382399559, -0.014663275331258774, -0.044189292937517166, -0.008600931614637375, 0.03749668598175049, -0.03730189800262451, -0.040878988802433014, 0.005396823398768902, -0.007805720437318087, -0.034458596259355545, -0.006608644966036081, 0.12716200947761536, -0.015951376408338547, -0.027505608275532722, 0.005644319579005241, 0.03458060696721077, 0.021426599472761154, -0.018885130062699318, 0.012386049143970013, 0.0012181538622826338, 0.006704031489789486, -0.0797676295042038, -0.04656076431274414, 0.002314760349690914, -0.031683217734098434, 0.044396333396434784, -0.07968784123659134, 0.03927421197295189, 0.034663911908864975, -0.0029636688996106386, 0.007290131412446499, -0.046922843903303146, 0.007828122936189175, -0.006733778398483992, -0.05131402239203453, -0.03435452654957771, 0.04061906784772873, 0.08440046012401581, 0.01670333929359913, 0.013826636597514153, -0.055704209953546524, -0.047460734844207764, 0.007226555608212948, -0.021724410355091095, -0.03947891294956207, -0.011899634264409542, -0.010522090829908848, -0.004266907926648855, 0.025320589542388916, -0.010040108114480972, 0.02064961940050125, 0.03982743248343468, 0.02535335347056389, 0.06460213661193848, 0.0764629915356636, 0.04478584602475166, -0.04448746144771576, 0.029020266607403755, 0.004205633420497179, 0.0015482817543670535, -0.0322735458612442, -0.005765872076153755, 0.022720446810126305, 0.03231160342693329, -0.02936515212059021, 0.02425779588520527, 0.031861789524555206, 0.03743167221546173, -0.04536532983183861, -0.03962266072630882, -0.08292163163423538, 0.014205009676516056, -0.012623926624655724, 0.011378942057490349, -0.010275824926793575, 0.010488706640899181, -7.956641638884321e-05, -0.0012872031657025218, 0.0023420648649334908, 0.03160101920366287, 0.012914208695292473, 0.02488667145371437, 0.02055545523762703, -0.03179716318845749, 0.003321251831948757, -0.021948328241705894, -0.015918904915452003, -0.03934164717793465, -0.040357667952775955, -0.034831687808036804, 0.06054408848285675, -0.004720957484096289, 0.03152510151267052, 0.019721249118447304, 0.004934951663017273, -0.006566908676177263, -0.05597623437643051, 0.05456381291151047, 0.04972818121314049, 0.04002906009554863, 0.001069344230927527, -0.006520562339574099, 0.056175366044044495, -0.0566600076854229, -0.019608883187174797, 0.04253778234124184, -0.004214616026729345, -0.033151108771562576, -0.030005844309926033, -0.047906093299388885, 0.04422558471560478, 0.021543577313423157, 0.031617339700460434, 0.0021918327547609806, -0.03654882684350014, -0.015477010048925877, 0.044761113822460175, 0.022124603390693665, -0.0593327172100544, -0.026361161842942238, -0.032933276146650314, 0.008416092954576015, -0.00754788052290678, -0.010817606002092361, 0.000757628062274307, -0.007385297678411007, 0.057605527341365814, 0.0368424654006958, 0.026638345792889595, 0.03610937297344208, 0.004336001817137003, 0.007201069965958595, 0.021581552922725677, -0.0068160113878548145, -0.015448764897882938, -0.016736941412091255, -0.060494694858789444, -0.06765300035476685, 0.015604431740939617, -0.008227188140153885, -0.012303843162953854, -0.014850822277367115, -0.017800547182559967, -0.05879628658294678, -0.019828621298074722, -0.06675481796264648, 0.0015088641084730625, -0.021496284753084183, 0.05202611908316612, -0.008289293386042118, 0.047697145491838455, -0.03395216912031174, -0.032891783863306046, 0.024626940488815308, 0.004237796179950237, 0.020130854099988937, 0.07490897923707962, 0.00863238051533699, 0.055426277220249176, 0.003650050610303879, -0.05463224649429321, 0.037788160145282745, 0.08933666348457336, 0.055707987397909164, 0.02274926006793976, -0.01878364384174347, 0.016616258770227432, -0.051237888634204865, -0.04380381107330322, -0.005590404383838177, -0.005579289048910141, -0.03386809304356575, 0.028212033212184906, 0.05027817562222481, -0.0553150475025177, 0.08303753286600113, 0.08958083391189575, -0.042534179985523224, -0.021305352449417114, -0.034611426293849945, 0.005327358841896057, -0.01955021545290947, -0.018331730738282204, 0.035045113414525986, -0.02333020605146885, 0.020798442885279655, -0.04618434980511665, 0.05373482406139374, -0.008817283436655998, 0.029523806646466255, -0.051692962646484375, 0.016905156895518303, -0.026108939200639725, -0.028265774250030518, -0.07156042754650116, -0.02963254787027836, -0.016015702858567238, 0.00914317648857832, 0.0050862873904407024, 0.006771995685994625, -0.03027372434735298, -0.046079348772764206, 0.027257196605205536, -0.009755820035934448, -0.052555106580257416, 0.06706923991441727, -0.03310060501098633, 0.05292665585875511, -0.015537500381469727, -0.015657270327210426, 0.10007793456315994, -0.01966158300638199, 0.023976009339094162, -0.04516592249274254, -0.029250649735331535, -0.035734668374061584, -0.01724039949476719, 0.05350407212972641, 0.021496256813406944, 0.04852587729692459, -0.02875085547566414, -0.0069650691002607346, -0.0012877010740339756, -0.00881936214864254, 0.011929545551538467, -0.0036407040897756815, -0.0468984916806221, 0.024171730503439903, -0.02036074548959732, 0.029891423881053925, 0.009669722057878971, -0.09033609181642532, -0.05920124053955078, 0.07351566106081009, -0.031998131424188614, -0.027873750776052475, -0.0033057313412427902, -0.007684781681746244, 0.002368407091125846, 0.03826623782515526, -0.053724419325590134, 0.09123650938272476, 0.015400375239551067, -0.015204194001853466, 0.10089296102523804, 0.04624401032924652, 0.019685251638293266, 0.07773285359144211, 0.038993217051029205, -0.06013987958431244, -0.02392563596367836, 0.011446277610957623, 0.017068974673748016, 0.06025318801403046, -0.05653951317071915, -0.02709568291902542, 0.02217281609773636, 0.0064035747200250626, 0.011930307373404503, -0.0054131015203893185, 0.07539164274930954, 0.0017147741746157408, -0.008474492467939854, -0.09512597322463989, 0.019055364653468132, -0.0007794857956469059, 0.019629616290330887, 0.014393080025911331, -0.043674174696207047, -0.020114442333579063, -0.04512499272823334, 0.006957365665584803, -0.003251070622354746, -0.015457049012184143, -0.017615990713238716, 0.005036943592131138, 0.008399641141295433, -0.03291548416018486, -0.023599663749337196, -0.03203875944018364, -0.020081983879208565, 0.029484419152140617, -0.02798977866768837, -0.03038189746439457, 0.046005357056856155, -0.03465776517987251, 0.00447165546938777, 0.007935824804008007, 0.018864354118704796, 0.029843822121620178, -0.043568797409534454, 0.02982291951775551, 0.0023339625913649797, -0.01739664189517498, -0.014258450828492641, 0.007722200360149145, -0.04667655751109123, 0.044729482382535934, 0.03543441370129585, -0.021785393357276917, 0.004509361460804939, -0.03376961499452591, 0.011659211479127407, 0.00946053210645914, 0.012712862342596054, -0.021164918318390846, 0.031586240977048874, -0.006281352136284113, -0.01770126074552536, 0.036047086119651794, -0.03216465562582016, -0.003448856296017766, -0.07223202288150787, -0.009488120675086975, 0.055955320596694946, -0.028211183845996857, 0.08431088179349899, -0.04748981073498726, 0.002774635562673211, 0.04819558560848236, 0.03984127566218376, -0.0167430117726326, 0.06205927953124046, -0.04280180111527443, -0.07842056453227997, 0.009406154975295067, 0.04832495376467705, 0.030791953206062317, -0.05719143524765968, -0.008145798929035664, 0.006553814746439457, 0.026683086529374123, -0.047703273594379425, -0.03796714171767235, 0.01349884644150734, -0.028200140222907066, -0.041006628423929214, 0.037568967789411545, -0.03830037638545036, -0.05882728099822998, 0.02710692584514618, 0.003325097495689988, 0.024634720757603645, -0.06859856098890305, -0.04255691170692444, 0.011052190326154232, -0.05914317071437836, -0.035135187208652496, -0.037036869674921036, -0.0034707989543676376, 0.028429480269551277, 0.025131652131676674, 0.014198748394846916, 0.010719791986048222, -0.06765874475240707, -0.0042798700742423534, 0.07290685176849365, 0.006388765759766102, -0.024987496435642242, 0.0028806934133172035, 0.06759823113679886, 0.002993415342643857, -0.03660285100340843, 0.0426475889980793, -0.0571364089846611, 0.06224846467375755, -0.009123769588768482, -0.09641797840595245, 0.0460490882396698, -0.04694138467311859, 0.009643126279115677, 0.04355410113930702, -0.0020165271125733852, -0.014821897260844707, 0.05781308561563492, 0.06391756981611252, 0.016424864530563354, -0.005695733707398176, 0.017515871673822403, -0.006532914936542511, -0.014527116902172565, -0.067142553627491, -0.016710715368390083, 0.005134819075465202, 0.05802058055996895, 0.019574619829654694, -0.008631082251667976, -0.07643710821866989, -0.041108012199401855, -0.019667010754346848, -0.03127371519804001, -0.0020706872455775738, 0.022710297256708145, 0.017957663163542747, -0.019543107599020004, 0.0002439228555886075, -0.031377166509628296, 0.010633468627929688, 0.026015043258666992, -0.03275870904326439, 0.019920142367482185, 0.031209208071231842, -0.020832667127251625, -0.021044736728072166, 0.0001528310967842117, 0.04037153348326683, 0.04599146917462349, 0.01589193195104599, 3.464913597106668e-33, -0.009084516204893589, 0.007378256414085627, 0.05599281191825867, -0.008912508375942707, 0.0032754584681242704, -0.0015144204953685403, 0.01976841688156128, -0.003718902822583914, 0.03575844317674637, -0.017348088324069977, 0.0494852252304554, -0.0066415732726454735, 0.029947390779852867, -0.07952092587947845, 0.018796103075146675, 0.024746255949139595, -0.04321340471506119, 0.007273540832102299, -0.000644639425445348, -0.009726735763251781, 0.02641560509800911, 0.005630119703710079, -0.0031049330718815327, 0.008096123114228249, -0.02114862948656082, 0.005176534876227379, -0.01002541184425354, -0.05632942169904709, -0.0014767529210075736, 0.013869845308363438, -0.019614998251199722, 0.07713010907173157, 0.02298092283308506, -0.045059364289045334, -0.046245723962783813, -0.03295644372701645, -0.0031783031299710274, 0.05270206183195114, -0.027550719678401947, 0.00514420447871089, -0.025983240455389023, 0.004386067856103182, -0.030544284731149673, -0.015155435539782047, -0.028287917375564575, -0.00311047350987792, 0.04626725986599922, -0.022900745272636414, -0.05017147213220596, 0.000802915426902473, -0.031550608575344086, -0.006303539965301752, -0.021678678691387177, -0.06786273419857025, -0.014365308918058872, 0.06005313992500305, 0.07983911782503128, -0.015612734481692314, 0.044924668967723846, 8.888685988495126e-05, -0.043244145810604095, 0.032044872641563416, -0.03335244953632355, 0.03581715747714043, -0.02024567499756813, -0.01740494929254055, 0.0713808685541153, 0.03747258707880974, -0.038504309952259064, -0.009063382633030415, 0.05598586052656174, -0.09610145539045334, -0.02188888192176819, 0.004931097384542227, 0.020946402102708817, 0.015804093331098557, 0.030526069924235344, 0.053071651607751846, 0.0339667908847332, -0.005697616841644049, -0.0016555896727368236, 0.06148284673690796, -0.011162303388118744, -0.03717966005206108, -0.03699537739157677, 0.015568185597658157, 0.009686212986707687, 0.0735248252749443, -0.041956376284360886, -0.027572352439165115, 0.04506409913301468, 0.0034904966596513987, -0.0403670072555542, 0.0013466511154547334, -0.0013288503978401423, 0.040796201676130295, -0.03513139486312866, -0.033252183347940445, -0.029421107843518257, -0.009245318360626698, 0.018899209797382355, -0.0018577862065285444, 0.07840297371149063, -0.023488454520702362, 0.00898959394544363, -0.020314106717705727, -0.034664660692214966, -0.05692959576845169, 0.011738472618162632, -0.045969974249601364, 0.0073384130373597145, 0.01689162105321884, 0.02188573218882084, -0.0602116622030735, -0.020142415538430214, -0.021378202363848686, 0.016954291611909866, 0.0020064767450094223, -0.05481302738189697, -0.05240252986550331, -0.01888653077185154, -0.029122943058609962, -0.024570830166339874, -0.018757518380880356, -0.018704136833548546, 0.015209329314529896, 0.004917975980788469, -0.04667702689766884, -0.006483151577413082, 0.0038033376913517714, 0.0014627373311668634, 0.04906829446554184, 0.022997774183750153, 0.04253986105322838, -0.015725545585155487, -0.022302938625216484, -0.033998988568782806, -0.022944606840610504, -0.049522049725055695, 0.029752425849437714, 0.02004867047071457, 0.019445214420557022, 0.005645174533128738, 0.0038322850596159697, 0.03129732236266136, -0.04576241225004196, -0.0105011947453022, -0.003935091197490692, 0.05508561432361603, 0.02103167399764061, 0.05779316648840904, 0.05005451291799545, 0.0061379787512123585, -0.011956165544688702, -0.01980493776500225, -0.01661764644086361, -0.037045590579509735, 0.03225719928741455, -0.02540004998445511, -0.017990391701459885, -0.0019320659339427948, -0.040873441845178604, 0.03026384860277176, 0.017793910577893257, 0.001990138553082943, 0.014923552982509136, -0.0003762022533919662, -0.008725561201572418, 0.049761585891246796, 0.015837667509913445, 0.025558723136782646, -0.0031164372339844704, -0.02335933782160282, 0.004389569163322449, 0.01566356047987938, -0.07104755938053131, 0.01661371812224388, 0.016291724517941475, -0.01504075899720192, -0.04354725778102875, -0.01414339430630207, 0.07585067301988602, -0.037893038243055344, 0.018656117841601372, -0.03890443220734596, 0.04521775245666504, 0.008459223434329033, 0.018929267302155495, 0.01907881163060665, -0.03997110575437546, -0.0028920467011630535, 0.013321629725396633, 0.05631614476442337, 0.020232800394296646, 0.016648409888148308, -0.007759463507682085, -0.04579615220427513, -0.011857886798679829, 0.009729031473398209, 0.016161855310201645, 0.0750219076871872, 0.004647139925509691, 0.019916480407118797, -0.014804997481405735, -0.0005736378370784223, -0.030889859423041344, -0.011912530288100243, 0.008403473533689976, -0.020876344293355942, 0.001375727355480194, -0.007997344247996807, 0.01191231794655323, -0.051706019788980484, 0.015593479387462139, -0.046324748545885086, 0.015375844202935696, -0.0078056612983345985, -0.013494309037923813, 0.003110666526481509, -0.013698744587600231, -0.0026610782369971275, 0.037293728440999985, 0.027087226510047913, 0.05405379459261894, 0.02111239545047283, -0.01605803146958351, -0.05820441246032715, -0.03259081393480301, 0.020831424742937088, -0.030298344790935516, 0.03857516869902611, -0.00030325943953357637, -0.0269165001809597, 0.009705821983516216, -0.015585328452289104, 0.11682388186454773, 0.027923503890633583, -0.03025895170867443, -0.048663392663002014, 0.008095605298876762, -0.005601381883025169, 0.023315325379371643, 0.018104782328009605, -0.014685363508760929, -0.07011476159095764, 0.04249902069568634, 0.07102453708648682, -0.0555499792098999, -0.03926446661353111, 0.006499174050986767, 0.007283972110599279, -0.0003719140659086406, -0.08797520399093628, 0.024941042065620422, -0.04599301889538765, -0.004197132308036089, 0.06872981041669846, -0.02343934215605259, -0.01729685813188553, -0.002059630351141095, 0.00826934352517128, -0.027453750371932983, -0.026752525940537453, 0.04703826829791069, -0.03503682091832161, -0.027866868302226067, 0.05891265720129013, -0.023628685623407364, -0.0197176244109869, 0.009049446322023869, 0.0017097856616601348, 0.029360346496105194, 0.05707566812634468]'

### Compressed CSV


In [105]:
import csv
import gzip

# assuming you have a list of Document objects called documents
# and assuming you have already populated the vector attribute of each Document object

# define the headers for your CSV file
headers = ['doc_no', 'doc_text', 'vector']

# open the CSV file in 'w' mode and write the headers
with open("embedding_saves/distilroberta.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(headers)

    # loop through each Document object and write its attributes to the CSV file
    for document in extracted_documents:
        writer.writerow([document.doc_no, document.doc_text, document.vector.tolist() if document.vector is not None else None])

# gzip the CSV file
with open("embedding_saves/distilroberta.csv", 'rb') as f_in, gzip.open("embedding_saves/distilroberta.csv.gz", 'wb') as f_out:
    f_out.writelines(f_in)



In [76]:
import csv
import gzip

# read the gzip file and create new Document objects
extracted_documents = []
with gzip.open("embedding_saves/distilroberta.csv.gz", mode='rb') as file:
    # read the uncompressed content of the gzip file
    uncompressed_content = file.read()

    # parse the uncompressed content as a CSV file
    csv_content = uncompressed_content.decode('utf-8')
    reader = csv.reader(csv_content.splitlines())

    # extract the header row
    headers = next(reader)

    # loop through each row and create a new Document object
    for row in reader:
        doc_no = row[0]
        doc_text = row[1]
        vector = np.array(row[2], dtype=float)
        document = Document(doc_no, doc_text, vector)
        extracted_documents.append(document)


In [77]:
extracted_documents[0]

IndexError: list index out of range