In [103]:
from sentence_transformers import SentenceTransformer
import os, glob
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string as st
import pickle

In [77]:
# Directory that holds one plain-text article per file. The trailing separator
# is kept so that prefix-style concatenation onto this path keeps working.
PATH_TO_DOCUMENTS = './/data//plaintext_articles//'

# Index of article files, one row per file name. The listing reuses the
# constant above (instead of repeating the literal path), is sorted because
# os.listdir returns entries in arbitrary order, and skips anything that is not
# a regular file so that sub-directories never reach the file loader.
document_mapper = pd.DataFrame(
    sorted(
        entry
        for entry in os.listdir(PATH_TO_DOCUMENTS)
        if os.path.isfile(os.path.join(PATH_TO_DOCUMENTS, entry))
    ),
    columns=['file_name'],
)

In [78]:
def load_file(filename: str) -> str:
    """Return the full text of one article file decoded as UTF-8.

    Args:
        filename: Name of a file inside ``PATH_TO_DOCUMENTS`` (a bare file
            name, not a full path).

    Returns:
        The entire contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # os.path.join only inserts a separator when the directory prefix lacks
    # one, so the lookup no longer depends on PATH_TO_DOCUMENTS ending in "/".
    with open(os.path.join(PATH_TO_DOCUMENTS, filename), "r", encoding="utf-8") as f:
        return f.read()
    
# Read every indexed file and keep its raw text next to the file name.
document_mapper["docs"] = document_mapper["file_name"].map(load_file)

In [79]:
# Preview the first five rows (head() defaults to n=5).
document_mapper.head()

Unnamed: 0,file_name,docs
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt,#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...
1,%C3%85land.txt,#copyright\n\nÅland\n\n2007 Schools Wikiped...
2,%C3%89douard_Manet.txt,#copyright\n\nÉdouard Manet\n\n2007 Schools...
3,%C3%89ire.txt,#copyright\n\nÉire\n\n2007 Schools Wikipedi...
4,%C3%93engus_I_of_the_Picts.txt,#copyright\n\nÓengus I of the Picts\n\n2007...


In [118]:
# Translation table that deletes every ASCII punctuation character. It is the
# same for every document, so it is built once instead of once per call.
_PUNCTUATION_TABLE = str.maketrans('', '', st.punctuation)

# Filled on first use so the stop-word corpus is read once, not per document.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(document: str) -> str:
    """Normalise a document into a space-separated string of content tokens.

    The text is tokenised with ``word_tokenize``; each token is lower-cased and
    has ASCII punctuation characters removed. Tokens that end up empty (they
    consisted only of punctuation) and English stop words are dropped. Tokens
    containing digits are kept; no alphabetic-only filter is applied.

    Args:
        document: Raw text of one document.

    Returns:
        The surviving tokens joined by single spaces, with no leading,
        trailing or repeated spaces. Empty if nothing survives.
    """
    stop_words = _english_stop_words()
    words = []
    for token in word_tokenize(document):
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # `word` is empty when the token was pure punctuation (e.g. "#", ",");
        # keeping it would inject stray spaces into the joined result.
        if word and word not in stop_words:
            words.append(word)
    return " ".join(words)

In [122]:
# Normalise every raw document and keep the result alongside the original.
document_mapper['clean_text'] = document_mapper['docs'].map(clean_text)

In [124]:
# Preview the first five rows, now including the cleaned text (head() defaults to n=5).
document_mapper.head()

Unnamed: 0,file_name,docs,clean_text
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt,#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...,copyright áedán mac gabráin 2007 schools wiki...
1,%C3%85land.txt,#copyright\n\nÅland\n\n2007 Schools Wikiped...,copyright åland 2007 schools wikipedia select...
2,%C3%89douard_Manet.txt,#copyright\n\nÉdouard Manet\n\n2007 Schools...,copyright édouard manet 2007 schools wikipedi...
3,%C3%89ire.txt,#copyright\n\nÉire\n\n2007 Schools Wikipedi...,copyright éire 2007 schools wikipedia selecti...
4,%C3%93engus_I_of_the_Picts.txt,#copyright\n\nÓengus I of the Picts\n\n2007...,copyright óengus picts 2007 schools wikipedia...


In [149]:
# Sentence-embedding model and the token window it is configured with.
# NOTE(review): presumably inputs longer than the window are truncated by the
# model, so only the start of long articles is embedded — confirm.
ENCODING_MODEL_NAME = 'all-MiniLM-L12-v2'
ENCODING_MAX_SEQ_LENGTH = 512

encoding_model = SentenceTransformer(ENCODING_MODEL_NAME)
encoding_model.max_seq_length = ENCODING_MAX_SEQ_LENGTH
print(f"Max Sequence Length: {encoding_model.max_seq_length}")

Max Sequence Length: 512


In [152]:
# Encode every cleaned document in one call, showing a progress bar.
clean_documents = document_mapper['clean_text'].tolist()
embeddings = encoding_model.encode(clean_documents, show_progress_bar=True)

Batches:   0%|          | 0/144 [00:00<?, ?it/s]

In [153]:
# Persist the cleaned texts together with their embeddings so they can be
# reloaded later without re-encoding. `pickle` is already imported in the
# notebook's import cell, so it is not re-imported here.
embedding_cache = {
    'sentences': document_mapper['clean_text'].tolist(),
    'embeddings': embeddings,
}
with open('embeddings.pkl', "wb") as fOut:
    pickle.dump(embedding_cache, fOut, protocol=pickle.HIGHEST_PROTOCOL)

In [154]:
# Optional, currently disabled: reload the sentences and embeddings stored in
# 'embeddings.pkl' instead of re-encoding the corpus. Unpickling executes
# arbitrary code, so only load a file you created yourself.
#with open('embeddings.pkl', "rb") as fIn:
#    stored_data = pickle.load(fIn)
#    stored_sentences = stored_data['sentences']
#    stored_embeddings = stored_data['embeddings']

In [161]:
# Store each document's embedding as a plain Python list in its own row.
embedding_rows = embeddings.tolist()
document_mapper['embeddings'] = embedding_rows

In [175]:
# Preview the first five rows, now including the embeddings (head() defaults to n=5).
document_mapper.head()

Unnamed: 0,file_name,docs,clean_text,embeddings
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in.txt,#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...,copyright áedán mac gabráin 2007 schools wiki...,"[0.04089818894863129, -0.00878914725035429, -0..."
1,%C3%85land.txt,#copyright\n\nÅland\n\n2007 Schools Wikiped...,copyright åland 2007 schools wikipedia select...,"[0.060515306890010834, -0.026957755908370018, ..."
2,%C3%89douard_Manet.txt,#copyright\n\nÉdouard Manet\n\n2007 Schools...,copyright édouard manet 2007 schools wikipedi...,"[0.08456774055957794, 0.04743683338165283, 0.0..."
3,%C3%89ire.txt,#copyright\n\nÉire\n\n2007 Schools Wikipedi...,copyright éire 2007 schools wikipedia selecti...,"[0.06309432536363602, -0.012940289452672005, 0..."
4,%C3%93engus_I_of_the_Picts.txt,#copyright\n\nÓengus I of the Picts\n\n2007...,copyright óengus picts 2007 schools wikipedia...,"[0.02952096424996853, 0.0008616832201369107, 0..."


In [None]:
# Persist the full frame (file names, raw text, cleaned text, embeddings).
pd.to_pickle(document_mapper, 'df_embeddings.pkl')