In [102]:
import pandas as pd
import os

# Directory holding the per-document OCR chunk CSVs (read as `{OCR_DIR}/OCR_<file name>`).
OCR_DIR = 'ocr-chunks'
# Directory holding the per-document embedding CSVs (read as `{EMBEDDING_DIR}/EMBED_OCR_<file name>`).
EMBEDDING_DIR = 'embeddings'
# CSV of document metadata; one row per document, iterated by the export code below.
VLR_METADATA_PATH = 'vlrs-6-13-24.csv'
# Value written to the `doc_type` field of every exported document record.
DOC_TYPE = 'Voluntary Local Review'
# Value written to the `aws_s3_bucket_name` field of every exported document record.
S3_BUCKET_NAME = 'voluntary-local-reviews'
# Output path of the document table written by `export_documents`.
DOCUMENT_TABLE_FILE = 'supabase-data/document.csv'
# Output path of the merged chunk + embedding table.
DATA_PG_VECTOR_STORE_FILE = 'supabase-data/data_pg_vector_store.csv'


def create_file_name(row: pd.Series, extension: str) -> str:
    """Build a document's file name from its metadata row.

    The name is the government, country, report, language and year fields
    joined with underscores, followed by ``extension``.

    Args:
        row: Metadata record providing the 'Local / Regional Government',
            'Country', 'Report(s)', 'Language' and 'Year' fields.
        extension: Suffix appended verbatim (e.g. '.pdf' or '.csv').

    Returns:
        The assembled file name.
    """
    text_fields = (
        'Local / Regional Government',
        'Country',
        'Report(s)',
        'Language',
    )
    # Only the year is converted with str(); the other fields are joined as-is.
    name_parts = [row[field] for field in text_fields]
    name_parts.append(str(row['Year']))
    return '_'.join(name_parts) + extension


def export_documents():
    """Write the document table CSV from the VLR metadata.

    Reads the metadata at ``VLR_METADATA_PATH``, builds one record per row
    and saves the result to ``DOCUMENT_TABLE_FILE`` without the DataFrame
    index.
    """

    def to_record(entry: pd.Series) -> dict:
        # Key order here determines the column order of the written CSV.
        return {
            'id': entry['UUID'],
            'year': entry['Year'],
            'doc_type': DOC_TYPE,
            'geography': entry['Local / Regional Government'] + ', ' + entry['Country'],
            'language': entry['Language'],
            'source_url': entry['URL'],
            'aws_s3_bucket_name': S3_BUCKET_NAME,
            # Object name and file name are the same generated PDF name.
            'aws_s3_object_name': create_file_name(entry, '.pdf'),
            'aws_s3_file_name': create_file_name(entry, '.pdf'),
        }

    metadata = pd.read_csv(VLR_METADATA_PATH)
    records = [to_record(entry) for _, entry in metadata.iterrows()]

    # Write to disk.
    pd.DataFrame(records).to_csv(DOCUMENT_TABLE_FILE, index=False)


# Build the vector-store table: for every document in the VLR metadata, join
# its OCR chunks with their embeddings and tag each chunk with the document id.
dfs = []
# (document UUID, reason) for every document that could not be processed.
skipped = []

metadata_df = pd.read_csv(VLR_METADATA_PATH)

for _, row in metadata_df.iterrows():
    # TODO: Remove error catching
    try:
        file_name = create_file_name(row, '.csv')

        chunks = pd.read_csv(f'{OCR_DIR}/OCR_{file_name}')

        # Modify to fit with supabase
        chunks['document_id'] = row['UUID']
        embeddings = pd.read_csv(f'{EMBEDDING_DIR}/EMBED_OCR_{file_name}')

        # Expose the chunk's row position as an 'index' column so it can be
        # matched against the 'index' column of the embeddings file.
        chunks = chunks.reset_index()
        dfs.append(chunks.merge(embeddings, on='index'))
    except Exception as e:
        # Still best-effort (one bad document must not abort the export), but
        # record the failure instead of discarding it silently.
        skipped.append((row.get('UUID'), f'{type(e).__name__}: {e}'))
        continue

if skipped:
    print(f'Skipped {len(skipped)} of {len(metadata_df)} documents:')
    for doc_id, reason in skipped:
        print(f'  {doc_id}: {reason}')

if not dfs:
    # pd.concat([]) would fail with an opaque "No objects to concatenate".
    raise ValueError(
        'No documents could be processed; nothing to write to '
        + DATA_PG_VECTOR_STORE_FILE
    )

# Write to disk.
df = pd.concat(dfs, axis=0).drop([
    'index',
    'file_path',
    'file_name'
], axis=1)
df.to_csv(DATA_PG_VECTOR_STORE_FILE, index=False)

In [103]:
# Display the merged chunk/embedding table that the cell above wrote to DATA_PG_VECTOR_STORE_FILE.
df

Unnamed: 0,text,page,chunk_num,document_id,embedding,embedding_model
0,The city of Accra is one of the first three ci...,2,0,2a6efd8a-a789-4e75-a74f-2e28ca81d71f,"[-0.009741212, 0.0060071796, 0.106254786, 0.03...",text-embedding-3-small
1,prevent the harmful effects of climate change ...,2,1,2a6efd8a-a789-4e75-a74f-2e28ca81d71f,"[0.041960187, -0.01523865, 0.11900444, 0.00449...",text-embedding-3-small
2,and progress towards achieving the SDGs and th...,2,2,2a6efd8a-a789-4e75-a74f-2e28ca81d71f,"[0.04147777, -0.025059938, 0.09582123, 0.00098...",text-embedding-3-small
3,"the vision to make Accra ""A Smart, Safe, Susta...",2,3,2a6efd8a-a789-4e75-a74f-2e28ca81d71f,"[0.06121284, 0.003966062, 0.062263295, -0.0130...",text-embedding-3-small
4,city's commitment to work in partnership with ...,2,4,2a6efd8a-a789-4e75-a74f-2e28ca81d71f,"[0.03851966, 0.021301301, 0.104896665, 0.00027...",text-embedding-3-small
...,...,...,...,...,...,...
436,response to service request) Description: The ...,103,436,afd5c6f4-e1d1-410e-a7e3-6b7686bbd1ed,"[0.0014117621, 0.020967219, 0.0349234, -0.0138...",text-embedding-3-small
437,Appendix: Goal 15 Indicator name: Average time...,104,437,afd5c6f4-e1d1-410e-a7e3-6b7686bbd1ed,"[0.0033689532, 0.07675242, 0.044218324, 0.0124...",text-embedding-3-small
438,"limbs, down tree limbs. A service request is c...",104,438,afd5c6f4-e1d1-410e-a7e3-6b7686bbd1ed,"[0.010064325, 0.026623923, 0.03661459, 0.00470...",text-embedding-3-small
439,total number of street and forestry trees plan...,104,439,afd5c6f4-e1d1-410e-a7e3-6b7686bbd1ed,"[0.004417236, -0.02759491, 0.056299403, 0.0370...",text-embedding-3-small


In [99]:
# NOTE(review): `embed_df` is not defined in any visible cell, and this cell ran
# as In [99], before the In [102] cell above - confirm it survives Restart & Run All.
# sample() is called without random_state, so the row shown changes between runs.
embed_df.sample()

Unnamed: 0,index,embedding,embedding_model,file_name,file_id
229,229,"[-0.008090429, -0.022952644, 0.045685515, 0.06...",text-embedding-3-small,OCR_City of Los Angeles_United States_2021 Vol...,City of Los Angeles_United States_2021 Volunta...


In [100]:
# NOTE(review): `two` is not defined in any visible cell, and this cell ran as
# In [100], before the In [102] cell above - presumably a scratch embeddings
# frame; confirm where it comes from and give it a descriptive name, or drop the cell.
two

Unnamed: 0,index,embedding,embedding_model,file_name
0,0,"[-0.009741212, 0.0060071796, 0.106254786, 0.03...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
1,1,"[0.041960187, -0.01523865, 0.11900444, 0.00449...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
2,2,"[0.04147777, -0.025059938, 0.09582123, 0.00098...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
3,3,"[0.06121284, 0.003966062, 0.062263295, -0.0130...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
4,4,"[0.03851966, 0.021301301, 0.104896665, 0.00027...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
...,...,...,...,...
323,323,"[0.022398287, 0.034503005, 0.10470776, 0.03335...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
324,324,"[0.04654649, 0.039405834, 0.10785033, 0.025256...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
325,325,"[0.058389492, -0.004446882, 0.071481556, 0.024...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
326,326,"[0.041920796, -0.005688397, 0.05289246, 0.0181...",text-embedding-3-small,OCR_Accra_Ghana_Voluntary Local Review_English...
