In [1]:
import pinecone
import openai
import numpy as np
import os
from dotenv import load_dotenv

# Langchain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.document_loaders import TextLoader

# Load variables from the project's .env file into the process environment.
load_dotenv('./Sn33k/.env')

# Read the credentials and settings the rest of the notebook relies on.
# os.getenv returns None for any variable that is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")

# Fail fast with an actionable message: without this guard a missing key only
# surfaces as a bare TypeError from the os.environ assignment below, because
# os.environ values must be strings.
if not OPENAI_API_KEY:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; check that './Sn33k/.env' exists "
        "and defines it."
    )

# Hand the key to the openai module directly and re-export it through the
# environment for code that reads OPENAI_API_KEY from there.
openai.api_key = OPENAI_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

  from tqdm.autonotebook import tqdm


In [2]:

# Connect the Pinecone client using the credentials read from the environment
# in the first cell.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENVIRONMENT,  # next to api key in console
)

# NOTE(review): the index name is hard-coded here, while the first cell also
# reads INDEX_NAME from PINECONE_INDEX_NAME and no visible cell uses it —
# confirm which of the two is meant to be authoritative.
index_name = "cloudhacks-summy"

# Embedding model handed to the LangChain vector store wrapper.
embeddings = OpenAIEmbeddings(model='text-embedding-ada-002')
# if you already have an index, you can load it like this
docsearch = Pinecone.from_existing_index(index_name, embeddings)

# Describe the configuration of this one index (not all indexes)
index_description = pinecone.describe_index(index_name)
print('index_description: ', index_description)

# Open a handle to the index and print its statistics.
index = pinecone.Index(index_name) 
index_stats_response = index.describe_index_stats()
print('index_stats_response: ', index_stats_response)

index_description:  IndexDescription(name='cloudhacks-summy', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
index_stats_response:  {'dimension': 1536,
 'index_fullness': 0.00032,
 'namespaces': {'': {'vector_count': 32}},
 'total_vector_count': 32}


In [3]:
# `Pinecone` (the LangChain vector store class) is already imported in the
# first cell, so it is not imported again here.

# Name passed to the LangChain wrapper as its third argument; the query
# results printed later read the same 'text' field from each match's metadata.
text_field = "text"

# Handle to the same index that was opened by name in the previous cell.
index = pinecone.Index(index_name)

# Wrap the raw index for LangChain, using the embeddings object's
# embed_query method to embed queries.
vectorstore = Pinecone(
    index, embeddings.embed_query, text_field
)

In [4]:
# Embedding model name, passed as `engine` to openai.Embedding.create in the
# next cell. The same literal is also passed to OpenAIEmbeddings in the
# Pinecone setup cell; keep the two in sync.
MODEL = "text-embedding-ada-002"

In [5]:
# Free-text question to look up in the index.
query = "maskocr explanation"

# Embed the query with MODEL; the response's first data item carries the
# embedding as a list of floats.
xq = openai.Embedding.create(input=query, engine=MODEL)["data"][0]["embedding"]

# A bare `len(xq)` in the middle of a cell is evaluated and thrown away, so it
# showed nothing. Check the length instead: the index stats printed in the
# Pinecone setup cell report dimension 1536, and the query vector must match.
assert len(xq) == 1536, (
    f"unexpected embedding length {len(xq)}; the index dimension is 1536"
)
xq

[-0.01494944654405117,
 -0.016076289117336273,
 -0.0011352939764037728,
 -0.03272351250052452,
 -0.0055703590624034405,
 0.034526459872722626,
 -0.012320146895945072,
 -0.035668328404426575,
 -0.030620072036981583,
 0.011103156954050064,
 0.03086046501994133,
 -0.008030632510781288,
 -0.017353378236293793,
 -0.004650104325264692,
 -0.025331424549221992,
 0.0022837345022708178,
 0.02662353590130806,
 -0.015039593912661076,
 -0.00934528186917305,
 -0.029373032972216606,
 0.007050279062241316,
 0.021259766072034836,
 -0.03254321590065956,
 -0.025812210515141487,
 -0.013815090991556644,
 0.023648671805858612,
 -0.0033486008178442717,
 -0.029568351805210114,
 -0.01951691508293152,
 -0.02207109145820141,
 0.00038124845013953745,
 -0.0015249938005581498,
 -0.03341464325785637,
 -0.010817689821124077,
 -0.04834906384348869,
 0.01176423765718937,
 0.011659066192805767,
 -0.01435597613453865,
 -0.006963887717574835,
 0.001923144911415875,
 -0.019411742687225342,
 -0.010922862216830254,
 0.009022

In [6]:
# Query the Pinecone index with the query embedding, requesting top_k=5
# matches together with their metadata (the next cell reads
# match['metadata']['text'] from the result).
# NOTE(review): the embedding is passed wrapped in a list as the first
# positional argument — confirm against the installed pinecone-client's
# Index.query signature that this is the intended form.
res = index.query([xq], top_k=5, include_metadata=True)

In [7]:
# Print one entry per match: the score rounded to two decimals, a colon,
# then the text stored in the match's metadata.
for match in res['matches']:
    score, passage = match['score'], match['metadata']['text']
    print(f"{score:.2f}: {passage}")

0.78: MaskOCR,pretrainsboththeencoderandthedecoderwiththemaskingstrategyinasequential
manner. Wefollowtheself-supervisedpretrainingframeworkandadoptamaskedimagemodeling
∗CorrespondingAuthor. approach to pretrain the encoder for semantic patch representation learning. We divide the text
imageintoasetofverticalpatches,andrandomlymasksomepatchesthatmaycontainapartof
somecharacter,orsomewholecharacters. Wepredicttherepresentationsofthemaskedpatches
fromthevisiblepatchesintherepresentationspacelearnedfromtheencoder,andmapthepredicted
representationstothemaskedpatchimages.
Wepretrainthedecoderinasupervisedmannerwiththemaskingstrategyforlanguagemodeling
oversynthesizedtextimages. Wefixthepretrainedencoderandonlyupdatethedecoder,sothatthis
pretrainingtaskexploresthelanguageruleandtheencoderisnotaffectedbythesynthesizedtext
imagestylethatmightbedifferentfromthedownstreamtasks.
We validate the effectiveness of the proposed MaskOCR approach on the benchmark datasets,
including Chinese and English