# <span style="color:blue"> Building Knowledge Base</span>

In [None]:
!pip install pinecone-client
!pip install python-dotenv
!pip install gensim nltk
!pip install PyPDF2
import os

import gensim
import nltk
import numpy as np
from dotenv import load_dotenv
from gensim.models import Word2Vec
#import json
from pinecone import Pinecone, PodSpec
from PyPDF2 import PdfReader

nltk.download('punkt')

In [None]:
# NOTE(review): this cell cannot run as written on a fresh kernel:
#   - `pinecone` is never bound by the visible imports (only the `Pinecone`
#     class is imported),
#   - `index_name` is first assigned in a later cell,
#   - `res` is first assigned in a later cell, and there it holds the result
#     of generate_embedding(...), not a mapping with a 'data' key.
# It appears to be left over from an older client API. The later connection
# cell already creates the index via `pc.create_index(...)`, so this cell is
# presumably safe to delete — confirm before removing.
# check if index already exists (it shouldn't if this is first time)
if index_name not in pinecone.list_indexes():
    # if does not exist, create index
    pinecone.create_index(
        index_name,
        dimension=len(res['data'][0]['embedding']), #actually, it is 1536
        metric='cosine',
        # Original author's note: the 'indexed' key asks for the `cstu_id`
        # metadata field to be indexed so that lookups on it are faster.
        metadata_config={'indexed': ['cstu_id']}
    )

In [None]:
# Tokenize your text
#def tokenize_text(text):
#    return nltk.word_tokenize(text)

# Train a Word2Vec model
def train_word2vec_model(tokens, vector_size=1536, min_count=1):
    """Train a gensim Word2Vec model on a tokenized corpus.

    Args:
        tokens: Iterable of sentences, each sentence being a list of string
            tokens.
        vector_size: Dimensionality of the learned word vectors. Defaults to
            1536, the value this function previously hard-coded.
        min_count: Words occurring fewer than this many times are ignored.
            The default of 1 keeps every word.

    Returns:
        The trained Word2Vec model.
    """
    return Word2Vec(tokens, vector_size=vector_size, min_count=min_count)

# Generate an embedding for a text
def generate_embedding(model, text):
    """Embed `text` as the mean of the Word2Vec vectors of its known tokens.

    The text is split with nltk.word_tokenize. Tokens that are not present in
    `model.wv` are skipped. If no token is known to the model, a zero vector
    of length `model.vector_size` is returned instead.
    """
    known_vectors = []
    for word in nltk.word_tokenize(text):
        if word in model.wv:
            known_vectors.append(model.wv[word])

    if len(known_vectors) == 0:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# Load variables from a `.env` file so the API key never has to appear in the
# notebook itself.
#dotenv_path = r"D:\.env"  # Specify the path to the .env file
env = load_dotenv() # Copy .env file to the same directory before running

# SECURITY: the API key used to be hard-coded on this line. It is now read
# from the environment; the previously committed key should be revoked and
# rotated in the Pinecone console.
pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or export it "
        "before running this cell."
    )

# Environment the index lives in; reused for the client and the pod spec.
pinecone_environment = "us-west1-gcp-free"

# initialize connection to pinecone (get API key at app.pinecone.io)
pc = Pinecone(
    api_key=pinecone_api_key,
    environment=pinecone_environment)

# Specify the name of your index
index_name = "cstugpt-kb"
if index_name not in pc.list_indexes().names():
    # `dimension` must equal the embedding length produced by
    # train_word2vec_model (1536). The v3 client requires an explicit `spec`
    # describing where the index is hosted.
    pc.create_index(
        name=index_name,
        metric="cosine",
        dimension=1536,
        spec=PodSpec(environment=pinecone_environment),
    )

index = pc.Index(index_name) # connect to pinecone index

Collecting pinecone-client
  Downloading pinecone_client-3.0.2-py3-none-any.whl (201 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/201.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m201.4/201.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pinecone-client
Successfully installed pinecone-client-3.0.2
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def mls_upsert(cstu_file, index_name, name_space, cstu_id, chunk_size, stride, dry_run=True):
    """Chunk a PDF knowledge base, embed each chunk and optionally upsert it.

    The PDF is read page by page and split into lines. A Word2Vec model is
    trained on those lines and saved to "CSTU-embedding-model.mdl". The lines
    are then grouped into overlapping chunks; each chunk is embedded with
    generate_embedding and, unless `dry_run` is set, written to Pinecone.

    Relies on the notebook globals `pc`, `train_word2vec_model` and
    `generate_embedding`.

    Args:
        cstu_file: Path of the PDF file to ingest.
        index_name: Name of the Pinecone index to connect to.
        name_space: Pinecone namespace the vectors are written to.
        cstu_id: Prefix for vector ids; ids are "<cstu_id>_<n>" with n
            counting chunks from 0.
        chunk_size: Number of new lines each chunk advances by (>= 1).
        stride: Number of lines each chunk re-uses from the end of the
            previous chunk (>= 0).
        dry_run: When True (the default, matching the previous behaviour in
            which the upsert call was commented out) nothing is written to
            Pinecone. Pass False to actually upsert the vectors.

    Raises:
        ValueError: If `chunk_size` < 1, `stride` < 0, or the PDF yields no
            text to train on.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    if stride < 0:
        raise ValueError("stride must be non-negative")

    # create a reader object
    print("Knowledge base file name:", cstu_file)
    reader = PdfReader(cstu_file)
    page_len = len(reader.pages)
    print("length of the knowledge base file:", page_len)
    page_texts = []
    for i in range(page_len):
        # `or ""` guards against a page for which no text could be extracted.
        page_texts.append(reader.pages[i].extract_text() or "")
        print("page completed:", i)
    doc = "".join(page_texts).splitlines()

    # Train Word2Vec model on entire corpus.
    # Tokenize with nltk directly: the `tokenize_text` helper is commented
    # out in this notebook, so calling it raised NameError on a fresh kernel.
    tokens = [nltk.word_tokenize(text) for text in doc]
    if not any(tokens):
        raise ValueError("No text could be extracted from the knowledge base file")
    model = train_word2vec_model(tokens)
    model.save("CSTU-embedding-model.mdl")

    #Connect to index
    index = pc.Index(index_name)
    count = 0
    for i in range(0, len(doc), chunk_size):#The loop iterates over the document in steps of chunk_size
        # Each chunk spans [i - stride, i + chunk_size): it starts `stride`
        # lines early to overlap the previous chunk and always ends at the
        # next chunk boundary. The end used to be derived from the shifted
        # start, which lost the overlap after the second chunk and could
        # drop up to `stride` lines at the end of the document.
        i_begin = max(0, i - stride)
        i_end = min(len(doc), i + chunk_size)
        doc_chunk = doc[i_begin:i_end]
        print("-------------------------------------------------------------")
        print("The ", i//chunk_size + 1, " doc chunk text:", doc_chunk)
        texts = "".join(doc_chunk)
        print("Texts:", texts)

        #Create embeddings of the chunk texts
        embed = generate_embedding(model, texts)
        print("Embeds length:", len(embed))

        # Meta data preparation
        metadata = {
            "cstu_id": cstu_id + '_' + str(count),
            "text": texts
        }
        count += 1

        #upsert to pinecone and corresponding namespace
        if not dry_run:
            # Pinecone expects plain Python floats, not a numpy array.
            index.upsert(
                vectors=[(metadata["cstu_id"], embed.tolist(), metadata)],
                namespace=name_space,
            )
        print("Upserted vector count is: ", count)
        print("==========================================================")

# Ingest the CSTU knowledge-base PDF: 8-line chunks with a stride of 1.
mls_upsert(
    cstu_file=r"cstugpt_kb.pdf",
    index_name="cstugpt-kb",
    name_space="cstu",
    cstu_id="cstu-kb",
    chunk_size=8,
    stride=1,
)


Knowledge base file name: cstugpt_kb.pdf
length of the knowledge base file: 10
page completed: 0
page completed: 1
page completed: 2
page completed: 3
page completed: 4
page completed: 5
page completed: 6
page completed: 7
page completed: 8
page completed: 9
-------------------------------------------------------------
The  1  doc chunk text: ['Overview of  California Science And Technology University  (CSTU) : ', 'Contact Email: admission@cstu.edu ; Contact Office: (408) 400 -3948 ; ', 'Address: 1601 McCarthy Boulevard, Milpitas, CA 95035 ; ', 'The president of CSTU : Prof. Glen Qin.  ', 'CSTU is offering  05 programs:  Bachelor of Science in Computer Systems and Engineering, ', 'Bachelor of Science in Business Administration, Master of Science in Computer Systems and ', 'Engineering , Master of Business Administration and Emerging Technology Training Program .  ', 'The available courses  for regist ration  in Spring  2024 includes: 1) Generative Artificial ']
Texts: Overview of  Cali

In [None]:
from joblib import load  # no longer used in this cell; left in place in case other cells rely on `load`

# The model file is written with gensim's `model.save(...)` in mls_upsert, so
# read it back with the matching gensim loader instead of joblib.
embedding_model = Word2Vec.load('CSTU-embedding-model.mdl')
res = generate_embedding(embedding_model, 'hi ')
print(res)

nan


In [None]:
# Remove every vector stored under the "cstu" namespace, then display the
# index statistics.
index.delete(namespace="cstu", delete_all=True)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Display the current statistics of the connected index.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'cstu': {'vector_count': 59},
                'namespace-cstu-kb': {'vector_count': 48},
                'namespace-cstu-qa': {'vector_count': 11}},
 'total_vector_count': 118}