In [1]:
import json 
import os 
import dotenv 
from pinecone import Pinecone, ServerlessSpec
# from transformers import pipeline

# Load settings from a .env file so the os.environ lookups further down can find them.
dotenv.load_dotenv()

True

In [2]:
from transformers import pipeline
import numpy as np

In [3]:
# Load the university records from assets/univ_list.json.
# The context manager closes the file handle as soon as parsing is done,
# instead of leaving an open handle around for the garbage collector.
univ_list = os.path.join('assets', 'univ_list.json')
with open(univ_list, 'r') as univ_file:
    universities = json.load(univ_file)
# Show the first record to check the schema.
universities[0]

{'Rank': 1,
 'Institution Name': 'Massachusetts Institute of Technology (MIT) ',
 'Location': 'United States',
 'Size': 'M',
 'Academic Reputation': 100.0,
 'Employer Reputation': 100.0,
 'Faculty Student': 100.0,
 'Citations per Faculty': 100.0,
 'International Faculty': 99.3,
 'International Students': 86.8,
 'International Research Network': 96.0,
 'Employment Outcomes': 100.0,
 'Sustainability': 99.0,
 'Overall': 100,
 'review': 'Massachusetts Institute of Technology (MIT)  is a top-ranked university.\nLocated in United States, it offers a unique learning experience.\nAcademically, it has a strong reputation, ranking 1 globally.\nEmployers highly value its graduates, with a reputation score of 100.0.\nThe faculty-to-student ratio is 100.0, allowing for close mentorship.\nFaculty members are highly productive, with an average of 100.0 citations per year.\nThe university has a diverse community, with 99.3 international faculty members and 86.8 international students.\nIt has a strong

In [4]:
# Pinecone client, built from the API key in the environment.
# Every os.environ[...] lookup raises KeyError if the variable is missing.
pinecone_api_key = os.environ['PINECONE_API_KEY']
pc = Pinecone(api_key=pinecone_api_key)

# Index and embedding-model settings.
index_name = os.environ['PINECONE_INDEX_NAME']
embedding_model = os.environ['EMBEDDING_MODEL']
dimension = int(os.environ['EMBEDDING_DIMENSION'])

# Echo the non-secret settings, one per line.
print(index_name, embedding_model, dimension, sep='\n')

univ-indexer
sentence-transformers/all-MiniLM-L6-v2
384


In [5]:
# Number of review texts; a record without a 'review' key would raise KeyError here.
len([univ['review'] for univ in universities])

1503

In [6]:
import torch 
from tqdm import tqdm 
import numpy as np 

def embed_data(data, embed_pipeline, batch_size = 32, raise_on_error = False):
    """Embed texts in batches and mean-pool each result into a single vector.

    Parameters
    ----------
    data : list
        Texts to embed; sliced into consecutive chunks of ``batch_size``.
    embed_pipeline : callable
        Called once per chunk with a list of texts. It must return one entry
        per text, where ``entry[0]`` is a 2-D array-like that is averaged
        over axis 0 (assumed to be tokens x embedding size, as returned by a
        Hugging Face feature-extraction pipeline -- confirm for other callables).
    batch_size : int, optional
        Number of texts passed to ``embed_pipeline`` per call. Must be >= 1.
    raise_on_error : bool, optional
        When False (default) a failing batch is reported with ``print`` and
        skipped. When True the original exception is re-raised after the
        report, so a partial result can never be mistaken for a full one.

    Returns
    -------
    list
        One list of floats per successfully embedded text, in input order.
        If any batch was skipped the list is shorter than ``data`` and a
        ``UserWarning`` is emitted, because the result can then no longer be
        paired with ``data`` by position.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    import warnings  # local import keeps this cell self-contained

    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    embeddings = []
    skipped = 0
    for st_index in tqdm(range(0, len(data), batch_size), 'Embedding the reviews'):
        batch_data = data[st_index : st_index + batch_size]
        try:
            with torch.no_grad():
                embedding = embed_pipeline(batch_data)
            # Mean over axis 0 of each entry's first element -> one vector per text.
            embedding = [np.mean(e[0], axis = 0).tolist() for e in embedding]
            embeddings.extend(embedding)
        except Exception as err:
            # Report the real end of the batch; the last batch may be short.
            print(f'Error occured at the batch {st_index} - {st_index + len(batch_data)}')
            print(err)
            print()
            if raise_on_error:
                raise
            skipped += len(batch_data)
    if skipped:
        warnings.warn(
            f'{skipped} of {len(data)} inputs were skipped because their batch failed; '
            'the returned embeddings no longer line up one-to-one with `data`.'
        )
    return embeddings

In [7]:
# Feature-extraction pipeline for the model named by EMBEDDING_MODEL.
embedder = pipeline('feature-extraction', model=embedding_model)
# Embed the review text of every university, in the same order as `universities`.
data = [univ['review'] for univ in universities]
embeddings = embed_data(data, embedder)

Embedding the reviews: 100%|██████████| 47/47 [01:23<00:00,  1.77s/it]


In [8]:
# Should equal the number of reviews; embed_data leaves out any batch whose embedding raised.
len(embeddings)

1503

In [9]:
# Length of one vector; expected to match `dimension`, which is used to create the index.
len(embeddings[3])

384

In [10]:
# Create the serverless index only if it does not exist yet, so re-running
# the notebook does not fail on an index created by an earlier run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [11]:
# Handle to the index; used for the upsert and query calls further down.
uindex = pc.Index(index_name)
uindex

<pinecone.data.index.Index at 0x13505eaebd0>

In [12]:
# Sample record: each such dict is attached unchanged as the metadata of its vector.
universities[0]

{'Rank': 1,
 'Institution Name': 'Massachusetts Institute of Technology (MIT) ',
 'Location': 'United States',
 'Size': 'M',
 'Academic Reputation': 100.0,
 'Employer Reputation': 100.0,
 'Faculty Student': 100.0,
 'Citations per Faculty': 100.0,
 'International Faculty': 99.3,
 'International Students': 86.8,
 'International Research Network': 96.0,
 'Employment Outcomes': 100.0,
 'Sustainability': 99.0,
 'Overall': 100,
 'review': 'Massachusetts Institute of Technology (MIT)  is a top-ranked university.\nLocated in United States, it offers a unique learning experience.\nAcademically, it has a strong reputation, ranking 1 globally.\nEmployers highly value its graduates, with a reputation score of 100.0.\nThe faculty-to-student ratio is 100.0, allowing for close mentorship.\nFaculty members are highly productive, with an average of 100.0 citations per year.\nThe university has a diverse community, with 99.3 international faculty members and 86.8 international students.\nIt has a strong

In [13]:
import uuid

def create_vectors(data: list, embeddings : list) -> list:
    """Pair each record with its embedding and wrap both as an upsert-ready dict.

    Parameters
    ----------
    data : list
        Records to store as metadata, one per embedding. Each record is
        referenced as-is, not copied.
    embeddings : list
        Embedding vectors, positionally aligned with ``data``.

    Returns
    -------
    list
        One dict per record with the keys ``'id'`` (a fresh random UUID4
        string), ``'values'`` (the embedding) and ``'metadata'`` (the record).

    Raises
    ------
    ValueError
        If ``data`` and ``embeddings`` differ in length. Pairing them anyway
        would either fail half-way or silently drop the surplus embeddings.
    """
    if len(data) != len(embeddings):
        raise ValueError(
            'data and embeddings must have the same length, '
            f'got {len(data)} and {len(embeddings)}'
        )
    return [
        {
            'id'     : str(uuid.uuid4()),
            'values' : values,
            'metadata' : record,
        }
        for record, values in zip(data, embeddings)
    ]

In [14]:
# One upsert-ready dict per university: id, embedding values, and the full record as metadata.
vectors = create_vectors(universities, embeddings)

In [15]:
# Check the keys of one vector dict before upserting.
vectors[0].keys()

dict_keys(['id', 'values', 'metadata'])

In [16]:
# The metadata is the full university record, including the 'review' text.
vectors[0]['metadata']

{'Rank': 1,
 'Institution Name': 'Massachusetts Institute of Technology (MIT) ',
 'Location': 'United States',
 'Size': 'M',
 'Academic Reputation': 100.0,
 'Employer Reputation': 100.0,
 'Faculty Student': 100.0,
 'Citations per Faculty': 100.0,
 'International Faculty': 99.3,
 'International Students': 86.8,
 'International Research Network': 96.0,
 'Employment Outcomes': 100.0,
 'Sustainability': 99.0,
 'Overall': 100,
 'review': 'Massachusetts Institute of Technology (MIT)  is a top-ranked university.\nLocated in United States, it offers a unique learning experience.\nAcademically, it has a strong reputation, ranking 1 globally.\nEmployers highly value its graduates, with a reputation score of 100.0.\nThe faculty-to-student ratio is 100.0, allowing for close mentorship.\nFaculty members are highly productive, with an average of 100.0 citations per year.\nThe university has a diverse community, with 99.3 international faculty members and 86.8 international students.\nIt has a strong

In [17]:
def upsert_to_pinecone(vectors, indexer, batch_size = 32):
    """Send ``vectors`` to ``indexer`` in consecutive chunks of ``batch_size``.

    Each chunk is passed to ``indexer.upsert(vectors=...)``; progress is shown
    with tqdm and 'Done ...' is printed after the last chunk. Returns None.
    """
    batch_starts = range(0, len(vectors), batch_size)
    for start in tqdm(batch_starts, 'Upserting vectors to pinecone'):
        stop = start + batch_size
        indexer.upsert(vectors = vectors[start:stop])
    print('Done ...')

In [18]:
# Upsert in batches of 100 instead of the function's default of 32.
upsert_to_pinecone(vectors, uindex, 100)

Upserting vectors to pinecone: 100%|██████████| 16/16 [00:12<00:00,  1.25it/s]

Done ...





#### Querying the index

In [39]:
# NOTE(review): this builds the pipeline again with the same arguments as in the indexing
# section; reuse that `embedder` if the model does not need to be loaded twice.
embedder = pipeline('feature-extraction', model=embedding_model)
user_query = 'get me a top ranked university located in london'
embedding = embedder(user_query) # nested list; embedding[0] is presumably (tokens, embedding size) - TODO confirm
vec = np.mean(embedding[0], axis = 0).tolist() # average over axis 0 -> shape : (embedding size)

In [40]:
# Ask the index for the top 3 matches to the query vector, with their stored metadata;
# only the 'matches' entry of the response is kept.
response = uindex.query(
    vector=vec, 
    top_k=3,
    include_metadata=True
)['matches']

In [41]:
# Institution names of the matches, in the order the query returned them.
[res['metadata']['Institution Name'] for res in response]

['London Metropolitan University',
 'City, University of London',
 'University of East London']