In [9]:
import numpy as np
import openai
from retrying import retry
import pandas as pd
import pinecone
import itertools
from dotenv import load_dotenv
import os

  from tqdm.autonotebook import tqdm


In [2]:
# Read the club table and discard the "Unnamed: 0" column that the CSV carries
# alongside the real club fields.
df = pd.read_csv("UCONN_CLUB_INFO.csv").drop(columns="Unnamed: 0")
df.head()

Unnamed: 0,Club Names,Club Short Descriptions,Club Long Descriptions,Club Urls
0,4H Vernon Afterschool,Are you ready to make a difference in a child'...,4-H Vernon After School Enrichment Club throug...,https://uconntact.uconn.edu/organization/co4hv...
1,A Completely Different Note,"A Completely Different Note (CDN), UCONN's old...",,https://uconntact.uconn.edu/organization/acomp...
2,A.P. Christian Youth - Avery Point Campus,We are God believing Youth who want to spread ...,Bible Studies\nSocial Gatherings\nPrayer Meeti...,https://uconntact.uconn.edu/organization/apchr...
3,Academic Achievement Center,The Academic Achievement Center assists studen...,,https://uconntact.uconn.edu/organization/aac
4,Accounting Society,The Accounting Society has weekly meetings wit...,The Accounting Society works very closely with...,https://uconntact.uconn.edu/organization/accou...


In [11]:
# Load variables from a local .env file into the process environment
# (python-dotenv). PINECONE_API_KEY is read from the environment further down.
load_dotenv()

True

In [6]:
# Retry policy for OpenAI embedding requests.
max_retries = 5      # maximum number of attempts per request
initial_backoff = 1  # exponential backoff multiplier, in seconds


@retry(
    stop_max_attempt_number=max_retries,
    # `retrying` takes its wait settings in milliseconds.
    wait_exponential_multiplier=initial_backoff * 1000,
    wait_exponential_max=10000,
)
def make_embedding_request(input_text):
    """Request a "text-embedding-ada-002" embedding for ``input_text``.

    The call is retried with exponential backoff according to the ``@retry``
    policy above.

    Args:
        input_text: The text to embed.

    Returns:
        The ``embedding`` field of the first entry in the response's ``data``
        list.
    """
    response = openai.Embedding.create(
        input=input_text,
        model="text-embedding-ada-002"
    )
    return response['data'][0]['embedding']

# Embedding lists. `nameembds`, `shortembds` and `allclubsembds` get one entry
# per successfully embedded club; `descembds` only gets an entry for clubs that
# have a long description.
allclubsembds = []
shortembds = []
descembds = []
nameembds = []
# Row labels whose embedding requests failed; nothing is appended for them.
failed_rows = []

for i in range(len(df)):
    row = df.loc[i]
    cn = row["Club Names"]
    ct = row["Club Short Descriptions"]
    cd = row["Club Long Descriptions"]
    # Rows without a long description hold a non-str value in this column.
    has_long_desc = isinstance(cd, str)

    # Fetch every embedding for the row BEFORE appending anything, so a failure
    # part-way through cannot leave the lists with different lengths.
    try:
        cn_embedding = make_embedding_request(cn)
        ct_embedding = make_embedding_request(ct)
        cd_embedding = make_embedding_request(cd) if has_long_desc else None
    except openai.error.APIError as e:
        print("API Error occurred:", e)
        failed_rows.append(i)
        continue

    # Weighted blend of the per-field embeddings. Without a long description
    # its 0.5 weight is folded into the short description (0.35 -> 0.85).
    if has_long_desc:
        combined = (
            (0.15 * np.array(cn_embedding))
            + (0.35 * np.array(ct_embedding))
            + (0.5 * np.array(cd_embedding))
        )
        descembds.append(cd_embedding)
    else:
        combined = (0.15 * np.array(cn_embedding)) + (0.85 * np.array(ct_embedding))

    nameembds.append(cn_embedding)
    shortembds.append(ct_embedding)
    allclubsembds.append(list(combined))

if failed_rows:
    # Later cells index `allclubsembds` by df row position, so make the gap loud.
    print(
        f"Embedding failed for {len(failed_rows)} row(s): {failed_rows}. "
        "allclubsembds no longer lines up with df rows by position."
    )

In [7]:
# Persist each embedding list to its own .npy file.
for filename, embeddings in [
    ("club_short_desc_embeds.npy", shortembds),
    ("club_titles_embeds.npy", nameembds),
    ("club_long_desc_embeds.npy", descembds),
    ("allclubsembds.npy", allclubsembds),
]:
    np.save(filename, np.array(embeddings))

In [22]:
# Configure the Pinecone client; the API key comes from the environment.
pinecone.init(
    environment='asia-southeast1-gcp-free',
    api_key=os.getenv("PINECONE_API_KEY"),
)

In [23]:
# Handle to the "clubs-index" Pinecone index used for the upsert, query and
# delete calls below.
index = pinecone.Index("clubs-index")

In [24]:
# Build the (id, vector, metadata) tuples that are upserted into Pinecone:
# the club URL is the id and the combined embedding is the vector.
#
# Embeddings are paired with df rows purely by position, so the two must be
# the same length; otherwise clubs would be stored with another club's vector.
if len(allclubsembds) != len(df):
    raise ValueError(
        f"Have {len(allclubsembds)} embeddings for {len(df)} clubs; "
        "cannot pair embeddings with rows by position."
    )

vecs = []
for i in range(len(df)):
    row = df.loc[i]  # look the row up once instead of once per field
    long_desc = row["Club Long Descriptions"]
    mtdta = {
        # Non-str values (no long description) are stored as an empty string.
        "Club Long Description": long_desc if isinstance(long_desc, str) else "",
        "Club Short Description": row["Club Short Descriptions"],
        "Club Name": row["Club Names"],
    }
    vecs.append((row["Club Urls"], allclubsembds[i], mtdta))
    

In [25]:
def chunks(iterable, batch_size=100):
    """Yield successive tuples of up to ``batch_size`` items from ``iterable``.

    The final tuple may be shorter than ``batch_size``. Nothing is yielded for
    an empty iterable.
    """
    iterator = iter(iterable)
    while True:
        batch = tuple(itertools.islice(iterator, batch_size))
        if not batch:
            return
        yield batch

# Upload the vectors to the "uconn-clubs" namespace in batches of 100.
for batch in chunks(vecs, batch_size=100):
    index.upsert(namespace="uconn-clubs", vectors=batch)

In [49]:
# Embed the search text with the same model that was used for the clubs.
response = openai.Embedding.create(
    model="text-embedding-ada-002",
    input="computer science",
)
query_embedding = response['data'][0]['embedding']

# Fetch the 10 nearest club vectors, including their values and metadata.
query_response = index.query(
    vector=query_embedding,
    top_k=10,
    namespace='uconn-clubs',
    include_metadata=True,
    include_values=True,
)
print(type(query_response))

<class 'pinecone.core.client.model.query_response.QueryResponse'>


In [50]:
# Print each match's id (the club URL used as the vector id) and the club name
# stored in its metadata, followed by two blank lines.
for m in query_response["matches"]:
    print("Club URL: ", m["id"])
    print("Club Name: ", m["metadata"]["Club Name"])
    print()
    print()

Course:  https://uconntact.uconn.edu/organization/datascience
Description:  Data Science, UConn


Course:  https://uconntact.uconn.edu/organization/wics
Description:  Women in Computer Science


Course:  https://uconntact.uconn.edu/organization/msbapm_
Description:  MS in Business Analytics and Project Management


Course:  https://uconntact.uconn.edu/organization/ieee
Description:  Electrical and Electronics Engineers, The Institute of (IEEE)


Course:  https://uconntact.uconn.edu/organization/pcbuildingclub
Description:  PC Building Club


Course:  https://uconntact.uconn.edu/organization/createcommunicate
Description:  Create and Communicate


Course:  https://uconntact.uconn.edu/organization/uconnaiclub
Description:  Artificial Intelligence Club, UConn


Course:  https://uconntact.uconn.edu/organization/uconncyber
Description:  Cyber, UConn


Course:  https://uconntact.uconn.edu/organization/csd
Description:  Center for Students with Disabilities


Course:  https://uconntact.uconn.

In [None]:
# Inspect the 10th match (index 9) of the top_k=10 query result.
query_response["matches"][9]

In [12]:
# Reconnect to Pinecone and delete one club's vector (ids are club URLs) from
# the "uconn-clubs" namespace.
pinecone.init(
    environment='asia-southeast1-gcp-free',
    api_key=os.getenv("PINECONE_API_KEY"),
)
index = pinecone.Index("clubs-index")
ids_to_delete = ['https://uconntact.uconn.edu/organization/soccerclubuconnmens']
delete_response = index.delete(namespace='uconn-clubs', ids=ids_to_delete)