In [1]:
import pandas as pd
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv, find_dotenv
import pinecone
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm


In [2]:
# Load the course catalogue into a DataFrame.
# "ANSI" is only registered as a codec alias on Windows (where it means the
# machine's active code page) and raises LookupError on other platforms, so the
# Western-European Windows code page is named explicitly to keep the notebook
# portable.
# NOTE(review): assumes the CSV was saved with code page 1252 — confirm if any
# non-ASCII characters in the descriptions look wrong after loading.
files = pd.read_csv("course_descriptions.csv", encoding="cp1252")

In [6]:
def create_course_description(row):
    """Build a two-line text summary of a single course record.

    ``row`` is any object indexable by the keys ``course_name``,
    ``course_slug``, ``course_technology`` and ``course_topic``.  The returned
    string has a line break followed by twelve spaces between the slug clause
    and the technology clause.
    """
    name = row["course_name"]
    slug = row["course_slug"]
    technology = row["course_technology"]
    topic = row["course_topic"]

    first_line = f"The course name is {name}, the slug is {slug},"
    # The continuation line starts with twelve spaces of indentation.
    second_line = " " * 12 + f"the technology is {technology} and the course topic is {topic}"
    return first_line + "\n" + second_line

In [7]:
# Let pandas print up to 106 rows before it truncates the output.
pd.options.display.max_rows = 106

# Derive one generated description per course (row-wise apply) and show it.
files["course_description_new"] = files.apply(create_course_description, axis="columns")
print(files["course_description_new"])

0      The course name is Introduction to Tableau, th...
1      The course name is The Complete Data Visualiza...
2      The course name is Introduction to R Programmi...
3      The course name is Data Preprocessing with Num...
4      The course name is Introduction to Data and Da...
5      The course name is Data Cleaning and Preproces...
6      The course name is Introduction to Business An...
7      The course name is Data Analysis with Excel Pi...
8      The course name is SQL, the slug is sql,\n    ...
9      The course name is Credit Risk Modeling in Pyt...
10     The course name is Python Programmer Bootcamp,...
11     The course name is SQL + Tableau + Python, the...
12     The course name is Introduction to Jupyter, th...
13     The course name is Statistics, the slug is sta...
14     The course name is Mathematics, the slug is ma...
15     The course name is Introduction to Excel, the ...
16     The course name is Probability, the slug is pr...
17     The course name is Start

In [8]:
# Register the python-dotenv IPython extension, then invoke its %dotenv magic.
# NOTE(review): the next cell also calls load_dotenv(..., override=True), so
# this magic may be redundant — confirm before removing either one.
%load_ext dotenv
%dotenv

In [9]:
# Locate the .env file and load it, letting its values replace any variables
# that are already set in the process environment.
load_dotenv(dotenv_path=find_dotenv(), override=True)

True

In [10]:
# Build the Pinecone client from credentials held in environment variables,
# so no secret is written into the notebook itself.
pc = Pinecone(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENV"),
)

In [11]:
# Settings for the Pinecone index that stores the course embeddings.
index_name = "my-index"
# Must equal the output size of the embedding model used in this notebook.
# 'multi-qa-distilbert-cos-v1' produces 768-dimensional vectors; the previous
# value of 384 matches "all-MiniLM-L6-v2" instead and makes the upsert fail
# with a dimension mismatch.
dimension = 768
# Similarity metric used by the index when ranking matches.
metric = "cosine"

In [12]:
# Start from a clean slate: drop the index if one with this name already exists.
if not any(description.name == index_name for description in pc.list_indexes()):
    print(f"{index_name} not in index list.")
else:
    pc.delete_index(index_name)
    print(f"{index_name} succesfully deleted.")

my-index succesfully deleted.


In [13]:
# Create the index as a serverless deployment on AWS in us-east-1, using the
# name, vector size and similarity metric configured above.
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
pc.create_index(name=index_name, dimension=dimension, metric=metric, spec=serverless_spec)

In [14]:
# Obtain a handle to the index; the upsert and query calls below go through it.
index = pc.Index(index_name)

## Embedding the data

In [15]:
# Sentence-embedding model used for both the course texts and the search query.
# Alternative considered: SentenceTransformer("all-MiniLM-L6-v2"), which emits
# 384-dimensional vectors.
model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# Fail fast with a clear message if the model's vector size does not match the
# dimension the Pinecone index was created with; otherwise the mismatch only
# surfaces later as an error from the upsert call.
embedding_dimension = model.get_sentence_embedding_dimension()
if embedding_dimension != dimension:
    raise ValueError(
        f"Embedding model produces {embedding_dimension}-dimensional vectors, but "
        f"index '{index_name}' is configured with dimension={dimension}."
    )

In [16]:
def create_embeddings(row):
    """Embed the combined description text of one course record.

    The values of ``course_description``, ``course_description_new`` and
    ``course_description_short`` are converted to strings and joined with
    single spaces; the result is passed to the module-level ``model``'s
    ``encode`` method, whose return value is returned unchanged.

    Fields whose value is missing (``None`` / NaN) are left out, so the
    literal text "nan" or "None" is never embedded.  Rows with all three
    fields present produce exactly the same text as before.
    """
    text_fields = ('course_description', 'course_description_new', 'course_description_short')
    # pd.isna covers both None and float NaN, which is how pandas represents
    # empty CSV cells.
    present_parts = [str(row[field]) for field in text_fields if not pd.isna(row[field])]
    combined_text = ' '.join(present_parts)
    embedding = model.encode(combined_text, show_progress_bar = False)
    return embedding

In [17]:
# Compute one embedding per course by applying create_embeddings to every row.
files["embedding"] = files.apply(create_embeddings, axis="columns")

In [18]:
# Pair every course name (used as the vector ID) with its embedding converted
# to a plain Python list, then send all pairs to the index in one upsert call.
vectors_to_upsert = [
    (str(course_name), embedding.tolist())
    for course_name, embedding in zip(files["course_name"], files["embedding"])
]
index.upsert(vectors=vectors_to_upsert)

print("Data upserted to Pinecone index")

Data upserted to Pinecone index


## Semantic search

In [21]:
# Free-text search term, embedded with the same model as the course texts.
query = "clustering"
query_vector = model.encode(query, show_progress_bar=False)
# Converted to a plain list of floats for the Pinecone query call.
query_embedding = query_vector.tolist()

In [22]:
# Ask the index for the 12 nearest stored vectors to the query embedding.
# `vector` takes a single flat list of floats; query_embedding is already such
# a list, so it is passed directly rather than wrapped in another list.
query_results = index.query(
    vector=query_embedding,
    top_k=12,
    # Return the stored vector values alongside each match's id and score.
    include_values=True,
)

In [23]:
# Display the raw query response. Because the query above requested
# include_values=True, each match is shown with its full list of vector values.
query_results

{'matches': [{'id': 'Machine Learning in Excel',
              'score': 0.35445559,
              'values': [-0.0183002669,
                         -0.027948603,
                         -0.0253203455,
                         -0.0126938606,
                         -0.0240366831,
                         -0.0219841097,
                         -0.0511236563,
                         -0.0535800382,
                         0.00997647829,
                         0.0282286629,
                         -0.040832445,
                         -0.0362686366,
                         0.0683277175,
                         -0.0348471329,
                         -0.00728512695,
                         0.0366663188,
                         -0.00331024779,
                         -0.00411821343,
                         -4.75644083e-05,
                         -0.0627968907,
                         0.0846959949,
                         0.0300104674,
                         -0.0528304912

In [25]:
# Report only the matches whose similarity score reaches the cut-off.
score_threshold = 0.3
for match in query_results["matches"]:
    if match['score'] < score_threshold:
        continue  # below the cut-off: not reported
    print(f"Matched item ID: {match['id']}, score: {match['score']}")

Matched item ID: Machine Learning in Excel, score: 0.35445559
Matched item ID: Machine Learning with K-Nearest Neighbors, score: 0.313482136
