# Embed course descriptions and build an ANN index

Import libraries

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

  from .autonotebook import tqdm as notebook_tqdm


Read data

In [2]:
# Load the courses table from the CSV file under ./data.
df = pd.read_csv(filepath_or_buffer='./data/courses.csv')

In [3]:
# Show the loaded frame as the cell output (the rendered table elides the middle rows).
df

Unnamed: 0,Course Name,Course URL,About This Course,Description,keywords
0,CS50's Introduction to Computer Science,https://www.edx.org/learn/computer-science/har...,"This isCS50x, Harvard University's introductio...","this iscs50x, harvard university's introductio...","programming, harvardx, courses, iscs50x, cs50,..."
1,CS50's Introduction to Programming with Python,https://www.edx.org/learn/python/harvard-unive...,An introduction to programming using a languag...,an introduction to programming using a languag...,"programming, python, cs50x, software, cs50p, p..."
2,CS50's Introduction to Artificial Intelligence...,https://www.edx.org/learn/artificial-intellige...,This course explores the concepts and algorith...,this course explores the concepts and algorith...,"algorithms, python, handwriting, learning, sea..."
3,CS50's Introduction to Cybersecurity,https://www.edx.org/learn/cybersecurity/harvar...,This is CS50's introduction to cybersecurity f...,this is cs50's introduction to cybersecurity f...,"cybersecurity, threats, protect, usability, ri..."
4,CS50's Web Programming with Python and JavaScript,https://www.edx.org/learn/web-development/harv...,"Topics include database design, scalability, s...","topics include database design, scalability, s...","heroku, github, applications, cloud, projects,..."
...,...,...,...,...,...
1552,Predicting CTR with Machine Learning in Python,https://www.datacamp.com/courses/predicting-ct...,Have you ever wondered how companies like Face...,have you ever wondered how companies like face...,"ads, ad, learning, python, click, learn, targe..."
1553,Optimizing R Code with Rcpp,https://www.datacamp.com/courses/optimizing-r-...,"R is a great language for data science, but so...","r is a great language for data science, but so...","rcpp, boost, performance, language, compiled, ..."
1554,GDPR in Practice: Compliance and Fines,https://www.datacamp.com/courses/gdpr-in-pract...,Apply GDPR Concepts in Real Business Scenarios...,apply gdpr concepts in real business scenarios...,"gdpr, compliance, privacy, regulation, data, a..."
1555,Scalable AI Models with PyTorch Lightning,https://www.datacamp.com/courses/scalable-ai-m...,Foundations of Scalable AI | This course takes...,foundations of scalable ai | this course takes...,"ai, optimizers, optimized, optimize, learning,..."


Embedding description

In [4]:
# Load the all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encode every value of the 'Description' column, in row order, with a progress bar.
descriptions = df['Description'].tolist()
encoded = model.encode(descriptions, show_progress_bar=True)

# Materialise the result as a float32 NumPy array.
embeddings = np.array(encoded).astype("float32")

Batches: 100%|██████████| 49/49 [05:41<00:00,  6.98s/it]


Create the `FAISS` index

In [5]:
# Width of one embedding vector (second axis of the embeddings matrix).
dimension = embeddings.shape[1]

# L2-normalise the vectors (the call's return value is not used; FAISS
# normalises the array in place). With unit-length vectors, the inner-product
# scores of IndexFlatIP are cosine similarities.
faiss.normalize_L2(embeddings)

# Build a flat inner-product index, add every vector, and write it to disk.
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)
faiss.write_index(index, "course_index.faiss")

Save metadata

In [6]:
# Keep only the columns that describe a course, with a fresh 0..n-1 index.
# Row order is unchanged from `df`, which is the order the descriptions were
# encoded in, so row i is expected to line up with the i-th vector in the index.
metadata_df = df[['Course Name', 'Course URL', 'About This Course']].reset_index(drop=True)

# Persist the trimmed metadata frame. Previously the full `df` was written here,
# which left `metadata_df` unused and put every column of `df` into the metadata file.
metadata_df.to_csv("./data/course_metadata.csv", index=False)