In [16]:
import requests
from bs4 import BeautifulSoup
import csv

# Base URL of the course listing; the page number is appended to it
base_url = "https://courses.analyticsvidhya.com/collections/courses?page="

# Output path kept in one place so the final message always names the real file
output_path = 'courses_vidhya.csv'

# Seconds to wait for the server; without a timeout requests.get can block forever
request_timeout = 30

# Open a CSV file to save the data
with open(output_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Course Name', 'Lessons', 'Price', 'Link'])

    page = 1
    while True:
        # Fetch the webpage for the current page
        response = requests.get(base_url + str(page), timeout=request_timeout)

        # A 404 is treated as "past the last page"; any other HTTP error is a
        # real failure and must not be mistaken for the end of the listing,
        # otherwise a truncated CSV would be reported as a complete scrape.
        if response.status_code == 404:
            break
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all course cards
        courses = soup.find_all('a', class_='course-card')

        # Break if no courses are found (last page)
        if not courses:
            break

        for course in courses:
            # Extract details, falling back to 'N/A' for any missing element
            heading = course.find('h3')
            course_name = heading.text.strip() if heading else 'N/A'
            lessons = course.find('span', class_='course-card__lesson-count')
            lessons = lessons.text.strip() if lessons else 'N/A'
            price = course.find('span', class_='course-card__price')
            price = price.text.strip() if price else 'N/A'
            href = course.get('href')
            link = "https://courses.analyticsvidhya.com" + href if href else 'N/A'

            # Write to CSV
            writer.writerow([course_name, lessons, price, link])

        # Go to the next page
        page += 1

print(f"Scraping complete. Data saved to '{output_path}'.")


Scraping complete. Data saved to 'courses.csv'.


In [18]:
import json

import csv

# Load the CSV data with the csv module so that quoted fields (for example a
# course name containing a comma) come back as a single value. The file is
# read as utf-8 because that is the encoding it was written with.
with open('courses_vidhya.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row (no error if the file is empty)
    data = [row for row in reader if row]  # ignore blank lines

# Convert to dictionary format
courses_data = [{"course_name": course[0], "lessons": course[1], "price": course[2], "link": course[3]} for course in data]

# Save as JSON
with open('courses_vidhya.json', 'w') as json_file:
    json.dump(courses_data, json_file)

print("Data saved as JSON.")


Data saved as JSON.


In [24]:
import json

# Read the scraped course records back from the JSON file
with open('courses_vidhya.json', 'r') as json_in:
    courses_data = json.load(json_in)

# Print at most the first 100 records, one per line
preview = courses_data[:100]
for record in preview:
    print(record)


{'course_name': 'Frameworks for Effective Problem Solving', 'lessons': '18 Lessons', 'price': 'Free', 'link': 'https://courses.analyticsvidhya.com/courses/frameworks-for-effective-problem-solving'}
{'course_name': 'Anyone can Build AI Agents - Free Course', 'lessons': '5 Lessons', 'price': 'Free', 'link': 'https://courses.analyticsvidhya.com/courses/your-ultimate-guide-to-becoming-an-agentic-ai-expert-by-2025'}
{'course_name': 'A Comprehensive Learning Path to Become a Data Analyst in 2025', 'lessons': '298 Lessons', 'price': 'Free', 'link': 'https://courses.analyticsvidhya.com/courses/a-comprehensive-learning-path-to-become-a-data-analyst-in-2025'}
{'course_name': 'Reimagining GenAI: Common Mistakes and Best Practices for Success', 'lessons': '6 Lessons', 'price': 'Free', 'link': 'https://courses.analyticsvidhya.com/courses/reimagining-genai-common-mistakes-and-best-practices-for-success'}
{'course_name': 'Coding a ChatGPT-style Language Model from Scratch in PyTorch', 'lessons': '7 L

In [42]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Derive the course names from the loaded records here, so this cell does not
# depend on a cell further down the notebook having been run first.
course_names = [course['course_name'] for course in courses_data]

# Embed the course names (one embedding row per name, in the same order)
course_embeddings = model.encode(course_names)

# Print a preview of the embeddings
print(course_embeddings[:5])  # Preview embeddings


[[ 0.0207321  -0.00083077 -0.0013544  ...  0.03848465  0.05758337
   0.0582363 ]
 [-0.04901564 -0.09395265 -0.04070522 ... -0.00400748  0.02121344
  -0.03748144]
 [-0.00795452 -0.04323222 -0.00785497 ... -0.04750539 -0.06105438
   0.01614271]
 [-0.04188452  0.07991543 -0.01995979 ...  0.031822   -0.05563084
   0.00499994]
 [-0.07910202 -0.08839826  0.01572709 ...  0.103826    0.01209753
  -0.02748035]]


In [40]:
# Pull each field of the course records out into its own parallel list
course_names, lessons, prices, links = (
    [record[field] for record in courses_data]
    for field in ('course_name', 'lessons', 'price', 'link')
)

# Preview the first five entries of every list
for column in (course_names, lessons, prices, links):
    print(column[:5])


['Frameworks for Effective Problem Solving', 'Anyone can Build AI Agents - Free Course', 'A Comprehensive Learning Path to Become a Data Analyst in 2025', 'Reimagining GenAI: Common Mistakes and Best Practices for Success', 'Coding a ChatGPT-style Language Model from Scratch in PyTorch']
['18 Lessons', '5 Lessons', '298 Lessons', '6 Lessons', '7 Lessons']
['Free', 'Free', 'Free', 'Free', 'Free']
['https://courses.analyticsvidhya.com/courses/frameworks-for-effective-problem-solving', 'https://courses.analyticsvidhya.com/courses/your-ultimate-guide-to-becoming-an-agentic-ai-expert-by-2025', 'https://courses.analyticsvidhya.com/courses/a-comprehensive-learning-path-to-become-a-data-analyst-in-2025', 'https://courses.analyticsvidhya.com/courses/reimagining-genai-common-mistakes-and-best-practices-for-success', 'https://courses.analyticsvidhya.com/courses/coding-a-chatgpt-style-language-model-from-scratch-in-pytorch']


In [44]:
from sklearn.metrics.pairwise import cosine_similarity

def search_courses(query, course_names, course_embeddings, top_n=5):
    """Return the courses whose embeddings are most similar to ``query``.

    The query is encoded with the module-level ``model`` and compared against
    ``course_embeddings`` using ``cosine_similarity``.

    Args:
        query: Free-text search string.
        course_names: Course names, in the same order as the rows of
            ``course_embeddings``.
        course_embeddings: One embedding row per course name.
        top_n: Maximum number of results to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of ``(course_name, similarity)`` tuples ordered from most to
        least similar, containing at most ``top_n`` entries.
    """
    # Guard first: a slice such as [-0:] would select every course instead of
    # none, and there is nothing to rank when the catalogue is empty.
    if top_n <= 0 or len(course_names) == 0:
        return []

    # Encode the query using the same model as the course embeddings
    query_embedding = model.encode([query])

    # Single row of similarities: the query against every course
    scores = cosine_similarity(query_embedding, course_embeddings)[0]

    # Highest score first, then keep the leading top_n positions
    ranked = scores.argsort()[::-1][:top_n]

    return [(course_names[i], scores[i]) for i in ranked]

# Try the search with a sample query
query = "Data analysis course"
results = search_courses(query, course_names, course_embeddings)

# Report every match together with its similarity, rounded to 4 decimals
for title, similarity in results:
    print(f"Course: {title}, Similarity Score: {similarity:.4f}")


Course: A Comprehensive Learning Path to Become a Data Analyst in 2025, Similarity Score: 0.6695
Course: Data Science Career Conclave, Similarity Score: 0.6485
Course: Introductory Data Science for Business Managers, Similarity Score: 0.6233
Course: A Comprehensive Learning Path to Become a Data Scientist in 2024, Similarity Score: 0.6063
Course: Linear Programming for Data Science Professionals, Similarity Score: 0.5631


In [54]:
import faiss
import numpy as np

# Materialise the course embeddings as a float32 numpy matrix
embedding_matrix = np.array(course_embeddings, dtype='float32')

# Build a flat (exhaustive) FAISS index using the L2 distance metric,
# sized to the embedding width, and load every course vector into it
embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)

# Persist the index to disk so it can be reloaded later
faiss.write_index(index, 'course_index.faiss')


In [58]:
def search_courses(query, top_k=5):
    """Return the full records of the courses most similar to ``query``.

    Uses the module-level ``model``, ``cosine_similarity``,
    ``course_embeddings`` and ``courses_data``. Row ``i`` of
    ``course_embeddings`` is expected to describe ``courses_data[i]``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. Values of zero or less
            yield an empty list.

    Returns:
        A list of dicts with the keys ``'Course Name'``, ``'Lessons'``,
        ``'Price'`` and ``'Link'``, ordered from most to least similar.

    Raises:
        ValueError: If ``courses_data`` and ``course_embeddings`` differ in
            length, which means one of them was rebuilt without the other.
    """
    # A slice such as [-0:] would select every course instead of none
    if top_k <= 0:
        return []

    # Positions from the embedding matrix are used to index courses_data, so
    # a length mismatch would silently return the wrong courses.
    if len(courses_data) != len(course_embeddings):
        raise ValueError(
            "courses_data and course_embeddings are out of sync "
            f"({len(courses_data)} records vs {len(course_embeddings)} embeddings); "
            "recompute the embeddings from the current courses_data."
        )

    # Encode the query
    query_embedding = model.encode([query])

    # Single row of similarities: the query against every course
    scores = cosine_similarity(query_embedding, course_embeddings)[0]

    # Highest score first, then keep the leading top_k positions
    best = scores.argsort()[::-1][:top_k]

    # Collect the matching records under display-friendly keys
    results = []
    for idx in best:
        course = courses_data[idx]
        results.append({
            'Course Name': course['course_name'],
            'Lessons': course['lessons'],
            'Price': course['price'],
            'Link': course['link']
        })

    return results

# Example: look up courses related to "Python"
query = "Python"
search_results = search_courses(query)

# Show each matching course record on its own line
for match in search_results:
    print(match)


{'Course Name': 'Introduction to Python', 'Lessons': '69 Lessons', 'Price': 'Free', 'Link': 'https://courses.analyticsvidhya.com/courses/introduction-to-data-science'}
{'Course Name': 'Machine Learning Starter Program', 'Lessons': 'N/A', 'Price': 'Free', 'Link': 'https://courses.analyticsvidhya.com/bundles/machine-learning-starter-program'}
{'Course Name': 'Introduction to Web Scraping using Python', 'Lessons': '13 Lessons', 'Price': 'Free', 'Link': 'https://courses.analyticsvidhya.com/courses/introduction-to-web-scraping'}
{'Course Name': 'Pandas for Data Analysis in Python', 'Lessons': '27 Lessons', 'Price': 'Free', 'Link': 'https://courses.analyticsvidhya.com/courses/pandas-for-data-analysis-in-python'}
{'Course Name': 'Loan Prediction Practice Problem (Using Python)', 'Lessons': '16 Lessons', 'Price': 'Free', 'Link': 'https://courses.analyticsvidhya.com/courses/loan-prediction-practice-problem-using-python'}


In [65]:
import requests
from bs4 import BeautifulSoup
import csv

# Base URL of the course listing; the page number is appended to it
base_url = "https://courses.analyticsvidhya.com/collections/courses?page="

# Output path kept in one place so the final message always names the real file
output_path = 'free_courses_vidhya.csv'

# Seconds to wait for the server; without a timeout requests.get can block forever
request_timeout = 30

# Open a CSV file to save the data
with open(output_path, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Course Name', 'Lessons', 'Price', 'Link'])

    page = 1
    while True:
        # Fetch the webpage for the current page
        response = requests.get(base_url + str(page), timeout=request_timeout)

        # A 404 is treated as "past the last page"; any other HTTP error is a
        # real failure and must not be mistaken for the end of the listing,
        # otherwise a truncated CSV would be reported as a complete scrape.
        if response.status_code == 404:
            break
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all course cards
        courses = soup.find_all('a', class_='course-card')

        # Break if no courses are found (last page)
        if not courses:
            break

        for course in courses:
            # Extract details, falling back to 'N/A' for any missing element
            heading = course.find('h3')
            course_name = heading.text.strip() if heading else 'N/A'
            lessons = course.find('span', class_='course-card__lesson-count')
            lessons = lessons.text.strip() if lessons else 'N/A'
            price = course.find('span', class_='course-card__price')
            price = price.text.strip() if price else 'N/A'
            href = course.get('href')
            link = "https://courses.analyticsvidhya.com" + href if href else 'N/A'

            # Only keep courses whose price label is exactly "free" (case-insensitive)
            if price.lower() == 'free':
                # Write to CSV
                writer.writerow([course_name, lessons, price, link])

        # Go to the next page
        page += 1

print(f"Scraping complete. Data saved to '{output_path}'.")


Scraping complete. Data saved to 'free_courses_vidhya.csv'.


In [67]:
import json

import csv

# Load the CSV data with the csv module so that quoted fields (for example a
# course name containing a comma) come back as a single value. The file is
# read as utf-8 because that is the encoding it was written with.
with open('free_courses_vidhya.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader, None)  # skip the header row (no error if the file is empty)
    data = [row for row in reader if row]  # ignore blank lines

# Convert to dictionary format
courses_data = [{"course_name": course[0], "lessons": course[1], "price": course[2], "link": course[3]} for course in data]

# Save as JSON
with open('free_courses_vidhya.json', 'w') as json_file:
    json.dump(courses_data, json_file)

print("Data saved as JSON.")

Data saved as JSON.
