In [None]:
import requests
from bs4 import BeautifulSoup

def fetch_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        return None


In [1]:
def extract_gymnastics_data(soup):
    data = []
    for item in soup.find_all('div', class_='skill-item'):  # Adjust based on site structure
        skill_name = item.find('h2').text.strip()
        description = item.find('p').text.strip()
        data.append({'skill': skill_name, 'description': description})
    return data


## Data Indexing

In [2]:
import re

def index_data(data):
    index = {}
    for entry in data:
        words = re.findall(r'\w+', entry['description'].lower())
        for word in words:
            if word not in index:
                index[word] = []
            index[word].append(entry['skill'])
    return index


## Text Processing

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def process_text(index):
    processed_index = {}
    for word, skills in index.items():
        if word not in stop_words:
            stemmed_word = stemmer.stem(word)
            if stemmed_word not in processed_index:
                processed_index[stemmed_word] = []
            processed_index[stemmed_word].extend(skills)
    return processed_index


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vedanshsurjan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Search Query

In [5]:
def search(query, index):
    query_words = re.findall(r'\w+', query.lower())
    results = {}
    for word in query_words:
        if word in index:
            for skill in index[word]:
                results[skill] = results.get(skill, 0) + 1
    return sorted(results.items(), key=lambda x: x[1], reverse=True)


## Query Expansion

In [6]:
from nltk.corpus import wordnet

# Download WordNet data
import nltk
nltk.download('wordnet')

def expand_query_with_synonyms(query):
    expanded_terms = set(query.split())  # Start with original query terms
    for word in query.split():
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                expanded_terms.add(lemma.name().replace('_', ' '))
    return list(expanded_terms)

query = "cartwheel"
expanded_query = expand_query_with_synonyms(query)
print("Expanded Query:", expanded_query)


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vedanshsurjan/nltk_data...


Expanded Query: ['silver dollar', 'cartwheel']


## Stemming and Lemmatization

In [7]:
from nltk.stem import PorterStemmer

def stem_query(query):
    stemmer = PorterStemmer()
    return [stemmer.stem(word) for word in query.split()]

query = "jumping cartwheels"
expanded_query = stem_query(query)
print("Stemmed Query:", expanded_query)


Stemmed Query: ['jump', 'cartwheel']


## Integration with Search Engine

In [8]:
def search_with_expansion(query, index):
    expanded_query = expand_query_with_synonyms(query)  # Or use another expansion method
    results = {}
    for term in expanded_query:
        if term in index:
            for skill in index[term]:
                results[skill] = results.get(skill, 0) + 1  # Increment score for matches
    return sorted(results.items(), key=lambda x: x[1], reverse=True)

query = "cartwheel"
results = search_with_expansion(query, indexed_data)
print("Search Results:", results)


NameError: name 'indexed_data' is not defined