# 📘 Project: Mini Google Search Clone (TF-IDF)
This project crawls a few web pages, converts their text into TF-IDF vectors, and ranks the pages by cosine similarity to a user query, returning the most relevant ones.

# 1. Install & Import Libraries

In [18]:
#  Step 1: Install required libraries

!pip install requests beautifulsoup4 scikit-learn

#  Step 2: Import all necessary libraries
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np




# 2. Crawl Web Pages

In [19]:
# Step 3: Seed pages to crawl (English Wikipedia articles).
# Keep this list ordered: each crawled document is stored at the same
# position as its URL here.
urls = [
    "https://en.wikipedia.org/wiki/Natural_language_processing",
    "https://en.wikipedia.org/wiki/Machine_learning",
    "https://en.wikipedia.org/wiki/Artificial_intelligence",
    "https://en.wikipedia.org/wiki/Deep_learning",
    "https://en.wikipedia.org/wiki/Computer_vision",
    "https://en.wikipedia.org/wiki/Reinforcement_learning",
]


# 3. Fetch & Clean Page Text

In [24]:
# Step 4: Function to scrape and extract clean text from URLs

def get_clean_text_from_url(url, timeout=10):
    """Download a web page and return its visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host and stall the whole crawl.

    Returns:
        The page text with ``<script>`` and ``<style>`` elements removed,
        each text fragment stripped and joined by a single space. Returns
        ``""`` if the page could not be fetched or parsed; the error is
        printed rather than raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx responses as failures so an error page is never
        # indexed as if it were the article itself.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # Remove script and style elements; their contents are not visible
        # page text and would otherwise pollute the extracted document.
        for tag in soup(["script", "style"]):
            tag.decompose()
        return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        # Deliberately best-effort: one bad page should not abort the crawl.
        # The caller receives an empty document for this URL instead.
        print(f"Error fetching {url}: {e}")
        return ""

# Step 5: Download every page in `urls` and collect the extracted text,
# one entry per URL and in the same order.
documents = []
for page_url in urls:
    print(f"Fetching: {page_url}")
    page_text = get_clean_text_from_url(page_url)
    # Keep only the first 100,000 characters of each page to avoid overload.
    documents.append(page_text[:100000])


Fetching: https://en.wikipedia.org/wiki/Natural_language_processing
Natural language processing - Wikipedia Jump to content Main menu Main menu move to sidebar hide Navigation Main page Contents Current events Random article About Wikipedia Contact us Contribute Help Learn to edit Community portal Recent changes Upload file Special pages Search Search Appearance Donate Create account Log in Personal tools Donate Create account Log in Pages for logged out editors learn more Contributions Talk Contents move to sidebar hide (Top) 1 History Toggle History subsection 1.1 Symbolic NLP (1950s – early 1990s) 1.2 Statistical NLP (1990s–present) 2 Approaches: Symbolic, statistical, neural networks Toggle Approaches: Symbolic, statistical, neural networks subsection 2.1 Statistical approach 2.2 Neural networks 3 Common NLP tasks Toggle Common NLP tasks subsection 3.1 Text and speech processing 3.2 Morphological analysis 3.3 Syntactic analysis 3.4 Lexical semantics (of individual words in context)

# 4. Vectorize Text (TF-IDF) & Build the Search Function

In [21]:
# Step 6: Fit a TF-IDF model on the crawled documents.
# English stop words are dropped and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# One row per document, one column per vocabulary term.
tfidf_matrix = vectorizer.fit_transform(documents)

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


TF-IDF matrix shape: (6, 5000)


In [22]:
# Step 7: Search function to find most relevant pages

def search(query, top_k=3):
    """Rank the crawled pages by relevance to ``query``.

    The query is transformed with the fitted ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return. ``None`` means no
            limit; values less than or equal to 0 yield an empty list.

    Returns:
        A list of dicts with keys ``"url"`` and ``"score"``, ordered from
        highest to lowest score. Pages whose similarity to the query is 0
        are omitted, so the list may hold fewer than ``top_k`` entries or
        be empty.
    """
    if top_k is not None and top_k <= 0:
        # A negative value would otherwise be used as a slice end and
        # silently return "all but the last N" pages.
        return []

    query_vec = vectorizer.transform([query])
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Document indices ordered by descending similarity, cut to top_k.
    top_indices = similarity_scores.argsort()[::-1][:top_k]

    return [
        {"url": urls[idx], "score": similarity_scores[idx]}
        for idx in top_indices
        # A zero score is not a match; do not report it as a search result.
        if similarity_scores[idx] > 0
    ]


# 5. Test the Search Engine

In [23]:
# ✅ Step 8: Run a sample query and print the ranked results

query = "neural network"
results = search(query)

print(f"\n🔍 Search Results for: '{query}'\n")
for rank, hit in enumerate(results, start=1):
    # Scores are shown rounded to four decimal places.
    print(f"{rank}. URL: {hit['url']}")
    print(f"   Score: {hit['score']:.4f}\n")



🔍 Search Results for: 'neural network'

1. URL: https://en.wikipedia.org/wiki/Deep_learning
   Score: 0.3136

2. URL: https://en.wikipedia.org/wiki/Reinforcement_learning
   Score: 0.0861

3. URL: https://en.wikipedia.org/wiki/Machine_learning
   Score: 0.0849

