In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install newspaper3k

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install rake-nltk


Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install openai


Note: you may need to restart the kernel to use updated packages.


In [9]:
pip install yake


Collecting yake
  Downloading yake-0.4.8-py2.py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.2/60.2 kB[0m [31m700.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting segtok (from yake)
  Downloading segtok-1.5.11-py3-none-any.whl (24 kB)
Collecting jellyfish (from yake)
  Obtaining dependency information for jellyfish from https://files.pythonhosted.org/packages/ac/5a/fafb2fe555f34e5aeed8c11153257c5af09197451eecb36207e4e2973aed/jellyfish-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading jellyfish-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.5 kB)
Downloading jellyfish-1.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: segtok, jellyfish, yake
Successfully installed jellyfish

In [16]:
import requests
from newspaper import Article
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, ListFlowable, ListItem
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
from io import BytesIO
from yake import KeywordExtractor  # Import YAKE for keyword extraction

# Function to fetch content from a URL using Newspaper3k
def fetch_content(url):
    """Download and parse the page at ``url`` with Newspaper3k's ``Article``.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, text)`` tuple taken from the parsed article, or
        ``(None, None)`` if any exception is raised along the way (the
        error message is printed rather than propagated).
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        # Attribute access stays inside the try so any failure is reported uniformly.
        outcome = (page.title, page.text)
    except Exception as err:
        print(f"An error occurred while fetching content: {str(err)}")
        outcome = (None, None)
    return outcome

# Function to extract and preprocess content from HTML
def extract_and_preprocess_content(html_content):
    """Extract the page title and the paragraph text from raw HTML.

    Args:
        html_content: HTML markup to parse with BeautifulSoup's 'html.parser'.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the string of the
        ``<title>`` tag, or "Title not found" when the tag is missing or has
        no single string; ``content`` is the text of every ``<p>`` element
        joined with single spaces.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Tag.string is None when <title> is empty or has more than one child, so
    # checking only for the tag's presence would leak None to the caller.
    title_tag = soup.title
    title_string = title_tag.string if title_tag is not None else None
    title = title_string if title_string else "Title not found"

    content = ' '.join(p.get_text() for p in soup.find_all('p'))
    return title, content

# Function to preprocess text, tokenize, and create a frequency table
def preprocess_text(text):
    """Build a word-frequency distribution for ``text``.

    The text is tokenized with NLTK's ``word_tokenize``; only purely
    alphabetic tokens are kept, lowercased, and English stopwords are
    discarded before counting.

    Args:
        text: The raw text to analyse.

    Returns:
        An NLTK ``FreqDist`` over the remaining lowercase words.
    """
    tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    kept_words = []
    for token in tokens:
        # Drops punctuation, numbers and mixed tokens such as "covid-19".
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in english_stopwords:
            kept_words.append(lowered)

    return FreqDist(kept_words)

# Function to extract pointwise website summary
def extract_pointwise_summary(sentences, frequency_table, num_sentences=5):
    """Pick the highest-scoring sentences and join them into one summary string.

    A sentence's score is the sum of the ``frequency_table`` counts of its
    lowercased tokens. The ``num_sentences`` best-scoring sentences are
    returned, highest score first, separated by single spaces.

    Args:
        sentences: Iterable of sentence strings.
        frequency_table: Mapping of lowercase word -> count (for example the
            ``FreqDist`` built by ``preprocess_text``). Words that are not in
            the mapping contribute 0.
        num_sentences: Maximum number of sentences to include.

    Returns:
        The summary as a single string; "" when ``sentences`` is empty.
    """
    def sentence_score(sentence):
        # .get() lets a plain dict be used as the table without KeyError.
        return sum(frequency_table.get(word, 0) for word in word_tokenize(sentence.lower()))

    ranked_sentences = sorted(sentences, key=sentence_score, reverse=True)

    # The selected items are complete sentences, not word tokens, so they are
    # joined verbatim. A token detokenizer would apply contraction/punctuation
    # rewrites to the original wording (e.g. merging "can not").
    return " ".join(ranked_sentences[:num_sentences])

# Function to count images, URLs, and videos on the website
def count_website_media(url, timeout=10):
    """Count the images, links and videos present in a page's HTML.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled host.

    Returns:
        A tuple ``(images, links, videos)`` with the number of ``<img>`` tags,
        ``<a>`` tags carrying an ``href`` attribute, and ``<video>`` tags.
        ``(None, None, None)`` is returned (and a message printed) when the
        response status is not 200 or any exception occurs.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch content from {url}. Status code: {response.status_code}")
            return None, None, None

        soup = BeautifulSoup(response.text, 'html.parser')
        images = len(soup.find_all('img'))
        links = len(soup.find_all('a', href=True))
        videos = len(soup.find_all('video'))
        return images, links, videos
    except Exception as e:
        # Best-effort: media counts are optional, so report and carry on.
        print(f"An error occurred while counting media: {str(e)}")
        return None, None, None

# Function to dynamically generate information about the website using YAKE for keyword extraction
def generate_website_info(content, num_keywords=5):
    """Describe a website in one sentence using YAKE keyword extraction.

    Args:
        content: Text of the website to analyse.
        num_keywords: How many of the extracted keywords to mention, taken
            from the front of the list YAKE returns. Defaults to 5.

    Returns:
        A sentence of the form "This website is related to a, b, c.", or a
        short notice when no keywords could be extracted.
    """
    keyword_extractor = KeywordExtractor()

    # extract_keywords yields (keyword, score) pairs; only the keyword is used.
    keywords = keyword_extractor.extract_keywords(content)
    top_keywords = [keyword for keyword, _score in keywords[:num_keywords]]

    # Avoid emitting the malformed sentence "This website is related to ."
    if not top_keywords:
        return "No keywords could be extracted for this website."

    return f"This website is related to {', '.join(top_keywords)}."

# Function to generate a PDF report
def generate_pdf_report(website_url, title, content, keyword_frequency, website_summary, images_count, links_count, videos_count, website_info, pdf_filename='website_analysis_report.pdf'):
    """Render the analysis results into a PDF file with ReportLab.

    The report contains, in order: a title, the URL, the page title, the
    generated "about" sentence, the ten most common keywords, the media
    counts, and the extractive summary.

    Args:
        website_url: URL that was analysed.
        title: Page title.
        content: Full page text. Accepted for interface compatibility; it is
            not written to the report.
        keyword_frequency: Object exposing ``most_common(n)`` that returns
            ``(word, count)`` pairs (e.g. an NLTK ``FreqDist``).
        website_summary: Summary text to print in the last section.
        images_count: Number of images, or None if unknown.
        links_count: Number of links, or None if unknown.
        videos_count: Number of videos, or None if unknown.
        website_info: One-sentence description of the website.
        pdf_filename: Path of the PDF to write.

    Returns:
        None. The PDF is written to ``pdf_filename`` and a confirmation
        message is printed.
    """
    # Function-scope stdlib import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # Build the stylesheet once instead of once per paragraph.
    styles = getSampleStyleSheet()
    elements = []

    def add_paragraph(markup, style_name, space_after):
        """Append a paragraph followed by vertical spacing."""
        elements.append(Paragraph(markup, styles[style_name]))
        elements.append(Spacer(1, space_after))

    def safe(value):
        """Escape &, < and > so scraped text is not parsed as Paragraph markup."""
        return escape(str(value))

    def count_label(value):
        """Show a readable placeholder when a media count is unavailable."""
        return "N/A" if value is None else str(value)

    # Title
    add_paragraph(f"Website Analysis Report for '{safe(title)}'", 'Title', 12)

    # URL
    add_paragraph(f"URL: {safe(website_url)}", 'Normal', 12)

    # Important Details
    add_paragraph("Important Details:", 'Heading2', 6)
    add_paragraph(f"Title: {safe(title)}", 'Normal', 12)

    # About the Website (Dynamic Info)
    add_paragraph("About the Website:", 'Heading2', 6)
    add_paragraph(safe(website_info), 'Normal', 12)

    # Keywords Section: the most frequent words with their counts
    add_paragraph("Keywords:", 'Heading2', 6)
    num_keywords_to_display = 10
    important_keywords = keyword_frequency.most_common(num_keywords_to_display)
    keywords_text = ", ".join(f"{safe(word)} ({freq} times)" for word, freq in important_keywords)
    add_paragraph(keywords_text, 'Normal', 12)

    # Media Information
    add_paragraph("Media Information:", 'Heading2', 6)
    # Paragraph collapses "\n" like any other whitespace; <br/> is the markup
    # that actually starts a new line.
    media_text = (
        f"Images: {count_label(images_count)}<br/>"
        f"Links: {count_label(links_count)}<br/>"
        f"Videos: {count_label(videos_count)}"
    )
    add_paragraph(media_text, 'Normal', 12)

    # Website Summary
    add_paragraph("Website Summary:", 'Heading2', 6)
    add_paragraph(safe(website_summary), 'Normal', 12)

    # Build in memory first so a failed build never leaves a truncated file on disk.
    pdf_buffer = BytesIO()
    doc = SimpleDocTemplate(pdf_buffer, pagesize=letter)
    doc.build(elements)

    with open(pdf_filename, 'wb') as pdf_file:
        pdf_file.write(pdf_buffer.getvalue())

    print(f"PDF report generated as '{pdf_filename}'.")

# Main function
def main(website_url='https://www.sbicard.com/'):
    """Run the full analysis pipeline for one website and write the PDF report.

    Steps: fetch the article text, build the keyword frequency table, count
    media tags, produce the extractive summary and the keyword-based
    description, then generate the PDF.

    Args:
        website_url: URL of the website to analyse. Defaults to the URL the
            notebook was originally written against.

    Returns:
        None. Prints a failure message and stops early when no content could
        be fetched.
    """
    title, content = fetch_content(website_url)

    # Nothing downstream can work without text, so bail out early.
    if not content:
        print("Failed to fetch content from the URL.")
        return

    # Preprocess text and create a frequency table
    keyword_frequency = preprocess_text(content)

    # Count images, URLs, and videos on the website (each may be None on failure)
    images_count, links_count, videos_count = count_website_media(website_url)

    # Generate a website summary from the top-ranked sentences
    website_summary = extract_pointwise_summary(sent_tokenize(content), keyword_frequency, num_sentences=5)

    # Generate dynamic website information
    website_info = generate_website_info(content)

    # Generate PDF report
    generate_pdf_report(website_url, title, content, keyword_frequency, website_summary, images_count, links_count, videos_count, website_info)

# Entry point: run the analysis only when this code is executed as the main
# program (not when it is imported as a module).
if __name__ == '__main__':
    main()


PDF report generated as 'website_analysis_report.pdf'.
