In [5]:
pip install reportlab


Note: you may need to restart the kernel to use updated packages.


In [8]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
import pdfkit
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors  # Import the 'colors' module
from io import BytesIO


# Function to fetch content from a URL
def fetch_content(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (a diagnostic message is printed in that case).
    """
    # Keep the try body to the single call that performs network I/O, and
    # catch only the errors requests itself raises so that unrelated bugs
    # are not silently swallowed.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"An error occurred while fetching content: {str(e)}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch content from {url}. Status code: {response.status_code}")
        return None

    return response.text

# Function to extract and preprocess content from HTML
def extract_and_preprocess_content(html_content):
    """Return the text of every ``<p>`` element in *html_content*.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and the
    text of each paragraph tag is joined with a single space.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    paragraph_texts = (tag.get_text() for tag in parsed.find_all('p'))
    return ' '.join(paragraph_texts)

# Function to preprocess text, tokenize, and create a frequency table
def preprocess_text(text):
    """Build a word-frequency distribution for *text*.

    The text is split into word tokens; tokens that are not purely alphabetic
    are dropped, the rest are lower-cased, and English stop words are removed
    before counting.

    Returns:
        An ``nltk`` ``FreqDist`` mapping each remaining word to its count.
    """
    tokens = word_tokenize(text)
    excluded = set(stopwords.words('english'))

    # Alphabetic filter is applied before lower-casing, stop-word filter after.
    lowered = (token.lower() for token in tokens if token.isalpha())
    return FreqDist(word for word in lowered if word not in excluded)

# Function to generate a PDF report
def generate_pdf_report(website_url, content, keyword_frequency,
                        pdf_filename='website_analysis_report.pdf',
                        max_keywords=None):
    """Write a PDF report listing keyword frequencies for a web page.

    Args:
        website_url: URL of the analysed page; shown near the top of the report.
        content: Text of the analysed page. Currently not rendered in the
            report; kept so existing callers continue to work.
        keyword_frequency: Mapping of word -> occurrence count (for example
            an ``nltk`` ``FreqDist``).
        pdf_filename: Path of the PDF file to write.
        max_keywords: If given, only the this-many most frequent keywords are
            listed; ``None`` lists all of them.

    Side effects:
        Writes ``pdf_filename`` and prints a confirmation message.
    """
    # Local import: Paragraph text is parsed as XML-like markup, so characters
    # such as '&' (common in query strings) must be escaped first.
    from xml.sax.saxutils import escape

    styles = getSampleStyleSheet()  # build the style sheet once, not per element

    # Most frequent first; ties broken alphabetically for a stable layout.
    ranked = sorted(keyword_frequency.items(), key=lambda item: (-item[1], item[0]))
    if max_keywords is not None:
        ranked = ranked[:max_keywords]

    # The header row labels the columns and also guarantees the table is never
    # empty, which reportlab would otherwise reject.
    keyword_table_data = [['Keyword', 'Frequency']]
    keyword_table_data.extend([word, freq] for word, freq in ranked)

    keyword_table_style = [
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
        ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ]
    keyword_table = Table(
        keyword_table_data,
        colWidths=[200, 100],
        style=keyword_table_style,
        repeatRows=1,  # repeat the header when the table spans several pages
    )

    elements = [
        # Title
        Paragraph("Website Analysis Report", styles['Title']),
        Spacer(1, 12),
        # URL
        Paragraph(f"URL: {escape(website_url)}", styles['Normal']),
        Spacer(1, 12),
        # Content Analysis
        Paragraph("Content Analysis:", styles['Heading2']),
        Spacer(1, 6),
        # Keywords Frequency Table
        keyword_table,
        Spacer(1, 12),
    ]

    # Render into memory first so a failed build does not leave a partial file.
    pdf_buffer = BytesIO()
    doc = SimpleDocTemplate(pdf_buffer, pagesize=letter)
    doc.build(elements)

    with open(pdf_filename, 'wb') as pdf_file:
        pdf_file.write(pdf_buffer.getvalue())

    print(f"PDF report generated as '{pdf_filename}'.")

# Main function
def main():
    """Fetch a web page, count its keywords, and write the PDF report.

    Prints a message and returns early when the page cannot be fetched or
    when no keywords remain after preprocessing.
    """
    website_url = 'https://medium.com/dataflair/these-projects-will-make-you-the-superhero-of-python-city-14101e62393b'  # Replace with the target website URL
    content = fetch_content(website_url)

    if not content:
        print("Failed to fetch content from the URL.")
        return

    # Extract the paragraph text from the raw HTML
    article_content = extract_and_preprocess_content(content)

    # Preprocess text and create a frequency table
    keyword_frequency = preprocess_text(article_content)

    # A page without usable paragraph text yields no keywords; there is
    # nothing meaningful to put in the report's table in that case.
    if not keyword_frequency:
        print("No keywords found in the page content; no report generated.")
        return

    # Generate PDF report from the extracted text rather than the raw HTML
    generate_pdf_report(website_url, article_content, keyword_frequency)

# Run the analysis only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()


PDF report generated as 'website_analysis_report.pdf'.
