In [1]:
pip install gensim



Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.1/211.1 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting cssselect>=0.9.2 (from newspaper3k)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl (18 kB)
Collecting feedparser>=5.2.1 (from newspaper3k)
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.1/81.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tldextract>=2.0.1 (from newspaper3k)
  Obtaining dependency information for tldextract>=2.0.1 from https://files.pythonhosted.org/packages/e4/6b/2e0c1449c0768f25ea8054476a991152a59507ac019a5647d92e44540a73/tldextract-3.5.0-py3-none-any.whl.metadata
  Downloading tldextract-3.5.0-py3-none-any.whl.metadata (11 kB)
Collecting feedfinder2>=0.0.4 (from newspaper3k)
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
  Preparing metadata 

In [3]:
from io import BytesIO
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
from newspaper import Article
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from reportlab.lib import colors
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, Table, ListFlowable, ListItem

# Function to fetch content from a URL using Newspaper3k
def fetch_content(url):
    """Download and parse the article at ``url`` with Newspaper3k.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, text)`` tuple taken from the parsed ``Article``. If any
        exception is raised while building, downloading or parsing the
        article, an error message is printed and ``(None, None)`` is returned.
    """
    try:
        page = Article(url)
        page.download()
        page.parse()
        result = (page.title, page.text)
    except Exception as err:
        # Best-effort fetch: report the problem and hand back an empty result
        # so the caller can decide what to do.
        print(f"An error occurred while fetching content: {str(err)}")
        result = (None, None)
    return result

# Function to extract and preprocess content from HTML
def extract_and_preprocess_content(html_content):
    """Pull the page title and paragraph text out of an HTML document.

    Args:
        html_content: HTML markup, parsed with BeautifulSoup's
            ``'html.parser'`` backend.

    Returns:
        A ``(title, content)`` tuple. ``title`` is the ``.string`` of the
        document's title element, or ``"Title not found"`` when there is no
        title element. ``content`` is the text of every ``<p>`` element,
        joined with single spaces.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')

    if parsed.title:
        title = parsed.title.string
    else:
        title = "Title not found"

    paragraph_texts = (node.get_text() for node in parsed.find_all('p'))
    return title, ' '.join(paragraph_texts)

# Function to preprocess text, tokenize, and create a frequency table
def preprocess_text(text):
    """Build a word-frequency distribution for ``text``.

    The text is word-tokenized, tokens that are not purely alphabetic are
    dropped, the rest are lowercased, and English stop words (from NLTK's
    ``stopwords`` corpus) are removed.

    Args:
        text: The raw text to analyse.

    Returns:
        An ``nltk.probability.FreqDist`` counting the remaining words.
    """
    tokens = word_tokenize(text)
    english_stop_words = set(stopwords.words('english'))

    keywords = []
    for token in tokens:
        # Skip punctuation, numbers and mixed tokens.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in english_stop_words:
            keywords.append(lowered)

    return FreqDist(keywords)

# Function to extract pointwise website summary
def extract_pointwise_summary(sentences, frequency_table, num_sentences=5):
    """Pick the highest-scoring sentences and join them into one summary string.

    A sentence's score is the sum of ``frequency_table`` counts for each token
    produced by ``word_tokenize`` on the lowercased sentence. Tokens missing
    from the table count as 0, so both ``FreqDist``/``Counter`` objects and
    plain dicts are accepted.

    Args:
        sentences: Iterable of sentence strings.
        frequency_table: Mapping from lowercase word to its frequency.
        num_sentences: Maximum number of sentences to keep. Values of zero or
            less yield an empty summary.

    Returns:
        The selected sentences, in descending score order (ties keep their
        input order), joined with single spaces. Returns ``""`` when there is
        nothing to select.
    """
    if num_sentences <= 0:
        return ""

    def sentence_score(sentence):
        # .get() avoids a KeyError when the table is a plain dict.
        return sum(frequency_table.get(token, 0) for token in word_tokenize(sentence.lower()))

    ranked_sentences = sorted(sentences, key=sentence_score, reverse=True)
    top_sentences = ranked_sentences[:num_sentences]

    # The sentences are already natural text, so join them directly. A
    # word-level detokenizer is meant for token lists and can rewrite spacing,
    # quotes or contractions inside the original sentences.
    return " ".join(top_sentences)

# Function to generate a PDF report
def generate_pdf_report(website_url, title, content, keyword_frequency, website_summary,
                        pdf_filename='website_analysis_report.pdf'):
    """Render a website analysis report as a PDF and write it to disk.

    The report contains, in order: a title line, the URL, the page title, the
    summary, and a two-column keyword/frequency table.

    Args:
        website_url: URL of the analysed page (shown in the report).
        title: Page title (shown in the report heading and details section).
        content: Full page text. Only used to compute a summary when
            ``website_summary`` is ``None``.
        keyword_frequency: Mapping of keyword to count; each item becomes one
            table row, in the mapping's iteration order.
        website_summary: Summary text to print. If ``None``, a five-sentence
            summary is computed from ``content`` and ``keyword_frequency``.
        pdf_filename: Path of the PDF file to write.

    Returns:
        None. The PDF is written to ``pdf_filename`` and a confirmation line
        is printed.
    """
    styles = getSampleStyleSheet()

    def text_paragraph(text, style_name):
        # Paragraph parses its text as XML-like markup, so text that may
        # contain '&', '<' or '>' (scraped titles, URLs, body text) must be
        # escaped to be rendered literally.
        return Paragraph(escape(str(text)), styles[style_name])

    # Use the summary the caller already computed; only fall back to
    # computing one when none was supplied.
    if website_summary is None:
        website_summary = extract_pointwise_summary(sent_tokenize(content), keyword_frequency, num_sentences=5)

    elements = [
        # Title
        text_paragraph(f"Website Analysis Report for '{title}'", 'Title'),
        Spacer(1, 12),
        # URL
        text_paragraph(f"URL: {website_url}", 'Normal'),
        Spacer(1, 12),
        # Important Details
        text_paragraph("Important Details:", 'Heading2'),
        Spacer(1, 6),
        text_paragraph(f"Title: {title}\n", 'Normal'),
        Spacer(1, 12),
        # Website Summary
        text_paragraph("Website Summary:", 'Heading2'),
        Spacer(1, 6),
        text_paragraph(website_summary, 'Normal'),
        Spacer(1, 12),
        # Content Analysis
        text_paragraph("Content Analysis:", 'Heading2'),
        Spacer(1, 6),
    ]

    # Keywords Frequency Table (simplified)
    keyword_rows = [[word, freq] for word, freq in keyword_frequency.items()]
    if keyword_rows:
        keyword_table_style = [
            ('GRID', (0, 0), (-1, -1), 1, colors.black),
            # There is no header row, so right-align the count column
            # starting from the very first row.
            ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
        ]
        elements.append(Table(keyword_rows, colWidths=[200, 100], style=keyword_table_style))
    else:
        # A Table with no rows cannot be built; show a placeholder instead.
        elements.append(text_paragraph("No keywords found.", 'Normal'))
    elements.append(Spacer(1, 12))

    # Build into memory first so a failed build never leaves a partial file.
    pdf_buffer = BytesIO()
    SimpleDocTemplate(pdf_buffer, pagesize=letter).build(elements)

    # Save the PDF to a file
    with open(pdf_filename, 'wb') as pdf_file:
        pdf_file.write(pdf_buffer.getvalue())

    print(f"PDF report generated as '{pdf_filename}'.")

# Main function
def main(website_url='https://codewithcurious.com/projects/chatgpt-using-python/'):
    """Fetch a web page, analyse its text and write a PDF report.

    Args:
        website_url: URL of the page to analyse. Defaults to the sample
            project page used by this notebook.

    Returns:
        None. Prints a failure message and stops if no content could be
        fetched; otherwise delegates to ``generate_pdf_report``.
    """
    title, content = fetch_content(website_url)

    # fetch_content returns (None, None) on error; empty text is equally unusable.
    if not content:
        print("Failed to fetch content from the URL.")
        return

    # Preprocess text and create a frequency table
    keyword_frequency = preprocess_text(content)

    # Generate a pointwise website summary
    website_summary = extract_pointwise_summary(sent_tokenize(content), keyword_frequency, num_sentences=5)

    # Generate PDF report
    generate_pdf_report(website_url, title, content, keyword_frequency, website_summary)

# Entry point: run the report pipeline only when this code executes as the
# top-level program, not when it is imported as a module.
if __name__ == '__main__':
    main()


PDF report generated as 'website_analysis_report.pdf'.
