<a href="https://colab.research.google.com/github/jhayesn13/Test/blob/main/Working_Crawler_with_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Working Code

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import threading
import re

# Create webcrawler class
class WebCrawler:
    """Multi-threaded crawler that collects paragraph text from one domain.

    Starting at ``start_url``, each same-domain page is fetched at most once
    and the joined text of its ``<p>`` elements is stored in ``corpus`` under
    the page URL.
    """

    def __init__(self, start_url, visiting_strategy='preorder', timeout=10):
        """Initialise crawl state.

        Args:
            start_url: URL where the crawl begins. Its network location
                decides which links count as internal.
            visiting_strategy: Traversal order, case-insensitive. Only
                ``'preorder'`` follows links; any other value fetches the
                start page alone.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.start_url = start_url
        self.visiting_strategy = visiting_strategy.lower()
        self.timeout = timeout
        self.visited_urls = set()  # URLs already claimed by some thread
        self.corpus = {}  # url -> extracted paragraph text
        self.main_domain = urlparse(start_url).netloc
        self.lock = threading.Lock()  # guards visited_urls and corpus

    def crawl(self, url):
        """Fetch ``url``, store its text and recurse into its internal links.

        URLs on another domain, or already claimed by any thread, are
        skipped silently. Network, HTTP-status and parsing errors are
        reported on stdout and end the crawl of this page only.
        """
        if not self.is_same_domain(url):
            return

        # The membership test and the insert must be one atomic step;
        # otherwise two threads can both pass the test and fetch the page.
        with self.lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)

        print(f"Visiting: {url}")
        try:
            # Without a timeout a stalled server blocks this thread forever.
            response = requests.get(url, timeout=self.timeout)
            # Do not index the body of 4xx/5xx error pages as content.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            text_content = self.extract_text_content(soup)

            with self.lock:  # Thread-safe update of shared data
                self.corpus[url] = text_content

            print(f"Text Content: {text_content[:100]}...")  # Output a snippet of text

            if self.visiting_strategy == 'preorder':
                # response.url is the address after redirects, which is the
                # correct base for resolving the page's relative links.
                self._crawl_links(self.extract_links(soup, response.url))

            # Additional visiting strategies (inorder, postorder) can be implemented here

        except Exception as e:
            # Thread boundary: report the failure and keep the crawl alive.
            print(f"Error crawling {url}: {e}")

    def _crawl_links(self, links):
        """Crawl each link in its own thread and wait for all of them."""
        # Skip links that are already claimed so no thread is spawned only
        # to return immediately. crawl() re-checks atomically on its own.
        with self.lock:
            pending = [link for link in links if link not in self.visited_urls]

        # NOTE: one thread per link, recursively; the total number of live
        # threads is not bounded here.
        threads = [
            threading.Thread(target=self.crawl, args=(link,))
            for link in pending
        ]
        for thread in threads:
            thread.start()

        # Wait for all threads to complete
        for thread in threads:
            thread.join()

    def extract_text_content(self, soup):
        """Return the text of every ``<p>`` inside ``<body>``, space-joined.

        Returns an empty string when the document has no ``<body>``.
        """
        body = soup.body
        if body is None:
            return ''
        return ' '.join(
            p.get_text(separator=' ', strip=True) for p in body.find_all('p')
        )

    def extract_links(self, soup, base_url=None):
        """Return the crawlable internal links found in ``soup``.

        Args:
            soup: Parsed document whose ``<a href>`` elements are inspected.
            base_url: URL of the page ``soup`` came from, used to resolve
                relative hrefs. Defaults to ``start_url``.

        Returns:
            Absolute http(s) URLs on the crawl domain, in document order,
            without fragments or duplicates, excluding PDF documents and
            any URL containing ``'resources'``.
        """
        base = base_url or self.start_url
        seen = set()
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                parsed = urlparse(urljoin(base, anchor.get('href').strip()))
            except ValueError:
                # A malformed href must not discard the page's other links.
                continue
            # Drops mailto:, javascript:, tel: and similar schemes.
            if parsed.scheme not in ('http', 'https'):
                continue
            # Exclude PDF links
            if parsed.path.lower().endswith('.pdf'):
                continue
            # "#section" variants point at the same document.
            link = parsed._replace(fragment='').geturl()
            # Filter out external links
            if not self.is_same_domain(link):
                continue
            # Exclude links with 'resources' in the URL
            if 'resources' in link.lower():
                continue
            if link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def is_same_domain(self, url):
        """Return True when ``url`` has the same network location as the start URL."""
        return urlparse(url).netloc == self.main_domain

    def start_crawling(self):
        """Run the crawl from ``start_url``; returns once all threads finish."""
        self.crawl(self.start_url)

    def get_crawled_data(self):
        """Return the ``{url: text}`` mapping collected so far."""
        return self.corpus

if __name__ == "__main__":
    # Ask the user where the crawl should begin.
    start_url = input("Enter the website's URL: ")

    # Build a pre-order crawler rooted at that URL and run it to completion.
    crawler = WebCrawler(start_url=start_url, visiting_strategy='preorder')
    crawler.start_crawling()

    # Keep a module-level handle on the results; later cells read it.
    crawled_data = crawler.get_crawled_data()

    # Show every crawled URL with the first 100 characters of its text.
    for url in crawled_data:
        content = crawled_data[url]
        print(f"URL: {url}")
        print(f"Content: {content[:100]}...")  # Print a snippet of content


In [None]:
# Display the stored content of one specific page.

# Relies on 'crawled_data', the dictionary filled by the crawl cell above.
desired_url = 'https://www.stjohns.edu/academics/programs/clinical-laboratory-sciences-bachelor-science'

# Look the URL up directly and report a miss if it was never crawled.
try:
    content = crawled_data[desired_url]
except KeyError:
    print("URL not found in crawled data.")
else:
    print("Content for URL:")
    print(content)

In [None]:
#Crawler with processed text. Includes tokens, removed stopwords, lemmatized tokens, stemming, and sentiment score

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import threading
import re
import spacy
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer

class TextProcessor:
    """Text clean-up and analysis helpers built on spaCy and NLTK."""

    def __init__(self):
        """Load the spaCy model, fetch the NLTK resources and build the tools."""
        self.nlp = spacy.load("en_core_web_sm")
        for resource in ('punkt', 'stopwords', 'wordnet', 'vader_lexicon'):
            nltk.download(resource)

        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.sentiment_analyzer = SentimentIntensityAnalyzer()

    def clean_text(self, text):
        """Return ``text`` lower-cased with every non-word, non-space character removed."""
        without_punctuation = re.sub(r'[^\w\s]', '', text)
        return without_punctuation.lower()

    def tokenize_text(self, text):
        """Return the text of each token the spaCy pipeline yields for ``text``."""
        return [tok.text for tok in self.nlp(text)]

    def remove_stopwords(self, tokens):
        """Return ``tokens`` without those whose lower-cased form is a stop word."""
        kept = []
        for word in tokens:
            if word.lower() in self.stop_words:
                continue
            kept.append(word)
        return kept

    def lemmatize_text(self, tokens):
        """Return the spaCy lemma of each token in the space-joined ``tokens``."""
        joined = " ".join(tokens)
        return [tok.lemma_ for tok in self.nlp(joined)]

    def stem_text(self, tokens):
        """Return the stemmer's output for each entry of ``tokens``."""
        return list(map(self.stemmer.stem, tokens))

    def analyze_sentiment(self, text):
        """Return the sentiment analyzer's polarity scores for ``text``."""
        return self.sentiment_analyzer.polarity_scores(text)

class WebCrawler:
    """Multi-threaded crawler that stores processed page text for one domain.

    Starting at ``start_url``, each same-domain page is fetched at most once.
    Its title, paragraph text and the ``TextProcessor`` outputs derived from
    that text are stored in ``corpus`` under the page URL.
    """

    def __init__(self, start_url, visiting_strategy='preorder', timeout=10):
        """Initialise crawl state.

        Args:
            start_url: URL where the crawl begins. Its network location
                decides which links count as internal.
            visiting_strategy: Traversal order, case-insensitive. Only
                ``'preorder'`` follows links; any other value fetches the
                start page alone.
            timeout: Seconds to wait on each HTTP request before giving up.
        """
        self.start_url = start_url
        self.visiting_strategy = visiting_strategy.lower()
        self.timeout = timeout
        self.visited_urls = set()  # URLs already claimed by some thread
        self.corpus = {}  # url -> dict of title, text and processing results
        self.main_domain = urlparse(start_url).netloc
        self.lock = threading.Lock()  # guards visited_urls and corpus
        self.text_processor = TextProcessor()  # Instantiate TextProcessor

    def crawl(self, url):
        """Fetch ``url``, store its processed text and recurse into its links.

        URLs on another domain, or already claimed by any thread, are
        skipped silently. Network, HTTP-status, parsing and processing
        errors are reported on stdout and end the crawl of this page only.
        """
        if not self.is_same_domain(url):
            return

        # The membership test and the insert must be one atomic step;
        # otherwise two threads can both pass the test and fetch the page.
        with self.lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)

        print(f"Visiting: {url}")
        try:
            # Without a timeout a stalled server blocks this thread forever.
            response = requests.get(url, timeout=self.timeout)
            # Do not index the body of 4xx/5xx error pages as content.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            title = self.extract_title(soup)
            text_content = self.extract_text_content(soup)

            # Process text content
            cleaned_text = self.text_processor.clean_text(text_content)
            tokens = self.text_processor.tokenize_text(cleaned_text)
            filtered_tokens = self.text_processor.remove_stopwords(tokens)
            lemmatized_tokens = self.text_processor.lemmatize_text(filtered_tokens)
            stemmed_tokens = self.text_processor.stem_text(filtered_tokens)
            # Sentiment is scored on the raw text, not the cleaned text.
            sentiment_scores = self.text_processor.analyze_sentiment(text_content)

            with self.lock:  # Thread-safe update of shared data
                self.corpus[url] = {
                    'title': title,
                    'text_content': text_content,
                    'cleaned_text': cleaned_text,
                    'tokens': tokens,
                    'filtered_tokens': filtered_tokens,
                    'lemmatized_tokens': lemmatized_tokens,
                    'stemmed_tokens': stemmed_tokens,
                    'sentiment_scores': sentiment_scores
                }

            print(f"Title: {title}")

            if self.visiting_strategy == 'preorder':
                # response.url is the address after redirects, which is the
                # correct base for resolving the page's relative links.
                self._crawl_links(self.extract_links(soup, response.url))

            # Additional visiting strategies (inorder, postorder) can be implemented here

        except Exception as e:
            # Thread boundary: report the failure and keep the crawl alive.
            print(f"Error crawling {url}: {e}")

    def _crawl_links(self, links):
        """Crawl each link in its own thread and wait for all of them."""
        # Skip links that are already claimed so no thread is spawned only
        # to return immediately. crawl() re-checks atomically on its own.
        with self.lock:
            pending = [link for link in links if link not in self.visited_urls]

        # NOTE: one thread per link, recursively; the total number of live
        # threads is not bounded here.
        threads = [
            threading.Thread(target=self.crawl, args=(link,))
            for link in pending
        ]
        for thread in threads:
            thread.start()

        # Wait for all threads to complete
        for thread in threads:
            thread.join()

    def extract_title(self, soup):
        """Return the stripped ``<title>`` text, or ``'Untitled'`` if unavailable."""
        title_tag = soup.title
        # .string is None when <title> is empty or holds nested markup;
        # calling .strip() on it would raise and discard the whole page.
        text = title_tag.string if title_tag is not None else None
        return text.strip() if text else 'Untitled'

    def extract_text_content(self, soup):
        """Return the text of every ``<p>`` inside ``<body>``, space-joined.

        Returns an empty string when the document has no ``<body>``.
        """
        body = soup.body
        if body is None:
            return ''
        return ' '.join(
            p.get_text(separator=' ', strip=True) for p in body.find_all('p')
        )

    def extract_links(self, soup, base_url=None):
        """Return the crawlable internal links found in ``soup``.

        Args:
            soup: Parsed document whose ``<a href>`` elements are inspected.
            base_url: URL of the page ``soup`` came from, used to resolve
                relative hrefs. Defaults to ``start_url``.

        Returns:
            Absolute http(s) URLs on the crawl domain, in document order,
            without fragments or duplicates, excluding PDF documents and
            any URL containing ``'resources'``.
        """
        base = base_url or self.start_url
        seen = set()
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                parsed = urlparse(urljoin(base, anchor.get('href').strip()))
            except ValueError:
                # A malformed href must not discard the page's other links.
                continue
            # Drops mailto:, javascript:, tel: and similar schemes.
            if parsed.scheme not in ('http', 'https'):
                continue
            # Exclude PDF documents regardless of extension case.
            if parsed.path.lower().endswith('.pdf'):
                continue
            # "#section" variants point at the same document.
            link = parsed._replace(fragment='').geturl()
            if not self.is_same_domain(link):
                continue
            if 'resources' in link.lower():
                continue
            if link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def is_same_domain(self, url):
        """Return True when ``url`` has the same network location as the start URL."""
        return urlparse(url).netloc == self.main_domain

    def start_crawling(self):
        """Run the crawl from ``start_url``; returns once all threads finish."""
        self.crawl(self.start_url)

    def get_crawled_data(self):
        """Return the ``{url: record}`` mapping collected so far."""
        return self.corpus

if __name__ == "__main__":
    # Ask the user where the crawl should begin.
    start_url = input("Enter the website's URL: ")

    # Build a pre-order crawler rooted at that URL and run it to completion.
    crawler = WebCrawler(start_url=start_url, visiting_strategy='preorder')
    crawler.start_crawling()

    # Keep a module-level handle on the per-URL result records.
    crawled_data = crawler.get_crawled_data()

    # Dump every stored field for each crawled page.
    for url in crawled_data:
        data = crawled_data[url]
        print(f"URL: {url}")
        print(f"Title: {data['title']}")
        print(f"Cleaned Text: {data['cleaned_text']}")
        print(f"Tokens: {data['tokens']}")
        print(f"Filtered Tokens: {data['filtered_tokens']}")
        print(f"Lemmatized Tokens: {data['lemmatized_tokens']}")
        print(f"Stemmed Tokens: {data['stemmed_tokens']}")
        print(f"Sentiment Scores: {data['sentiment_scores']}")
