In [33]:
from urllib.request import urlopen
import bs4 as bs
import requests
from bs4 import BeautifulSoup
import unicodedata

In [44]:
class AutomatedDataScraping():
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        }
        self.getResponse()
        
    def cleanResponse(self, content):
        return [BeautifulSoup(element.text, 'html.parser').get_text().replace('\xa0', ' ').strip() for element in content]
    
    def cleanPara(self, content):
        cleaned_text = [
            unicodedata.normalize('NFKD', element).encode('ascii', 'ignore').decode('utf-8', 'ignore').replace('\xa0', ' ').strip()
            for element in content
        ]
        return cleaned_text

    def extract_paragraphs(self, soup):
        p_tags = soup.find_all('p')
        return self.cleanResponse(p_tags)
    
    def extract_videos(self, soup):
        video_tags = soup.find_all('video')
        video_sources = [video.get('src', '') for video in video_tags]
        return video_sources
    
    def urlAnalyzer(self):
        url_without_protocol = self.url.split("://")[1]
        url_words = url_without_protocol.split("/")
        url_words_split = [word.split("-") for word in url_words]
        url_words_flat = [word for sublist in url_words_split for word in sublist]
        return url_words_flat
        
    def getResponse(self):
        with requests.Session() as session:
            self.response = session.get(self.url, headers=self.headers)
            soup = BeautifulSoup(self.response.text, 'html.parser')
            
            self.title_tag = soup.find('title')
            self.title = self.title_tag.text if self.title_tag else None
            self.h1_tags = soup.find_all(['h1'])
            self.h2_tags = soup.find_all(['h2'])
            self.h3_tags = soup.find_all(['h3'])
            self.h4_tags = soup.find_all(['h4'])
            self.h5_tags = soup.find_all(['h5'])
            self.h6_tags = soup.find_all(['h6'])
            self.img_tags = soup.find_all('img')
            self.alt_attributes = [img.get('alt', '') for img in self.img_tags]
            self.paragraphs = self.extract_paragraphs(soup)
            self.videos = self.extract_videos(soup)
            
    def returnResponse(self):
        self.dictionary = {
            'title': self.title,
            'h1': self.cleanResponse(self.h1_tags),
            'img_tags': self.img_tags,
            'h2': self.cleanResponse(self.h2_tags),
            'h3': self.cleanResponse(self.h4_tags),
            'h4': self.cleanResponse(self.h4_tags),
            'h5': self.cleanResponse(self.h5_tags),
            'h6': self.cleanResponse(self.h6_tags),
            'p': ' '.join(self.cleanPara(self.paragraphs)),
            'videos': self.videos,
            'url': self.urlAnalyzer(),
        } 
        return self.dictionary

## Web Scrapping

In [46]:
urls = [
    "https://www.realbuzz.com/articles-interests/sports-activities/article/top-10-tips-for-starting-out-in-basketball/",
    "https://www.wikihow.com/Improve-at-Basketball",
    "https://www.webmd.com/fitness-exercise/health-benefits-basketball",
]

instance = AutomatedDataScraping(
    url=urls[2]
)
result = instance.returnResponse()
result['p']

'aBasketball is a sport that is loved and enjoyed by people all over the world. In 2019, The International Basketball Federation (FIBA) said that basketball had 450 million players and fans globally. The sport has two teams with five players who each compete to score points by pushing a ball down a hoop placed 10 feet off the ground. A Because basketball doesnat need much equipment and can be set up easily anywhere, it can be played both indoors and outdoors. A Also, you can get started with basketball as a team of one or even two people.A aA Basketball is a lot more than just an exciting game to play. The sport offers a lot of physical, mental, and emotional health benefits for anyone who wants to become more active or develop a healthier lifestyle. In this sense, it is a great mind and full-body workout.A Playing basketball benefits your physical health in a number of different ways, including:A Helps you improve your heart health: Engaging in intense sports activities like basketbal