In [None]:
import requests
from bs4 import BeautifulSoup
import csv

# Function to get and parse the HTML content
def get_parsed_html(url, timeout=10):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection blocks the notebook
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes using
        the built-in ``html.parser``. The HTTP status code is not checked.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')

# Function to extract the title of the Wikipedia page
def extract_title(soup):
    """Return the text of the element whose id is ``firstHeading``.

    Args:
        soup: Parsed page; must expose ``find(id=...)``.

    Returns:
        The heading text exactly as ``get_text()`` produces it (not stripped).
    """
    heading = soup.find(id="firstHeading")
    return heading.get_text()

# Function to extract the content of the Wikipedia page
def extract_content(soup):
    """Group the page's paragraphs under the heading that precedes them.

    Walks every ``h2``, ``h3`` and ``p`` element in the order ``find_all``
    returns them. Each heading starts a new section; each non-empty
    paragraph is appended to the most recent section.

    Args:
        soup: Parsed page; must expose ``find_all(list_of_tag_names)``.

    Returns:
        dict mapping stripped heading text to a list of stripped paragraph
        strings. Paragraphs that appear before the first heading, or under a
        heading whose text is empty, are skipped. When the same heading text
        occurs more than once, the paragraphs are accumulated under a single
        key instead of the later occurrence erasing the earlier ones.
    """
    content = {}
    current_heading = None  # text of the most recent h2/h3 seen

    for element in soup.find_all(['h2', 'h3', 'p']):
        if element.name in ('h2', 'h3'):
            current_heading = element.get_text().strip()
            # setdefault (not assignment) so a repeated heading text keeps the
            # paragraphs that were already collected under it.
            content.setdefault(current_heading, [])
        elif element.name == 'p' and current_heading:
            paragraph = element.get_text().strip()
            if paragraph:  # ignore whitespace-only paragraphs
                content[current_heading].append(paragraph)

    return content

# Function to extract all Wikipedia links from the page
def extract_wikipedia_links(soup):
    """Collect absolute URLs for the article links found on the page.

    An anchor qualifies when its ``href`` starts with ``/wiki/`` and contains
    no colon. Links are returned in the order ``find_all`` yields them, and
    duplicates are kept.

    Args:
        soup: Parsed page; must expose ``find_all('a', href=True)``.

    Returns:
        list of ``https://en.wikipedia.org/wiki/...`` strings.
    """
    targets = (anchor['href'] for anchor in soup.find_all('a', href=True))
    return [
        f"https://en.wikipedia.org{target}"
        for target in targets
        if target.startswith('/wiki/') and ':' not in target
    ]

# Main function to scrape the data and write to a CSV file
def scrape_wikipedia_to_csv(url, filename):
    """Scrape one page and store its title, sections and links in a CSV file.

    The file is laid out as three stacked sections separated by empty rows:
    a ``Title`` row, a ``Heading``/``Paragraphs`` header followed by one row
    per paragraph, and a ``Wikipedia Links`` header followed by one row per
    link.

    Args:
        url: Page to download.
        filename: Path of the CSV file to create (overwritten if it exists).
    """
    # Gather everything first so the file is only opened once scraping worked.
    soup = get_parsed_html(url)
    title = extract_title(soup)
    content = extract_content(soup)
    links = extract_wikipedia_links(soup)

    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        separator = []  # an empty row between sections

        # Section 1: page title
        writer.writerow(['Title', title])
        writer.writerow(separator)

        # Section 2: one row per (heading, paragraph) pair
        writer.writerow(['Heading', 'Paragraphs'])
        writer.writerows(
            [heading, paragraph]
            for heading, paragraphs in content.items()
            for paragraph in paragraphs
        )
        writer.writerow(separator)

        # Section 3: one link per row
        writer.writerow(['Wikipedia Links'])
        writer.writerows([link] for link in links)

# Function to display the contents of the CSV file
def display_csv_content(filename):
    """Print every row of a CSV file as a list of strings.

    Args:
        filename: Path of a UTF-8 encoded CSV file.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise a bare '\r' inside a quoted field is
    # translated to '\n' while reading.
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        for row in csv.reader(file):
            print(row)

# Demo: scrape the "Web scraping" Wikipedia article into a CSV file,
# then print the file back row by row.
scrape_wikipedia_to_csv(
    url="https://en.wikipedia.org/wiki/Web_scraping",
    filename="wikipedia_data.csv",
)
display_csv_content(filename="wikipedia_data.csv")


In [None]:
import requests #sending https requests
from bs4 import BeautifulSoup

#function to parse html of the web page
def getpage(link, timeout):
  """Fetch ``link`` and return it parsed, retrying on connection problems.

  Args:
    link: URL of the page to download.
    timeout: Retry budget in minutes. It is converted to ``timeout * 60``
      attempts, and failed attempts are spaced one second apart.

  Returns:
    A ``BeautifulSoup`` object for the page, or ``None`` once every attempt
    has failed with a connection error or a timeout.
  """
  import time  # local import: this cell only imports requests and bs4

  request_timeout = 10  # seconds one request may wait before it counts as failed
  attempts = 1
  maxattempts = timeout * 60  # minutes -> number of one-second-spaced attempts
  while True:
    try:
      page = requests.get(link, timeout=request_timeout)
      soup = BeautifulSoup(page.content, 'html.parser')
      print("sucessfully connected")
      return soup
    # requests.Timeout is the base class of ConnectTimeout and ReadTimeout.
    # The name requests.ConnectTimeOut used previously does not exist, so
    # any failure raised AttributeError instead of being retried.
    except (requests.ConnectionError, requests.Timeout):
      print("error")
      attempts += 1
      if attempts > maxattempts:
        # retry budget exhausted: report and give up
        print("error")
        return None
      # wait before retrying so the attempt count matches the budget in seconds
      time.sleep(1)
# Page to fetch and retry budget (in minutes) for the demo call below.
# NOTE(review): the URL ends with a trailing underscore - confirm the intended article.
link = "https://en.wikipedia.org/wiki/Python_"
mins = 1
getpage(link=link, timeout=mins)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [None]:
# Download the DataTables demo page and parse it.
link = 'https://datatables.net/examples/data_sources/dom.html'
req = requests.get(link)
soup = BeautifulSoup(req.content, 'html.parser')

# Locate the table through its id attribute.
table = soup.find('table', id='example')

# Collect one list of stripped cell texts per body row.
data = []
for row in table.tbody.find_all('tr'):
  cols = [col.text.strip() for col in row.find_all('td')]
  data.append(cols)

# Build the frame and show it ordered by the Name column.
df = pd.DataFrame(
  data,
  columns=['Name', 'Position', 'Office', 'Age', 'Start date', 'Salary'],
)
sort = df.sort_values('Name')
print(sort)

                   Name                       Position         Office Age  \
4            Airi Satou                     Accountant          Tokyo  33   
24       Angelica Ramos  Chief Executive Officer (CEO)         London  47   
2            Ashton Cox        Junior Technical Author  San Francisco  66   
18        Bradley Greer              Software Engineer         London  41   
27       Brenden Wagner              Software Engineer  San Francisco  28   
5    Brielle Williamson         Integration Specialist       New York  61   
42           Bruno Nash              Software Engineer         London  38   
22         Caesar Vance              Pre-Sales Support       New York  21   
50         Cara Stevens                Sales Assistant       New York  46   
3          Cedric Kelly    Senior Javascript Developer      Edinburgh  22   
12      Charde Marshall              Regional Director  San Francisco  36   
8         Colleen Hurst           Javascript Developer  San Francisco  39   

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_parsed_html(url, timeout=10):
  """Download ``url`` and return its body parsed with BeautifulSoup.

  Args:
    url: Address of the page to fetch.
    timeout: Seconds ``requests`` waits for the server before giving up.
      Without a timeout a stalled connection blocks the notebook
      indefinitely.

  Returns:
    A ``BeautifulSoup`` object built from the raw response bytes using the
    built-in ``html.parser``. The HTTP status code is not checked.

  Raises:
    requests.exceptions.RequestException: if the request fails or the
      timeout elapses.
  """
  response = requests.get(url, timeout=timeout)
  return BeautifulSoup(response.content, 'html.parser')

#Function to extract a specific table by its index
def extract_table(soup, table_index):
  """Return the ``table_index``-th ``<table>`` element of a parsed page.

  Args:
    soup: Parsed page; must expose ``find_all('table')``.
    table_index: Position of the wanted table. Negative values count from
      the end, as with a Python list.

  Returns:
    The selected table element, or ``None`` (after printing a message) when
    the index falls outside the tables found. This includes negative indexes
    that reach before the first table, which previously raised IndexError.
  """
  tables = soup.find_all('table')  # every <table> in document order
  if -len(tables) <= table_index < len(tables):
    return tables[table_index]
  print(f"table index {table_index} is out of range")
  return None

#Function to convert a beautifulsoup table to df
def _colspan(cell):
  """Return the cell's ``colspan`` attribute as a positive int (default 1)."""
  try:
    span = int(cell.get('colspan', 1))
  except (TypeError, ValueError):
    return 1
  return max(span, 1)


def table_to_df(table):
  """Convert a BeautifulSoup ``<table>`` element into a pandas DataFrame.

  Column names come from the first row that consists only of ``<th>`` cells;
  a header cell with ``colspan=n`` contributes its text ``n`` times so the
  names line up with the data cells below it. Rows made up solely of
  ``<th>`` cells are treated as header rows and are not emitted as data
  (only the first one supplies names; ``rowspan`` is not interpreted).
  Every other row becomes one data row of stripped cell texts.

  Compared with collecting every ``<th>`` in the table, this keeps row-header
  cells and sub-header rows out of the column names, stops the header row
  from reappearing as the first data row, and no longer truncates every row
  to zero cells when the table has no header at all.

  Args:
    table: Table element; must expose ``find_all('tr')``, and each row must
      expose ``find_all(['td', 'th'])`` yielding cells with ``name``,
      ``get_text()`` and ``get(attr, default)``.

  Returns:
    A DataFrame with one row per data row. When headers exist, rows are
    padded with ``None`` or cut so they match the header count; without
    headers the rows are passed to pandas unchanged.
  """
  headers = []
  rows = []

  for tr in table.find_all('tr'):
    cells = tr.find_all(['td', 'th'])
    texts = [cell.get_text().strip() for cell in cells]

    # A row made up solely of <th> cells is a header row, not data.
    if cells and all(cell.name == 'th' for cell in cells):
      if not headers:
        for cell, text in zip(cells, texts):
          headers.extend([text] * _colspan(cell))
      continue

    rows.append(texts)

  if not headers:
    return pd.DataFrame(rows)

  # Make every row exactly as wide as the header: cut extras, pad with None.
  width = len(headers)
  fitted = [row[:width] + [None] * (width - len(row)) for row in rows]
  return pd.DataFrame(fitted, columns=headers)

#main function to scrape and display a specific table
def display_specific_table(url, table_index):
  """Download a page and print one of its tables as a DataFrame.

  Args:
    url: Page to download.
    table_index: Position of the table to show among the page's tables.

  Nothing is printed by this function when ``extract_table`` returns a falsy
  value (it reports the out-of-range index itself).
  """
  table = extract_table(get_parsed_html(url), table_index)
  if not table:
    return
  print(table_to_df(table))

# Demo: print the table at index 2 of the nominal-GDP list article.
url = "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
table_index = 2
display_specific_table(url=url, table_index=table_index)

     Country/Territory   IMF[1][13]  World Bank[14]  United Nations[15]  \
0    Country/Territory   IMF[1][13]  World Bank[14]  United Nations[15]   
1             Forecast         Year        Estimate                Year   
2                World  109,529,216            2024         105,435,540   
3        United States   28,781,083            2024          27,360,935   
4                China   18,532,633       [n 1]2024          17,794,782   
..                 ...          ...             ...                 ...   
207           Kiribati          311            2024                 279   
208              Palau          308            2024                 263   
209   Marshall Islands          305            2024                 284   
210              Nauru          161            2024                 154   
211             Tuvalu           66            2024                  62   

      Forecast         Year   Estimate  Year Estimate  Year  
0         None         None       Non