### HTML Page Parsing

In [7]:
import os
from io import BytesIO
from urllib.parse import urlparse
from urllib.request import url2pathname

import requests
from bs4 import BeautifulSoup
from urllib3.response import HTTPResponse

class FileAdapter(requests.adapters.HTTPAdapter):
    """Transport adapter that lets a ``requests`` session read ``file://`` URLs.

    Once mounted on a session for the ``file://`` prefix, every request is
    answered with the bytes of the local file the URL points to and a 200
    status code.
    """

    def send(self, request, *args, **kwargs):
        """Return the contents of the local file named by ``request.url``.

        Args:
            request: The prepared request; only its ``url`` is used.
            *args: Accepted for compatibility with ``HTTPAdapter.send``; ignored.
            **kwargs: Accepted for compatibility with ``HTTPAdapter.send``; ignored.

        Returns:
            The object produced by ``build_response`` for an ``HTTPResponse``
            that wraps an in-memory copy of the file and carries status 200.

        Raises:
            OSError: If the file cannot be opened (for example
                ``FileNotFoundError`` when the path does not exist).
        """
        # Use the URL's path component rather than slicing off a fixed seven
        # characters: this ignores an optional host ("file://localhost/..."),
        # drops any query/fragment, and decodes percent-escapes such as %20.
        local_path = url2pathname(urlparse(request.url).path)

        # Read inside a context manager so the file handle is always closed,
        # even if the caller never consumes the response body.
        with open(local_path, 'rb') as source:
            payload = BytesIO(source.read())

        resp = HTTPResponse(body=payload, status=200, preload_content=False)
        return self.build_response(request, resp)

# Route file:// URLs through FileAdapter so the saved page can be fetched
# with the same session API as a remote page.
session = requests.Session()
session.mount('file://', FileAdapter())

# NOTE(review): machine-specific absolute path; point this at your own copy
# of the saved page before running the notebook elsewhere.
url = 'file:///Users/mac/Project/utopiq/fif/data/training/first_group/data_science/html/news/index.html'
response = session.get(url)

# Parse the HTML code using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Print only the beginning of the document: the full page is large and would
# bury the rest of the notebook under a wall of markup.
PREVIEW_CHARS = 1000
print(soup.prettify()[:PREVIEW_CHARS])

<!DOCTYPE html>
<html lang="en-US" xmlns="https://www.w3.org/1999/xhtml">
 <head profile="https://gmpg.org/xfn/11">
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <title>
   News - KDnuggets
  </title>
  <link href="/wp-content/themes/kdn17/images/favicon.ico" rel="shortcut icon"/>
  <link href="/wp-content/themes/kdn17/style.css" media="screen" rel="stylesheet" type="text/css"/>
  <link href="/feed/" rel="alternate" title="KDnuggets: AI, Analytics, Data Science, Machine Learning Feed" type="application/rss+xml"/>
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,'script','dataLayer','GTM-PSQ38QTS');
  </script>
  <script dat

### Store into a DataFrame

In [3]:
import pandas as pd

# Extract the relevant information from the HTML code.
def _text_or_none(tag):
    """Return ``tag.get_text()``, or ``None`` when the lookup found no tag."""
    return tag.get_text() if tag is not None else None


latest_posts = []
for row in soup.select('table.thb ul li.li-has-thumb '):
    # Each lookup may come back empty if a list item lacks that element;
    # record None for the missing field instead of aborting the whole scrape
    # with an AttributeError.
    title = _text_or_none(row.find('a'))

    description = _text_or_none(row.find('font'))
    if description is not None:
        description = description.strip()

    author_box = row.find('div', class_='author-link')
    author = _text_or_none(author_box.find('a')) if author_box is not None else None

    latest_posts.append([title, description, author])

# Store the information in a pandas dataframe
columns = ['Title', 'Description', 'Author']
df = pd.DataFrame(latest_posts, columns=columns)
df


Unnamed: 0,Title,Description,Author
0,\n\n 10 Free Resources to Learn L...,Learn large language models with these free re...,\n Natassha Selvaraj\n ...
1,\n\n How To Use Docker Volumes fo...,Learn how to use Docker volumes to ensure data...,\n Bala Priya C\n
2,\n\n Top 5 Free Machine Learning ...,The article highlights five top free machine l...,\n Josep Ferrer\n
3,\n\n Pip Install YOU: A Beginner’...,Have you ever wanted to create your library in...,\n Kanwal Mehreen\n
4,\n\n How to Apply Padding to Arra...,"In this article, you will learn how to apply p...",\n Shittu Olumide\n
...,...,...,...
70,\n\n A Beginner’s Guide to PyTorc...,learn one of the most important Python package...,\n Cornellius Yudha Wijaya\n ...
71,\n\n Describing Data: A Statology...,This collection of tutorials on describing dat...,\n Matthew Mayo\n
72,\n\n Convert Bytes to String in P...,Strings are common built-in data types in Pyth...,\n Bala Priya C\n
73,\n\n 5 Tips for Managing Data Sci...,"Data scientists are still people, and these ti...",\n Cornellius Yudha Wijaya\n ...


### Clean Up

In [4]:
# Remove the line breaks picked up from the markup, then trim the spaces that
# surrounded them; dropping '\n' alone would leave padded values in the table.
for column in columns:
    df[column] = (
        df[column]
        .str.replace('\n', '', regex=False)  # literal match, not a regex
        .str.strip()
    )
df.head()

Unnamed: 0,Title,Description,Author
0,10 Free Resources to Learn LLMs ...,Learn large language models with these free re...,Natassha Selvaraj
1,How To Use Docker Volumes for Pe...,Learn how to use Docker volumes to ensure data...,Bala Priya C
2,Top 5 Free Machine Learning Cour...,The article highlights five top free machine l...,Josep Ferrer
3,Pip Install YOU: A Beginner’s Gu...,Have you ever wanted to create your library in...,Kanwal Mehreen
4,How to Apply Padding to Arrays w...,"In this article, you will learn how to apply p...",Shittu Olumide


### Export the data to a CSV file

In [5]:
# NOTE(review): the file name has no ".csv" extension; kept as-is so anything
# that already reads this path keeps working.
csv_path = 'data_from_scraping/kdnuggets_latest_posts'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)

### Validate

In [6]:
# Read the exported file back so the check covers what was actually written.
df_kdnuggets_latest_posts = pd.read_csv(csv_path)

# The round trip must preserve the table's dimensions and header.
assert df_kdnuggets_latest_posts.shape == df.shape, "row/column count changed on export"
assert list(df_kdnuggets_latest_posts.columns) == list(df.columns), "column names changed on export"

# Display the re-read frame (not the in-memory `df`) to inspect the file's contents.
df_kdnuggets_latest_posts.head()

Unnamed: 0,Title,Description,Author
0,10 Free Resources to Learn LLMs ...,Learn large language models with these free re...,Natassha Selvaraj
1,How To Use Docker Volumes for Pe...,Learn how to use Docker volumes to ensure data...,Bala Priya C
2,Top 5 Free Machine Learning Cour...,The article highlights five top free machine l...,Josep Ferrer
3,Pip Install YOU: A Beginner’s Gu...,Have you ever wanted to create your library in...,Kanwal Mehreen
4,How to Apply Padding to Arrays w...,"In this article, you will learn how to apply p...",Shittu Olumide
