In [12]:
import matplotlib.pyplot as plt
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
#import datetime as dt
import pytz
from pprint import pprint
from bs4 import BeautifulSoup


The file is a Jupyter Notebook that contains multiple cells with Python code. Here's a brief explanation of the content in each cell:

1. **Cell 0**: 
    - Imports necessary libraries such as `matplotlib.pyplot`, `requests`, `pandas`, `numpy`, `datetime`, `pytz`, `pprint`, and `BeautifulSoup`.

2. **Cell 2**:
    - Defines headers to mimic a browser request.
    - Defines a URL to fetch data from.
    - Defines a function `get_table` to fetch and parse the HTML content from the URL.
    - Calls `get_table` to get the HTML content and tables.
    - Defines a function `convert` to extract data from the HTML table and store it in a list.
    - Defines a function `make_df` to create a DataFrame from the extracted data.
    - Prepares data by calling `convert` for each table.
    - Creates a DataFrame `df` from the data.
    - Groups the DataFrame by the column 'Năm'.
    - Defines functions `display_all`, `display_by_name`, and `display_by_year` to display the grouped data.

3. **Cell 3**:
    - Calls `display_by_name` function to display data for the year 2024.

4. **Cell 4**:
    - Calls `display_by_year` function to display data for the year 2025.

5. **Cell 5**:
    - Contains a markdown link to a pandas groupby tutorial.

6. **Cell 6**:
    - Imports `pandas` and reads a CSV file into a DataFrame `df`.
    - Displays the first few rows of the DataFrame.

7. **Cell 7**:
    - Displays the number of unique values in the 'Product_Category' column of the DataFrame.

8. **Cell 8**:
    - Groups the DataFrame by 'Product_Category' and displays the number of groups.

9. **Cell 9**:
    - Displays the size of each group in the 'Product_Category' column.

10. **Cell 10**:
     - Counts the number of non-NA/null entries for each column in each group.

11. **Cell 11**:
     - Displays the first row of each group in the 'Product_Category' column.

12. **Cell 12**:
     - Displays the last row of each group in the 'Product_Category' column.

13. **Cell 13**:
     - Displays the row at position 3 of each 'Product_Category' group (the fourth row, because `nth` counts from 0).

14. **Cell 14**:
     - Retrieves and displays the group 'Healthcare' from the grouped DataFrame.

15. **Cell 15**:
     - Iterates over each group in the grouped DataFrame and prints the group name and its contents.

16. **Cell 16**:
     - Groups the DataFrame by 'Product_Category'.
     - Selects specific columns and calculates the mean for each group.

17. **Cell 17**:
     - Aggregates the 'Quantity' column by calculating the min, max, sum, and mean for each group.

18. **Cell 18**:
     - Defines a dictionary of functions to apply to each column and aggregates the DataFrame accordingly.

19. **Cell 19**:
     - Provides descriptive statistics for the 'Quantity' column in each group.

This notebook demonstrates data extraction from a webpage, data manipulation using pandas, and various groupby operations to analyze the data.

In [14]:

# Request headers sent with every download; the User-Agent mimics a desktop browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

# Page to scrape. Previously used address, kept for reference:
#   https://www.informatik.uni-leipzig.de/~duc/amlich/DuLieu/Sun-Moon-2020.html
url = 'http://jean.david.free.fr/amlich/DuLieu/index_files/Sun-Moon-2020.html'

def get_table(url, timeout=30):
    """Download a page and return its parsed document plus every <table> in it.

    Args:
        url: Address of the HTML page to fetch.
        timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
        A ``(soup, tables)`` tuple: the parsed document and the result of
        ``soup.find_all('table')``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # `headers` is the notebook-level dict carrying the browser-like User-Agent.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of parsing it as if it were data.
    response.raise_for_status()
    # Parse the raw bytes (.content) rather than .text: per the original
    # author's note, .text renders this page badly while the bytes come out
    # as proper UTF-8.
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup, soup.find_all('table')


In [16]:
# get_table returns (parsed document, list of tables): `text` receives the parsed document.
text, result = get_table(url)
result  # a single table covers 20 years; it has to be split into several tables, one per year


[<table border="1">
 <tr><td class="head" colspan="3"><b>2020</b></td></tr>
 <tr><td><b>Ngày giờ Sóc </b></td><td><b>Ngày giờ</b></td><td><b>Tiết khí</b></td></tr>
 <tr><td>  </td><td>06/01 04:30</td><td>Tiểu hàn</td></tr>
 <tr><td>  </td><td>20/01 21:54</td><td>Đại hàn</td></tr>
 <tr><td>25/01 04:42</td><td>04/02 16:03</td><td>Lập xuân</td></tr>
 <tr><td>  </td><td>19/02 11:57</td><td>Vũ Thủy</td></tr>
 <tr><td>23/02 22:32</td><td>05/03 09:57</td><td>Kinh trập</td></tr>
 <tr><td>  </td><td class="saison">20/03 10:49</td><td>Xuân phân</td></tr>
 <tr><td>24/03 16:28</td><td>04/04 14:38</td><td>Thanh minh</td></tr>
 <tr><td>  </td><td>19/04 21:45</td><td>Cốc vũ</td></tr>
 <tr><td>23/04 09:26</td><td>05/05 07:51</td><td>Lập hạ</td></tr>
 <tr><td>  </td><td>20/05 20:49</td><td>Tiểu mãn</td></tr>
 <tr><td>23/05 00:39</td><td>05/06 11:58</td><td>Mang chủng</td></tr>
 <tr><td>  </td><td class="saison">21/06 04:43</td><td>Hạ chí</td></tr>
 <tr><td>21/06 13:41</td><td>06/07 22:14</td><td>Tiểu t

In [17]:
def extract_tables(html_data, marker='<tr><td class="head" colspan="3">'):
  """
  Splits HTML text into chunks, each starting at one occurrence of `marker`.

  Every chunk runs from one marker up to (not including) the next marker; the
  last chunk runs to the end of the text. Anything before the first marker is
  dropped. The chunks are plain substrings: no tags are added or closed.

  Args:
    html_data: The HTML data as a string.
    marker: The exact substring that opens a chunk. Defaults to the year
      banner row opening, '<tr><td class="head" colspan="3">'.

  Returns:
    A list of strings, one per marker occurrence (empty when the marker does
    not occur).

  Raises:
    ValueError: If `marker` is empty (every position would match).
  """
  if not marker:
    raise ValueError("marker must be a non-empty string")

  tables = []
  start_index = html_data.find(marker)
  while start_index != -1:
    # Search from start_index + 1 so the current marker is not found again.
    end_index = html_data.find(marker, start_index + 1)
    if end_index == -1:
      # No further marker: the rest of the text is the last chunk.
      tables.append(html_data[start_index:])
      break
    tables.append(html_data[start_index:end_index])
    start_index = end_index

  return tables

In [18]:
# split into one chunk per year
tables2 = extract_tables(str(result[0]))
tables2  # chunks come without a <table> tag

['<tr><td class="head" colspan="3"><b>2020</b></td></tr>\n<tr><td><b>Ngày giờ Sóc\xa0</b></td><td><b>Ngày giờ</b></td><td><b>Tiết khí</b></td></tr>\n<tr><td>\xa0\xa0</td><td>06/01 04:30</td><td>Tiểu hàn</td></tr>\n<tr><td>\xa0\xa0</td><td>20/01 21:54</td><td>Đại hàn</td></tr>\n<tr><td>25/01 04:42</td><td>04/02 16:03</td><td>Lập xuân</td></tr>\n<tr><td>\xa0\xa0</td><td>19/02 11:57</td><td>Vũ Thủy</td></tr>\n<tr><td>23/02 22:32</td><td>05/03 09:57</td><td>Kinh trập</td></tr>\n<tr><td>\xa0\xa0</td><td class="saison">20/03 10:49</td><td>Xuân phân</td></tr>\n<tr><td>24/03 16:28</td><td>04/04 14:38</td><td>Thanh minh</td></tr>\n<tr><td>\xa0\xa0</td><td>19/04 21:45</td><td>Cốc vũ</td></tr>\n<tr><td>23/04 09:26</td><td>05/05 07:51</td><td>Lập hạ</td></tr>\n<tr><td>\xa0\xa0</td><td>20/05 20:49</td><td>Tiểu mãn</td></tr>\n<tr><td>23/05 00:39</td><td>05/06 11:58</td><td>Mang chủng</td></tr>\n<tr><td>\xa0\xa0</td><td class="saison">21/06 04:43</td><td>Hạ chí</td></tr>\n<tr><td>21/06 13:41</td><td>

In [19]:
print(tables2[0])

<tr><td class="head" colspan="3"><b>2020</b></td></tr>
<tr><td><b>Ngày giờ Sóc </b></td><td><b>Ngày giờ</b></td><td><b>Tiết khí</b></td></tr>
<tr><td>  </td><td>06/01 04:30</td><td>Tiểu hàn</td></tr>
<tr><td>  </td><td>20/01 21:54</td><td>Đại hàn</td></tr>
<tr><td>25/01 04:42</td><td>04/02 16:03</td><td>Lập xuân</td></tr>
<tr><td>  </td><td>19/02 11:57</td><td>Vũ Thủy</td></tr>
<tr><td>23/02 22:32</td><td>05/03 09:57</td><td>Kinh trập</td></tr>
<tr><td>  </td><td class="saison">20/03 10:49</td><td>Xuân phân</td></tr>
<tr><td>24/03 16:28</td><td>04/04 14:38</td><td>Thanh minh</td></tr>
<tr><td>  </td><td>19/04 21:45</td><td>Cốc vũ</td></tr>
<tr><td>23/04 09:26</td><td>05/05 07:51</td><td>Lập hạ</td></tr>
<tr><td>  </td><td>20/05 20:49</td><td>Tiểu mãn</td></tr>
<tr><td>23/05 00:39</td><td>05/06 11:58</td><td>Mang chủng</td></tr>
<tr><td>  </td><td class="saison">21/06 04:43</td><td>Hạ chí</td></tr>
<tr><td>21/06 13:41</td><td>06/07 22:14</td><td>Tiểu thử</td></tr>
<tr><td>21/07 00:33</t

In [20]:
soup  = BeautifulSoup(tables2[0], 'html.parser')
soup

<tr><td class="head" colspan="3"><b>2020</b></td></tr>
<tr><td><b>Ngày giờ Sóc </b></td><td><b>Ngày giờ</b></td><td><b>Tiết khí</b></td></tr>
<tr><td>  </td><td>06/01 04:30</td><td>Tiểu hàn</td></tr>
<tr><td>  </td><td>20/01 21:54</td><td>Đại hàn</td></tr>
<tr><td>25/01 04:42</td><td>04/02 16:03</td><td>Lập xuân</td></tr>
<tr><td>  </td><td>19/02 11:57</td><td>Vũ Thủy</td></tr>
<tr><td>23/02 22:32</td><td>05/03 09:57</td><td>Kinh trập</td></tr>
<tr><td>  </td><td class="saison">20/03 10:49</td><td>Xuân phân</td></tr>
<tr><td>24/03 16:28</td><td>04/04 14:38</td><td>Thanh minh</td></tr>
<tr><td>  </td><td>19/04 21:45</td><td>Cốc vũ</td></tr>
<tr><td>23/04 09:26</td><td>05/05 07:51</td><td>Lập hạ</td></tr>
<tr><td>  </td><td>20/05 20:49</td><td>Tiểu mãn</td></tr>
<tr><td>23/05 00:39</td><td>05/06 11:58</td><td>Mang chủng</td></tr>
<tr><td>  </td><td class="saison">21/06 04:43</td><td>Hạ chí</td></tr>
<tr><td>21/06 13:41</td><td>06/07 22:14</td><td>Tiểu thử</td></tr>
<tr><td>21/07 00:33</t

In [21]:
# add <table></table> tags: the joiner closes one per-year table and opens the next
# NOTE(review): no closing tag is appended after the last chunk — presumably that
# chunk still ends with the source table's own </table>; confirm.
html_data = '<table>' + '</table><table>'.join(tables2) # string to be parsed with bs4
soup  = BeautifulSoup(html_data, 'html.parser')
tables = soup.find_all('table')

In [22]:
def convert(html_content, data):  # rows come out with the year as first column
    """Append the data rows of one per-year table to ``data``, year first.

    Args:
        html_content: Parsed table; must offer ``find`` and ``find_all``.
        data: List receiving one ``[year, cell, cell, ...]`` list per data
            row. It is modified in place.

    Returns:
        The same ``data`` list that was passed in.
    """
    # The year is the text of the <td class="head"> cell; "" when there is none.
    year_cell = html_content.find('td', class_='head')
    year_label = year_cell.get_text(strip=True) if year_cell else ""

    # The first two <tr> are header rows and carry no data.
    for tr in html_content.find_all('tr')[2:]:
        data.append([year_label] + [td.get_text(strip=True) for td in tr.find_all('td')])
    return data

def convert2(html_content,data):  # rows without a year column; years returned separately
    """Append the data rows of one per-year table to ``data``, without the year.

    Args:
        html_content: Parsed table; must offer ``find`` and ``find_all``.
        data: List receiving one list of cell texts per data row. It is
            modified in place.

    Returns:
        ``(data, year)`` where ``data`` is the list passed in and ``year`` is a
        new list holding the table's year label once per appended row.
    """
    # The year is the text of the <td class="head"> cell; "" when there is none.
    year_cell = html_content.find('td', class_='head')
    year_label = year_cell.get_text(strip=True) if year_cell else ""

    # The first two <tr> are header rows and carry no data.
    body_rows = html_content.find_all('tr')[2:]
    for tr in body_rows:
        data.append([td.get_text(strip=True) for td in tr.find_all('td')])

    return data, [year_label] * len(body_rows)

def make_df(rows=None):
    """Build the four-column DataFrame (year plus the three table columns).

    Args:
        rows: List of four-item lists in the order year, 'Ngày giờ Sóc',
            'Ngày giờ', 'Tiết khí' (the shape ``convert`` appends). When
            omitted, the notebook-level ``data`` list is used, so the original
            ``make_df()`` call keeps working.

    Returns:
        A pandas DataFrame with columns 'Năm', 'Ngày giờ Sóc', 'Ngày giờ',
        'Tiết khí'.
    """
    if rows is None:
        # Backward-compatible fallback. `data` is rebound further down the
        # notebook, so pass the rows explicitly when cells may run out of order.
        rows = data
    return pd.DataFrame(rows, columns=['Năm', 'Ngày giờ Sóc', 'Ngày giờ', 'Tiết khí'])


In [23]:
# Walk the per-year tables with convert2: the row cells accumulate in data2
# (no year column), and each table's list of year labels is appended to `years`.
data2 = []
years = []

for table in tables[:]:
    # convert2 returns (data2, labels); only the labels are needed here
    _, year = convert2(table,data2)
    years.append(year)

In [24]:
years

[['2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020',
  '2020'],
 ['2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021',
  '2021'],
 ['2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022',
  '2022'],
 ['2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023',
  '2023'],
 ['2024',
  '2024',
  '202

In [25]:
# Collect the rows of every per-year table, year first, into `data`
# (convert appends to the list it is given).
data = []

for table in tables[:]:
    convert(table,data)


In [26]:
# create dataframe (make_df() called without arguments reads the notebook-level `data` list)
df = make_df()
# Display the DataFrame
#print(df)
df

Unnamed: 0,Năm,Ngày giờ Sóc,Ngày giờ,Tiết khí
0,2020,,06/01 04:30,Tiểu hàn
1,2020,,20/01 21:54,Đại hàn
2,2020,25/01 04:42,04/02 16:03,Lập xuân
3,2020,,19/02 11:57,Vũ Thủy
4,2020,23/02 22:32,05/03 09:57,Kinh trập
...,...,...,...,...
482,2039,18/10 02:08,23/10 20:24,Sương giáng
483,2039,,07/11 20:42,Lập đông
484,2039,16/11 12:45,22/11 18:11,Tiểu tuyết
485,2039,,07/12 13:44,Đại tuyết


In [27]:

# Group the DataFrame by column 'Năm'.
# The display_* helpers read this notebook-level object, so they keep showing
# `df` as it was when this cell ran, even if `df` is rebound afterwards.
grouped_df = df.groupby('Năm')

def display_all():
    """Print every group of ``grouped_df``.

    For each group: a "Năm: <label>" line, the group's three data columns as
    text without the index, then an empty line.
    """
    shown_columns = ['Ngày giờ Sóc', 'Ngày giờ', 'Tiết khí']
    for label, rows in grouped_df:
        print(f"Năm: {label}")
        # end="\n\n" yields the table followed by one blank separator line
        print(rows[shown_columns].to_string(index=False), end="\n\n")

def display_by_name(year):
    """Print the table of one year, found by scanning the groups of ``grouped_df``.

    Args:
        year: Year to show; it is compared through ``str(year)``, so 2024 and
            '2024' select the same group.

    Prints a "Năm: <label>" line followed by the group's three data columns
    without the index. Nothing is printed when no group has that label.
    """
    wanted = str(year)
    for label, rows in grouped_df:
        if label != wanted:
            continue
        print(f"Năm: {label}")
        print(rows[['Ngày giờ Sóc', 'Ngày giờ', 'Tiết khí']].to_string(index=False))

def display_by_year(year):
    """Print the table of one year, looked up directly in ``grouped_df``.

    Args:
        year: Year to show, as an int or a string; it is normalised once with
            ``str(year)``.

    Prints a "Năm: <year>" line followed by the group's rows without the
    'Năm' column and without the index.

    Raises:
        KeyError: If ``grouped_df`` has no group for that year (the heading
            line has already been printed by then).
    """
    label = str(year)
    print(f"Năm: {label}")
    print(grouped_df.get_group(label).drop(columns=['Năm']).to_string(index=False))



In [28]:
def NM_count(year, frame=None):
    """Count the non-empty 'Ngày giờ Sóc' (new moon) cells recorded for one year.

    Args:
        year: Year to look up, as an int or a string; matched against column
            'Năm' through ``str(year)``.
        frame: DataFrame holding the 'Năm' and 'Ngày giờ Sóc' columns. When
            omitted, the notebook-level ``df`` is used, so ``NM_count(year)``
            keeps working. Pass it explicitly once ``df`` may have been
            rebound to another table.

    Returns:
        The string '<year> : <count> NM'.

    Raises:
        KeyError: If no row of ``frame`` belongs to ``year``.
    """
    if frame is None:
        frame = df

    label = str(year)
    in_year = frame['Năm'] == label
    if not in_year.any():
        raise KeyError(label)

    # A new moon is present when the cell is neither missing nor the empty string.
    new_moons = frame.loc[in_year, 'Ngày giờ Sóc']
    newmoon_counts = int((new_moons.notna() & (new_moons != '')).sum())
    return f'{year} : {newmoon_counts} NM'

In [29]:
year = 2024
display_by_name(year)

NM_count(year)

Năm: 2024
Ngày giờ Sóc    Ngày giờ    Tiết khí
             06/01 03:49    Tiểu hàn
 11/01 18:57 20/01 21:07     Đại hàn
             04/02 15:27    Lập xuân
 10/02 05:59 19/02 11:13     Vũ Thủy
             05/03 09:22   Kinh trập
 10/03 16:00 20/03 10:06   Xuân phân
             04/04 14:02  Thanh minh
 09/04 01:21 19/04 20:59      Cốc vũ
             05/05 07:10      Lập hạ
 08/05 10:22 20/05 19:59    Tiểu mãn
             05/06 11:10  Mang chủng
 06/06 19:37 21/06 03:51      Hạ chí
             06/07 21:20    Tiểu thử
 06/07 05:57 22/07 14:44     Đại thử
 04/08 18:13 07/08 07:09     Lập thu
             22/08 21:55      Xử thử
 03/09 08:55 07/09 10:11     Bạch lộ
             22/09 19:43    Thu phân
 03/10 01:49 08/10 02:00      Hàn lộ
             23/10 05:14 Sương giáng
 01/11 19:47 07/11 05:20    Lập đông
             22/11 02:56  Tiểu tuyết
 01/12 13:21 06/12 22:17   Đại tuyết
             21/12 16:20    Đông chí
 31/12 05:26                        


'2024 : 13 NM'

In [30]:
year = '2025'

display_by_year(year)
NM_count(year)

Năm: 2025
Ngày giờ Sóc    Ngày giờ    Tiết khí
             05/01 09:32    Tiểu hàn
             20/01 03:00     Đại hàn
 29/01 19:36 03/02 21:10    Lập xuân
             18/02 17:06     Vũ Thủy
 28/02 07:44 05/03 15:07   Kinh trập
             20/03 16:01   Xuân phân
 29/03 17:57 04/04 19:48  Thanh minh
             20/04 02:56      Cốc vũ
 28/04 02:31 05/05 12:57      Lập hạ
             21/05 01:54    Tiểu mãn
 27/05 10:02 05/06 16:56  Mang chủng
             21/06 09:42      Hạ chí
 25/06 17:31 07/07 03:05    Tiểu thử
             22/07 20:29     Đại thử
 25/07 02:11 07/08 12:51     Lập thu
             23/08 03:33      Xử thử
 23/08 13:06 07/09 15:52     Bạch lộ
 22/09 02:54 23/09 01:19    Thu phân
             08/10 07:41      Hàn lộ
 21/10 19:25 23/10 10:51 Sương giáng
             07/11 11:04    Lập đông
 20/11 13:47 22/11 08:35  Tiểu tuyết
             07/12 04:04   Đại tuyết
 20/12 08:43 21/12 22:03    Đông chí


'2025 : 12 NM'

# méthode 2

In [31]:
import pandas as pd

def create_dataframe_from_solar_terms(data):
  """
  Wraps rows of solar-term data in a pandas DataFrame.

  Args:
    data: A list of lists; each inner list holds the three cells of one table
      row, in column order.

  Returns:
    A pandas DataFrame with the columns "Ngày giờ Sóc", "Ngày giờ" and
    "Tiết khí", in that order. No year column is added and no grouping is
    performed here.
  """
  column_names = ["Ngày giờ Sóc", "Ngày giờ", "Tiết khí"]
  return pd.DataFrame(data, columns=column_names)

def add_header():
  """Apparently meant to derive a 'Year' column and return one row per year.

  NOTE(review): as written this cannot run. ``df`` is assigned inside the
  function (the filter step below), which makes it a local name for the whole
  body, so the first statement reads it before assignment and raises
  UnboundLocalError. No call to this function is visible in the notebook.
  """
  # Extract Year from the "head" rows
  # NOTE(review): str.contains('head') produces booleans; once cast to str they
  # read 'True'/'False', so the 4-digit pattern below presumably never matches
  # — confirm the intended input.
  df['Year'] = df['Tiết khí'].str.contains('head').ffill().astype(str)
  df['Year'] = df['Year'].str.extract(r'(\d{4})').fillna(method='ffill')

  # Filter out "head" rows
  # (this assignment is what makes `df` local to the function)
  df = df[df['Tiết khí'] != 'head']

  # Group by Year and extract the first occurrence of each group
  grouped_df = df.groupby('Year').first().reset_index() 

  return grouped_df


In [32]:

# Method 2 input: the year-less rows collected in data2 by convert2.
# NOTE: this rebinds the notebook-level name `data`, which so far held the
# year-prefixed rows read by make_df(); after this cell the rows have three
# cells and no longer match make_df()'s four column names.
# The bare string below is not used; it only illustrates the list-of-lists shape.

data = data2  # 2020-2039
'''[
    ["", "06/01 04:30", "Tiểu hàn"],
    ["", "20/01 21:54", "Đại hàn"],
    ["25/01 04:42", "04/02 16:03", "Lập xuân"], 
    # ... (rest of the data)
]'''
data

[['', '06/01 04:30', 'Tiểu hàn'],
 ['', '20/01 21:54', 'Đại hàn'],
 ['25/01 04:42', '04/02 16:03', 'Lập xuân'],
 ['', '19/02 11:57', 'Vũ Thủy'],
 ['23/02 22:32', '05/03 09:57', 'Kinh trập'],
 ['', '20/03 10:49', 'Xuân phân'],
 ['24/03 16:28', '04/04 14:38', 'Thanh minh'],
 ['', '19/04 21:45', 'Cốc vũ'],
 ['23/04 09:26', '05/05 07:51', 'Lập hạ'],
 ['', '20/05 20:49', 'Tiểu mãn'],
 ['23/05 00:39', '05/06 11:58', 'Mang chủng'],
 ['', '21/06 04:43', 'Hạ chí'],
 ['21/06 13:41', '06/07 22:14', 'Tiểu thử'],
 ['21/07 00:33', '22/07 15:37', 'Đại thử'],
 ['', '07/08 08:06', 'Lập thu'],
 ['19/08 09:41', '22/08 22:45', 'Xử thử'],
 ['', '07/09 11:08', 'Bạch lộ'],
 ['17/09 18:00', '22/09 20:30', 'Thu phân'],
 ['', '08/10 02:55', 'Hàn lộ'],
 ['17/10 02:31', '23/10 05:59', 'Sương giáng'],
 ['', '07/11 06:14', 'Lập đông'],
 ['15/11 12:07', '22/11 03:40', 'Tiểu tuyết'],
 ['', '06/12 23:09', 'Đại tuyết'],
 ['14/12 23:16', '21/12 17:02', 'Đông chí'],
 ['', '05/01 10:23', 'Tiểu hàn'],
 ['13/01 12:00', '20/

In [33]:
df = create_dataframe_from_solar_terms(data2)
print(df) 

    Ngày giờ Sóc     Ngày giờ     Tiết khí
0                 06/01 04:30     Tiểu hàn
1                 20/01 21:54      Đại hàn
2    25/01 04:42  04/02 16:03     Lập xuân
3                 19/02 11:57      Vũ Thủy
4    23/02 22:32  05/03 09:57    Kinh trập
..           ...          ...          ...
482  18/10 02:08  23/10 20:24  Sương giáng
483               07/11 20:42     Lập đông
484  16/11 12:45  22/11 18:11   Tiểu tuyết
485               07/12 13:44    Đại tuyết
486  15/12 23:31  22/12 07:40     Đông chí

[487 rows x 3 columns]


In [34]:
# `years` holds one list of labels per table; flatten it to one label per row
# and attach it as column 'Năm' (adds the column to the existing df in place).
df['Năm'] = [year for sublist in years for year in sublist]
df

Unnamed: 0,Ngày giờ Sóc,Ngày giờ,Tiết khí,Năm
0,,06/01 04:30,Tiểu hàn,2020
1,,20/01 21:54,Đại hàn,2020
2,25/01 04:42,04/02 16:03,Lập xuân,2020
3,,19/02 11:57,Vũ Thủy,2020
4,23/02 22:32,05/03 09:57,Kinh trập,2020
...,...,...,...,...
482,18/10 02:08,23/10 20:24,Sương giáng,2039
483,,07/11 20:42,Lập đông,2039
484,16/11 12:45,22/11 18:11,Tiểu tuyết,2039
485,,07/12 13:44,Đại tuyết,2039


In [35]:
# Assumes df is already defined and contains the 'Năm' (year) column
grouped_by_year = df.groupby('Năm')
target = '2025'
# Print only the group whose label equals `target`: its label and row count,
# then its rows without the 'Năm' column, then a blank line
for name, group in grouped_by_year:
    if name == target:
        print(f"Year: {name} :", grouped_by_year.size()[name])
        print(group.drop(columns=['Năm']).to_string(index=False))
        #print(group.tail().drop(columns=['Năm']).to_string(index=False))
        print()

Year: 2025 : 24
Ngày giờ Sóc    Ngày giờ    Tiết khí
             05/01 09:32    Tiểu hàn
             20/01 03:00     Đại hàn
 29/01 19:36 03/02 21:10    Lập xuân
             18/02 17:06     Vũ Thủy
 28/02 07:44 05/03 15:07   Kinh trập
             20/03 16:01   Xuân phân
 29/03 17:57 04/04 19:48  Thanh minh
             20/04 02:56      Cốc vũ
 28/04 02:31 05/05 12:57      Lập hạ
             21/05 01:54    Tiểu mãn
 27/05 10:02 05/06 16:56  Mang chủng
             21/06 09:42      Hạ chí
 25/06 17:31 07/07 03:05    Tiểu thử
             22/07 20:29     Đại thử
 25/07 02:11 07/08 12:51     Lập thu
             23/08 03:33      Xử thử
 23/08 13:06 07/09 15:52     Bạch lộ
 22/09 02:54 23/09 01:19    Thu phân
             08/10 07:41      Hàn lộ
 21/10 19:25 23/10 10:51 Sương giáng
             07/11 11:04    Lập đông
 20/11 13:47 22/11 08:35  Tiểu tuyết
             07/12 04:04   Đại tuyết
 20/12 08:43 21/12 22:03    Đông chí



In [36]:
print(grouped_by_year.get_group('2022').tail().to_string(index=False))


Ngày giờ Sóc    Ngày giờ   Tiết khí  Năm
 25/10 17:48 07/11 17:45   Lập đông 2022
             22/11 15:20 Tiểu tuyết 2022
 24/11 05:57 07/12 10:46  Đại tuyết 2022
             22/12 04:48   Đông chí 2022
 23/12 17:17                        2022


In [37]:
grouped_by_year.get_group('2022')


Unnamed: 0,Ngày giờ Sóc,Ngày giờ,Tiết khí,Năm
48,03/01 01:33,05/01 16:14,Tiểu hàn,2022
49,,20/01 09:39,Đại hàn,2022
50,01/02 12:46,04/02 03:50,Lập xuân,2022
51,,18/02 23:43,Vũ Thủy,2022
52,03/03 00:35,05/03 21:43,Kinh trập,2022
53,,20/03 22:33,Xuân phân,2022
54,01/04 13:24,05/04 02:20,Thanh minh,2022
55,,20/04 09:24,Cốc vũ,2022
56,01/05 03:28,05/05 19:26,Lập hạ,2022
57,,21/05 08:22,Tiểu mãn,2022


In [38]:
grouped_by_year.size()

Năm
2020    24
2021    24
2022    25
2023    24
2024    25
2025    24
2026    24
2027    25
2028    24
2029    24
2030    25
2031    24
2032    24
2033    25
2034    24
2035    25
2036    24
2037    24
2038    25
2039    24
dtype: int64

In [39]:
year = '2023'
print(grouped_by_year.get_group(year).to_string(index=False))

Ngày giờ Sóc    Ngày giờ    Tiết khí  Năm
             05/01 22:05    Tiểu hàn 2023
             20/01 15:29     Đại hàn 2023
 22/01 03:53 04/02 09:42    Lập xuân 2023
             19/02 05:34     Vũ Thủy 2023
 20/02 14:06 06/03 03:36   Kinh trập 2023
             21/03 04:24   Xuân phân 2023
 22/03 00:23 05/04 08:13  Thanh minh 2023
             20/04 15:13      Cốc vũ 2023
 20/04 11:12 06/05 01:18      Lập hạ 2023
 19/05 22:53 21/05 14:09    Tiểu mãn 2023
             06/06 05:18  Mang chủng 2023
 18/06 11:37 21/06 21:58      Hạ chí 2023
             07/07 15:30    Tiểu thử 2023
 18/07 01:32 23/07 08:50     Đại thử 2023
             08/08 01:23     Lập thu 2023
 16/08 16:38 23/08 16:01      Xử thử 2023
             08/09 04:26     Bạch lộ 2023
 15/09 08:40 23/09 13:50    Thu phân 2023
             08/10 20:15      Hàn lộ 2023
 15/10 00:55 23/10 23:21 Sương giáng 2023
             07/11 23:35    Lập đông 2023
 13/11 16:27 22/11 21:02  Tiểu tuyết 2023
             07/12 16:33   Đại tuy

In [40]:
NM_count(year)

'2023 : 12 NM'

# Méthode 3 : la plus simple !


In [41]:
# Download the page for method 3. Bound the wait and stop on an HTTP error
# status so that an error page is never parsed as data.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
html_table = response.content  # raw bytes, handed to BeautifulSoup later

In [42]:
def create_grouped_dataframe(html_table):
    """Parse the multi-year page and return its data rows grouped by year.

    Rows are read in document order. A row holding a ``<td class="head">``
    cell sets the current year; a row containing the text 'Ngày giờ Sóc' is
    a column-title row and is skipped; any other row with exactly three
    cells becomes a record tagged with the current year.

    Args:
        html_table: HTML markup containing the ``<tr>`` rows, in any form
            BeautifulSoup accepts.

    Returns:
        A pandas GroupBy over column 'Year' of a DataFrame with columns
        'Year', 'Ngày giờ Sóc', 'Ngày giờ', 'Tiết khí'.
    """
    soup = BeautifulSoup(html_table, 'html.parser')

    # Year of the rows being read. It stays "" until the first year banner,
    # the same fallback convert() uses for a table without a banner, so a
    # leading data row no longer hits an unbound variable.
    year = ""
    data = []

    for row in soup.find_all('tr'):
        # Year banner: remember its text and emit no record for this row.
        head_cell = row.find('td', class_='head')
        if head_cell is not None:
            year = head_cell.text.strip()
            continue

        # Column-title row: nothing to record.
        if 'Ngày giờ Sóc' in str(row):
            continue

        # Only complete three-cell rows are data rows.
        cols = row.find_all('td')
        if len(cols) == 3:
            data.append([year] + [col.text.strip() for col in cols])

    df = pd.DataFrame(data, columns=['Year', 'Ngày giờ Sóc', 'Ngày giờ', 'Tiết khí'])
    return df.groupby('Year')

In [43]:
grp_df = create_grouped_dataframe(html_table)

In [44]:
print(grp_df.get_group('2022').to_string(index=False))

Year Ngày giờ Sóc    Ngày giờ    Tiết khí
2022  03/01 01:33 05/01 16:14    Tiểu hàn
2022              20/01 09:39     Đại hàn
2022  01/02 12:46 04/02 03:50    Lập xuân
2022              18/02 23:43     Vũ Thủy
2022  03/03 00:35 05/03 21:43   Kinh trập
2022              20/03 22:33   Xuân phân
2022  01/04 13:24 05/04 02:20  Thanh minh
2022              20/04 09:24      Cốc vũ
2022  01/05 03:28 05/05 19:26      Lập hạ
2022              21/05 08:22    Tiểu mãn
2022  30/05 18:30 05/06 23:26  Mang chủng
2022              21/06 16:14      Hạ chí
2022  29/06 09:52 07/07 09:38    Tiểu thử
2022              23/07 03:07     Đại thử
2022  29/07 00:55 07/08 19:29     Lập thu
2022              23/08 10:16      Xử thử
2022  27/08 15:17 07/09 22:32     Bạch lộ
2022              23/09 08:03    Thu phân
2022  26/09 04:54 08/10 14:22      Hàn lộ
2022              23/10 17:35 Sương giáng
2022  25/10 17:48 07/11 17:45    Lập đông
2022              22/11 15:20  Tiểu tuyết
2022  24/11 05:57 07/12 10:46   Đạ

# https://builtin.com/data-science/pandas-groupby

In [45]:
import pandas as pd
# Forward slashes are accepted on Windows and POSIX alike, unlike a
# backslash-separated path.
# NOTE: this rebinds `df`; from here on it holds the sales data, not the
# solar-term table.
df = pd.read_csv("../csv/Dummy_Sales_Data_v1.csv")
df.head()

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
3,1112610000999911126,33,133,Not Shipped,2021-07-30,Fashion,Abdul,34,24.0,USA,F-555,444007
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223


In [46]:
df.Product_Category.nunique()

5

In [47]:
df_group = df.groupby("Product_Category")
df_group.ngroups

5

In [48]:
df.groupby("Product_Category").size()

Product_Category
Entertainment    1968
Fashion          1971
Healthcare       1953
Home             2060
Office           2011
dtype: int64

In [49]:
df.groupby("Product_Category").count()


Unnamed: 0_level_0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entertainment,1968,1968,1968,1968,1968,1968,1968,1959,1968,1968,1968
Fashion,1971,1971,1971,1971,1971,1971,1971,1963,1971,1971,1971
Healthcare,1953,1953,1953,1953,1953,1953,1953,1937,1953,1953,1953
Home,2060,2060,2060,2060,2060,2060,2060,2049,2060,2060,2060
Office,2011,2011,2011,2011,2011,2011,2011,2004,2011,2011,2011


In [50]:
df.groupby("Product_Category").first()

Unnamed: 0_level_0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entertainment,4934810000999949348,51,204,Not Delivered,2021-11-13,Abdul,32,18.0,UK,ENT-188,445113
Fashion,1112610000999911126,33,133,Not Shipped,2021-07-30,Abdul,34,24.0,USA,F-555,444007
Healthcare,2951110000999929511,92,238,Not Delivered,2021-08-08,Pablo,21,25.0,Singapore,HC-188,444116
Home,4337210000999943372,57,226,Not Shipped,2021-09-27,John,24,14.0,UK,H-555,444666
Office,2181910000999921819,61,136,Not Delivered,2021-10-03,Pablo,34,14.0,UK,O-555,444772


In [51]:
df.groupby("Product_Category").last()

Unnamed: 0_level_0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entertainment,1468010000999914680,53,229,Delivered,2021-07-12,Anthony,25,12.0,Singapore,ENT-630,443889
Fashion,2281610000999922816,18,117,Shipped,2021-12-23,Stella,22,24.0,Italy,F-101,445553
Healthcare,1847410000999918474,37,135,Shipped,2021-10-03,Maria,30,23.0,China,HC-901,444772
Home,2301610000999923016,75,201,Not Delivered,2021-10-16,Sofia,20,14.0,Kenya,H-555,444885
Office,2385710000999923857,81,207,Delivered,2021-11-13,Emma,29,18.0,USA,O-203,445113


In [52]:
# nth is zero-based: this selects the row at position 3 (the fourth row) of each category
df.groupby("Product_Category").nth(3)

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
8,2804110000999928041,31,163,Not Shipped,2021-12-23,Fashion,Abdul,34,16.0,Kenya,F-901,445553
15,1444810000999914448,5,119,Not Shipped,2021-10-13,Office,Stella,33,20.0,India,O-555,444882
18,3105110000999931051,79,213,Delivered,2021-10-26,Home,Abdul,27,14.0,China,H-203,444995
19,4276410000999942764,83,224,Not Delivered,2021-07-10,Healthcare,Sofia,24,24.0,Kenya,HC-203,443887
21,4030410000999940304,21,123,Not Delivered,2021-10-10,Entertainment,Kristen,30,15.0,India,ENT-188,444779


In [53]:
df_group.get_group('Healthcare')

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
6,2750410000999927504,73,242,Not Delivered,2021-07-08,Healthcare,Emma,34,10.0,UK,HC-555,443885
14,2559910000999925599,55,233,Not Delivered,2021-07-15,Healthcare,Kristen,25,18.0,India,HC-555,443992
19,4276410000999942764,83,224,Not Delivered,2021-07-10,Healthcare,Sofia,24,24.0,Kenya,HC-203,443887
22,2111510000999921115,74,250,Delivered,2021-12-08,Healthcare,Maria,25,21.0,Italy,HC-901,445338
...,...,...,...,...,...,...,...,...,...,...,...,...
9981,4904910000999949049,73,209,Shipped,2021-08-20,Healthcare,Sofia,33,12.0,UK,HC-203,444228
9983,4107210000999941072,57,162,Not Delivered,2021-08-26,Healthcare,Abdul,23,21.0,India,HC-188,444334
9986,1868610000999918686,30,162,Not Delivered,2021-12-12,Healthcare,Pablo,35,11.0,Italy,HC-630,445442
9988,4264110000999942641,4,187,Not Delivered,2021-11-20,Healthcare,Maria,34,15.0,UK,HC-203,445220


In [54]:
# Iterating a GroupBy yields (group label, sub-DataFrame) pairs;
# this prints each label followed by that group's rows.
for name_of_group, contents_of_group in df_group:
   print(name_of_group)
   print(contents_of_group)

Entertainment
                  OrderID  Quantity  UnitPrice(USD)         Status  \
5     4934810000999949348        51             204  Not Delivered   
12    3882310000999938823        78             219      Delivered   
20    2469010000999924690        15             156        Shipped   
21    4030410000999940304        21             123  Not Delivered   
23    3629310000999936293        78             155      Delivered   
...                   ...       ...             ...            ...   
9968  4220510000999942205        39             145        Shipped   
9973  1074010000999910740         2             186  Not Delivered   
9990  4241810000999942418        94             211    Not Shipped   
9993  2363310000999923633        82             116        Shipped   
9998  1468010000999914680        53             229      Delivered   

       OrderDate Product_Category Sales_Manager  Shipping_Cost(USD)  \
5     2021-11-13    Entertainment         Abdul                  32   
12 

In [55]:
#Create a groupby object
df_group = df.groupby("Product_Category")

#Select only required columns
df_columns = df_group[["UnitPrice(USD)","Quantity"]]

#Apply aggregate function
df_columns.mean()

Unnamed: 0_level_0,UnitPrice(USD),Quantity
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Entertainment,176.038618,49.851118
Fashion,176.117199,51.2552
Healthcare,175.489503,50.905274
Home,175.354854,50.70534
Office,175.1273,50.913476


In [56]:
df.groupby("Product_Category")[["Quantity"]].aggregate(['min','max','sum','mean'])

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,min,max,sum,mean
Product_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Entertainment,1,100,98107,49.851118
Fashion,1,100,101024,51.2552
Healthcare,1,100,99418,50.905274
Home,1,100,104453,50.70534
Office,1,100,102387,50.913476


In [57]:
function_dictionary = {'OrderID':'count','Quantity':'mean'}
df.groupby("Product_Category").aggregate(function_dictionary)

Unnamed: 0_level_0,OrderID,Quantity
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Entertainment,1968,49.851118
Fashion,1971,51.2552
Healthcare,1953,50.905274
Home,2060,50.70534
Office,2011,50.913476


In [58]:
df.groupby("Product_Category")[["Quantity"]].describe()

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Product_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Entertainment,1968.0,49.851118,29.184244,1.0,24.0,50.0,75.0,100.0
Fashion,1971.0,51.2552,28.687947,1.0,27.0,51.0,76.0,100.0
Healthcare,1953.0,50.905274,29.265929,1.0,25.0,50.0,77.0,100.0
Home,2060.0,50.70534,28.917479,1.0,26.0,50.0,76.0,100.0
Office,2011.0,50.913476,28.983277,1.0,26.0,52.0,76.0,100.0
