In [1]:
import matplotlib.pyplot as plt
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
#import datetime as dt
import pytz
from pprint import pprint
from bs4 import BeautifulSoup


The file is a Jupyter Notebook that contains multiple cells with Python code. Here's a brief explanation of the content in each cell:

1. **Cell 0**: 
    - Imports necessary libraries such as `matplotlib.pyplot`, `requests`, `pandas`, `numpy`, `datetime`, `pytz`, `pprint`, and `BeautifulSoup`.

2. **Cell 2**:
    - Defines headers to mimic a browser request.
    - Defines a URL to fetch data from.
    - Defines a function `get_table` to fetch and parse the HTML content from the URL.
    - Calls `get_table` to get the HTML content and tables.
    - Defines a function `convert` to extract data from the HTML table and store it in a list.
    - Defines a function `make_df` to create a DataFrame from the extracted data.
    - Prepares data by calling `convert` for each table.
    - Creates a DataFrame `df` from the data.
    - Groups the DataFrame by the column 'Năm'.
    - Defines functions `display_all`, `display_by_name`, and `display_by_year` to display the grouped data.

3. **Cell 3**:
    - Calls `display_by_name` function to display data for the year 2024.

4. **Cell 4**:
    - Calls `display_by_year` function to display data for the year 2025.

5. **Cell 5**:
    - Contains a markdown link to a pandas groupby tutorial.

6. **Cell 6**:
    - Imports `pandas` and reads a CSV file into a DataFrame `df`.
    - Displays the first few rows of the DataFrame.

7. **Cell 7**:
    - Displays the number of unique values in the 'Product_Category' column of the DataFrame.

8. **Cell 8**:
    - Groups the DataFrame by 'Product_Category' and displays the number of groups.

9. **Cell 9**:
    - Displays the size of each group in the 'Product_Category' column.

10. **Cell 10**:
     - Counts the number of non-NA/null entries for each column in each group.

11. **Cell 11**:
     - Displays the first non-null value of every column for each 'Product_Category' group (usually, but not necessarily, the group's first row).

12. **Cell 12**:
     - Displays the last non-null value of every column for each 'Product_Category' group (usually, but not necessarily, the group's last row).

13. **Cell 13**:
     - Displays the row at position 3 (zero-based, i.e. the 4th row) of each 'Product_Category' group.

14. **Cell 14**:
     - Retrieves and displays the group 'Healthcare' from the grouped DataFrame.

15. **Cell 15**:
     - Iterates over each group in the grouped DataFrame and prints the group name and its contents.

16. **Cell 16**:
     - Groups the DataFrame by 'Product_Category'.
     - Selects specific columns and calculates the mean for each group.

17. **Cell 17**:
     - Aggregates the 'Quantity' column by calculating the min, max, sum, and mean for each group.

18. **Cell 18**:
     - Defines a dictionary of functions to apply to each column and aggregates the DataFrame accordingly.

19. **Cell 19**:
     - Provides descriptive statistics for the 'Quantity' column in each group.

This notebook demonstrates data extraction from a webpage, data manipulation using pandas, and various groupby operations to analyze the data.

In [2]:


# Request headers sent with every fetch. The User-Agent imitates a desktop
# Chrome browser (presumably because the server treats the default
# `requests` client differently — TODO confirm).
headers = { 
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' 
}

# Page scraped by this notebook.
# NOTE(review): the file name mentions 2020, yet the outputs further down show
# years 2024 and 2025, so the page apparently holds tables for several years.
url = 'https://www.informatik.uni-leipzig.de/~duc/amlich/DuLieu/Sun-Moon-2020.html'

def get_table(url, timeout=30):
    """Download a page and return its parsed HTML together with all of its tables.

    Parameters
    ----------
    url : str
        Address of the page to fetch. The module-level ``headers`` dict is
        sent with the request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (forwarded to
        ``requests.get``). Defaults to 30.

    Returns
    -------
    tuple
        ``(soup, tables)``: the ``BeautifulSoup`` document and the list of
        every ``<table>`` element found in it.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    # Without a timeout, requests can block forever on an unresponsive server.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly on an error page instead of silently parsing it into zero tables.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, soup.find_all('table')

# Fetch the page once for the whole notebook. NOTE: despite its name, `text`
# holds the parsed BeautifulSoup document returned by get_table, not a string.
text, tables = get_table(url)
#print(tables[0])

def convert(html_content):  # table
    """Append the body rows of one HTML table to the module-level ``data`` list.

    The text of the table's first ``td`` with class ``head`` (an empty string
    when there is none) is placed in front of the stripped cell texts of each
    row. The first two ``tr`` rows are treated as headers and skipped.

    Returns nothing; ``data`` is extended in place.
    """
    head_cell = html_content.find('td', class_='head')
    if head_cell:
        label = head_cell.get_text(strip=True)
    else:
        label = ""

    # Everything after the two header rows is payload.
    for tr in html_content.find_all('tr')[2:]:
        cell_texts = (cell.get_text(strip=True) for cell in tr.find_all('td'))
        data.append([label, *cell_texts])

def make_df():
    """Build a DataFrame from the rows collected in the module-level ``data`` list.

    Returns a new frame whose three columns are the table header text
    ('Năm') followed by the two cell columns of the scraped tables.
    """
    column_names = ['Năm', 'Ngày giờ Sóc', 'Ngày giờ các Tiết khí']
    return pd.DataFrame(data, columns=column_names)

# Collect the rows of every scraped table into one shared list
# (convert appends to `data` as a side effect). Resetting the list here
# keeps the cell safe to re-run.
data = []
for table in tables:
    convert(table)

# One DataFrame row per table row, labelled with its table's header text.
df = make_df()

# Group the rows by the 'Năm' column so each group can be shown separately.
grouped_df = df.groupby(by='Năm')

def display_all():
    """Print every group of the module-level ``grouped_df``.

    For each group a ``Năm: <key>`` header is printed, then the two data
    columns as an index-free text table, then one blank line.
    """
    shown_columns = ['Ngày giờ Sóc', 'Ngày giờ các Tiết khí']
    for year_label, rows in grouped_df:
        print(f"Năm: {year_label}")
        # end="\n\n" leaves the blank separator line after each table.
        print(rows.loc[:, shown_columns].to_string(index=False), end="\n\n")

def display_by_name(year):
    """Print the header and table of the group whose key equals ``str(year)``.

    Reads the module-level ``grouped_df``. Prints nothing (and raises
    nothing) when no group has that key.
    """
    wanted = str(year)
    # Guard clause: an unknown year is silently ignored.
    if wanted not in grouped_df.groups:
        return
    rows = grouped_df.get_group(wanted)
    print(f"Năm: {wanted}")
    print(rows[['Ngày giờ Sóc', 'Ngày giờ các Tiết khí']].to_string(index=False))

def display_by_year(year):
    """Print the header and table of one year from the module-level ``grouped_df``.

    Parameters
    ----------
    year : int or str
        Year to show; it is converted with ``str`` before the lookup because
        the group keys are the header texts of the scraped tables.

    Raises
    ------
    KeyError
        If no group exists for that year. Nothing is printed in that case.
    """
    key = str(year)
    # Resolve the group before printing anything, so a missing year does not
    # leave a dangling "Năm: ..." header in the output.
    try:
        rows = grouped_df.get_group(key)
    except KeyError:
        available = ", ".join(map(str, grouped_df.groups))
        raise KeyError(f"no data for year {key!r}; available: {available}") from None
    print(f"Năm: {year}")
    print(rows[['Ngày giờ Sóc', 'Ngày giờ các Tiết khí']].to_string(index=False))



In [3]:
# Show a single year; display_by_name prints nothing if the year is not in the data.
year = 2024
display_by_name(year)

Năm: 2024
Ngày giờ Sóc     Ngày giờ các Tiết khí
                06/01 03:49 - Tiểu hàn
 11/01 18:57     20/01 21:07 - Đại hàn
                04/02 15:27 - Lập xuân
 10/02 05:59     19/02 11:13 - Vũ Thủy
               05/03 09:22 - Kinh trập
 10/03 16:00   20/03 10:06 - Xuân phân
              04/04 14:02 - Thanh minh
 09/04 01:21      19/04 20:59 - Cốc vũ
                  05/05 07:10 - Lập hạ
 08/05 10:22    20/05 19:59 - Tiểu mãn
              05/06 11:10 - Mang chủng
 06/06 19:37      21/06 03:51 - Hạ chí
                06/07 21:20 - Tiểu thử
 06/07 05:57     22/07 14:44 - Đại thử
 04/08 18:13     07/08 07:09 - Lập thu
                  22/08 21:55 - Xử thử
 03/09 08:55     07/09 10:11 - Bạch lộ
                22/09 19:43 - Thu phân
 03/10 01:49      08/10 02:00 - Hàn lộ
             23/10 05:14 - Sương giáng
 01/11 19:47    07/11 05:20 - Lập đông
              22/11 02:56 - Tiểu tuyết
 01/12 13:21   06/12 22:17 - Đại tuyết
                21/12 16:20 - Đông chí
 31/12 05:26   

In [4]:
# Show a single year via get_group; an unknown year raises KeyError here.
year = 2025
display_by_year(year)

Năm: 2025
Ngày giờ Sóc     Ngày giờ các Tiết khí
                05/01 09:32 - Tiểu hàn
                 20/01 03:00 - Đại hàn
 29/01 19:36    03/02 21:10 - Lập xuân
                 18/02 17:06 - Vũ Thủy
 28/02 07:44   05/03 15:07 - Kinh trập
               20/03 16:01 - Xuân phân
 29/03 17:57  04/04 19:48 - Thanh minh
                  20/04 02:56 - Cốc vũ
 28/04 02:31      05/05 12:57 - Lập hạ
                21/05 01:54 - Tiểu mãn
 27/05 10:02  05/06 16:56 - Mang chủng
                  21/06 09:42 - Hạ chí
 25/06 17:31    07/07 03:05 - Tiểu thử
                 22/07 20:29 - Đại thử
 25/07 02:11     07/08 12:51 - Lập thu
                  23/08 03:33 - Xử thử
 23/08 13:06     07/09 15:52 - Bạch lộ
 22/09 02:54    23/09 01:19 - Thu phân
                  08/10 07:41 - Hàn lộ
 21/10 19:25 23/10 10:51 - Sương giáng
                07/11 11:04 - Lập đông
 20/11 13:47  22/11 08:35 - Tiểu tuyết
               07/12 04:04 - Đại tuyết
 20/12 08:43    21/12 22:03 - Đông chí


https://builtin.com/data-science/pandas-groupby

In [5]:
import pandas as pd

# Forward slashes resolve on Windows, macOS and Linux alike; a
# backslash-separated relative path only works on Windows.
# NOTE: this rebinds `df`, which until now held the scraped table; the
# earlier `grouped_df` keeps referring to that older frame.
df = pd.read_csv("../csv/Dummy_Sales_Data_v1.csv")
df.head()

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
1,2181910000999921819,61,136,Not Delivered,2021-10-03,Office,Pablo,34,14.0,UK,O-555,444772
2,3239110000999932391,67,235,Not Delivered,2021-09-27,Office,Kristen,25,11.0,Kenya,O-188,444666
3,1112610000999911126,33,133,Not Shipped,2021-07-30,Fashion,Abdul,34,24.0,USA,F-555,444007
4,1548310000999915483,13,189,Not Delivered,2021-08-15,Fashion,Stella,24,19.0,Kenya,F-555,444223


In [6]:
# Number of distinct product categories (bracket access works for any column name).
df["Product_Category"].nunique()

5

In [7]:
# Build the groupby object once and keep it for later cells;
# `ngroups` is the number of distinct categories.
df_group = df.groupby("Product_Category")
df_group.ngroups

5

In [8]:
# Rows per category; size() counts every row, including rows with missing values.
df.groupby("Product_Category").size()

Product_Category
Entertainment    1968
Fashion          1971
Healthcare       1953
Home             2060
Office           2011
dtype: int64

In [9]:
# Non-null values per column and category; a count below the group size
# means that column has missing entries in that category.
df.groupby("Product_Category").count()


Unnamed: 0_level_0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entertainment,1968,1968,1968,1968,1968,1968,1968,1959,1968,1968,1968
Fashion,1971,1971,1971,1971,1971,1971,1971,1963,1971,1971,1971
Healthcare,1953,1953,1953,1953,1953,1953,1953,1937,1953,1953,1953
Home,2060,2060,2060,2060,2060,2060,2060,2049,2060,2060,2060
Office,2011,2011,2011,2011,2011,2011,2011,2004,2011,2011,2011


In [10]:
# First non-null value of each column per category. Because missing values are
# skipped by default, a result row is not guaranteed to be one original row.
df.groupby("Product_Category").first()

Unnamed: 0_level_0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entertainment,4934810000999949348,51,204,Not Delivered,2021-11-13,Abdul,32,18.0,UK,ENT-188,445113
Fashion,1112610000999911126,33,133,Not Shipped,2021-07-30,Abdul,34,24.0,USA,F-555,444007
Healthcare,2951110000999929511,92,238,Not Delivered,2021-08-08,Pablo,21,25.0,Singapore,HC-188,444116
Home,4337210000999943372,57,226,Not Shipped,2021-09-27,John,24,14.0,UK,H-555,444666
Office,2181910000999921819,61,136,Not Delivered,2021-10-03,Pablo,34,14.0,UK,O-555,444772


In [11]:
# Last non-null value of each column per category. Because missing values are
# skipped by default, a result row is not guaranteed to be one original row.
df.groupby("Product_Category").last()

Unnamed: 0_level_0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Entertainment,1468010000999914680,53,229,Delivered,2021-07-12,Anthony,25,12.0,Singapore,ENT-630,443889
Fashion,2281610000999922816,18,117,Shipped,2021-12-23,Stella,22,24.0,Italy,F-101,445553
Healthcare,1847410000999918474,37,135,Shipped,2021-10-03,Maria,30,23.0,China,HC-901,444772
Home,2301610000999923016,75,201,Not Delivered,2021-10-16,Sofia,20,14.0,Kenya,H-555,444885
Office,2385710000999923857,81,207,Delivered,2021-11-13,Emma,29,18.0,USA,O-203,445113


In [12]:
# nth is zero-based: nth(3) picks the 4th row of every category.
# The selected rows keep their original index labels.
df.groupby("Product_Category").nth(3)

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
8,2804110000999928041,31,163,Not Shipped,2021-12-23,Fashion,Abdul,34,16.0,Kenya,F-901,445553
15,1444810000999914448,5,119,Not Shipped,2021-10-13,Office,Stella,33,20.0,India,O-555,444882
18,3105110000999931051,79,213,Delivered,2021-10-26,Home,Abdul,27,14.0,China,H-203,444995
19,4276410000999942764,83,224,Not Delivered,2021-07-10,Healthcare,Sofia,24,24.0,Kenya,HC-203,443887
21,4030410000999940304,21,123,Not Delivered,2021-10-10,Entertainment,Kristen,30,15.0,India,ENT-188,444779


In [13]:
# Pull the sub-frame of a single category out of the existing groupby object.
df_group.get_group('Healthcare')

Unnamed: 0,OrderID,Quantity,UnitPrice(USD),Status,OrderDate,Product_Category,Sales_Manager,Shipping_Cost(USD),Delivery_Time(Days),Shipping_Address,Product_Code,OrderCode
0,2951110000999929511,92,238,Not Delivered,2021-08-08,Healthcare,Pablo,21,25.0,Singapore,HC-188,444116
6,2750410000999927504,73,242,Not Delivered,2021-07-08,Healthcare,Emma,34,10.0,UK,HC-555,443885
14,2559910000999925599,55,233,Not Delivered,2021-07-15,Healthcare,Kristen,25,18.0,India,HC-555,443992
19,4276410000999942764,83,224,Not Delivered,2021-07-10,Healthcare,Sofia,24,24.0,Kenya,HC-203,443887
22,2111510000999921115,74,250,Delivered,2021-12-08,Healthcare,Maria,25,21.0,Italy,HC-901,445338
...,...,...,...,...,...,...,...,...,...,...,...,...
9981,4904910000999949049,73,209,Shipped,2021-08-20,Healthcare,Sofia,33,12.0,UK,HC-203,444228
9983,4107210000999941072,57,162,Not Delivered,2021-08-26,Healthcare,Abdul,23,21.0,India,HC-188,444334
9986,1868610000999918686,30,162,Not Delivered,2021-12-12,Healthcare,Pablo,35,11.0,Italy,HC-630,445442
9988,4264110000999942641,4,187,Not Delivered,2021-11-20,Healthcare,Maria,34,15.0,UK,HC-203,445220


In [14]:
# Iterating a groupby object yields (group key, sub-DataFrame) pairs.
for name_of_group, contents_of_group in df_group:
    print(name_of_group)
    print(contents_of_group)

Entertainment
                  OrderID  Quantity  UnitPrice(USD)         Status  \
5     4934810000999949348        51             204  Not Delivered   
12    3882310000999938823        78             219      Delivered   
20    2469010000999924690        15             156        Shipped   
21    4030410000999940304        21             123  Not Delivered   
23    3629310000999936293        78             155      Delivered   
...                   ...       ...             ...            ...   
9968  4220510000999942205        39             145        Shipped   
9973  1074010000999910740         2             186  Not Delivered   
9990  4241810000999942418        94             211    Not Shipped   
9993  2363310000999923633        82             116        Shipped   
9998  1468010000999914680        53             229      Delivered   

       OrderDate Product_Category Sales_Manager  Shipping_Cost(USD)  \
5     2021-11-13    Entertainment         Abdul                  32   
12 

In [15]:
# Create a groupby object (rebinds df_group to an equivalent grouping)
df_group = df.groupby("Product_Category")

# Select only the columns to aggregate; the result is still a groupby object
df_columns = df_group[["UnitPrice(USD)","Quantity"]]

# Apply the aggregate function: mean of each selected column per category
df_columns.mean()

Unnamed: 0_level_0,UnitPrice(USD),Quantity
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Entertainment,176.038618,49.851118
Fashion,176.117199,51.2552
Healthcare,175.489503,50.905274
Home,175.354854,50.70534
Office,175.1273,50.913476


In [16]:
# Several aggregations of one column at once; the result has a two-level
# column index (column name, statistic).
df.groupby("Product_Category")[["Quantity"]].aggregate(['min','max','sum','mean'])

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,min,max,sum,mean
Product_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Entertainment,1,100,98107,49.851118
Fashion,1,100,101024,51.2552
Healthcare,1,100,99418,50.905274
Home,1,100,104453,50.70534
Office,1,100,102387,50.913476


In [17]:
# Map each column to its own aggregation: number of orders and
# average quantity per category.
function_dictionary = {'OrderID':'count','Quantity':'mean'}
df.groupby("Product_Category").aggregate(function_dictionary)

Unnamed: 0_level_0,OrderID,Quantity
Product_Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Entertainment,1968,49.851118
Fashion,1971,51.2552
Healthcare,1953,50.905274
Home,2060,50.70534
Office,2011,50.913476


In [18]:
# Summary statistics (count, mean, std, min, quartiles, max) of Quantity per category.
df.groupby("Product_Category")[["Quantity"]].describe()

Unnamed: 0_level_0,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Product_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Entertainment,1968.0,49.851118,29.184244,1.0,24.0,50.0,75.0,100.0
Fashion,1971.0,51.2552,28.687947,1.0,27.0,51.0,76.0,100.0
Healthcare,1953.0,50.905274,29.265929,1.0,25.0,50.0,77.0,100.0
Home,2060.0,50.70534,28.917479,1.0,26.0,50.0,76.0,100.0
Office,2011.0,50.913476,28.983277,1.0,26.0,52.0,76.0,100.0
