# SELECT {field}, COUNT() FROM {table} GROUP BY {field} HAVING COUNT() condition

- Aggregate function: COUNT(), SUM(), AVG(), MIN(), and MAX().
- GROUP BY... HAVING.
- Examples in SQL Query vs Pandas 

## Simple Examples of grouping rows and get agg. values
- I will use AdventureWorks2019 DB, tables: Production.Product and Purchasing.PurchaseOrderDetail
- And as always build the code first w/direct SQL query to the DB and second w/native Pandas using a DF tha is the whole table extracted from the DB

## 1. Establish the connection - conecting w/the DB

In [6]:
### Connect to the DB - Establish the connection
import pyodbc

# Valid values for the connection string
driver = '{ODBC Driver 17 for SQL Server}'
server = '(local)'
dbname = 'AdventureWorks2019'
#dbname = 'BikeStores'
user = 'user1'
passwd = 'pass1'

# Construct the Connection String
connection_string = f'DRIVER={driver};SERVER={server};\
    DATABASE={dbname};UID={user};PWD={passwd}'
print('Connection String:\n', connection_string)

# Establish the connection
try:
    connection = pyodbc.connect(connection_string)
    cur = connection.cursor()
except pyodbc.Error as e:
    print('ERROR:', e)
else:
    print('SUCCESS: Connection Established')


Connection String:
 DRIVER={ODBC Driver 17 for SQL Server};SERVER=(local);    DATABASE=AdventureWorks2019;UID=user1;PWD=pass1
SUCCESS: Connection Established


## 2. Using SQL Server and Quering directly to de DB using cursor

In [7]:
# mk functions to convert SQL queries to DF
import pandas as pd

def df_from_query(qry):     # convert cursor.execute(query) to DF
    cur.execute(qry)
    field_names = [i[0] for i in cur.description]
    get_data = [list(x) for x in cur]
    df = pd.DataFrame(data=get_data, columns=field_names)
    return df

def df_from_fetchall(qry):
    cur.execute(qry)
    results = cur.fetchall()
    cols = [i[0] for i in cur.description]
    from_db = [list(r) for r in results]
    df = pd.DataFrame(data=from_db, columns=cols)
    return df

In [8]:
### First query, first look to the products table
query1 = ''' SELECT * FROM Production.Product; '''

prods_df = df_from_query(query1)

print(query1)
display(prods_df.iloc[[0, 5, -5, -1]])
prods_df.columns   # to see all cols name of the table cause they are many
# Here I can: 1. see whole data in the Production.Product table; and
# 2. get the table in DF: products_table_df

 SELECT * FROM Production.Product; 


Unnamed: 0,ProductID,Name,ProductNumber,MakeFlag,FinishedGoodsFlag,Color,SafetyStockLevel,ReorderPoint,StandardCost,ListPrice,...,ProductLine,Class,Style,ProductSubcategoryID,ProductModelID,SellStartDate,SellEndDate,DiscontinuedDate,rowguid,ModifiedDate
0,1,Adjustable Race,AR-5381,False,False,,1000,750,0.0,0.0,...,,,,,,2008-04-30,NaT,,694215B7-08F7-4C0D-ACB1-D734BA44C0C8,2014-02-08 10:01:36.827
5,317,LL Crankarm,CA-5965,False,False,Black,500,375,0.0,0.0,...,,L,,,,2008-04-30,NaT,,3C9D10B7-A6B2-4774-9963-C19DCEE72FEA,2014-02-08 10:01:36.827
499,995,ML Bottom Bracket,BB-8107,True,True,,500,375,44.9506,101.24,...,,M,,5.0,96.0,2013-05-30,NaT,,71AB847F-D091-42D6-B735-7B0C2D82FC84,2014-02-08 10:01:36.827
503,999,"Road-750 Black, 52",BK-R19B-52,True,True,Black,100,75,343.6496,539.99,...,R,L,U,2.0,31.0,2013-05-30,NaT,,AE638923-2B67-4679-B90E-ABBAB17DCA31,2014-02-08 10:01:36.827


Index(['ProductID', 'Name', 'ProductNumber', 'MakeFlag', 'FinishedGoodsFlag',
       'Color', 'SafetyStockLevel', 'ReorderPoint', 'StandardCost',
       'ListPrice', 'Size', 'SizeUnitMeasureCode', 'WeightUnitMeasureCode',
       'Weight', 'DaysToManufacture', 'ProductLine', 'Class', 'Style',
       'ProductSubcategoryID', 'ProductModelID', 'SellStartDate',
       'SellEndDate', 'DiscontinuedDate', 'rowguid', 'ModifiedDate'],
      dtype='object')

## 3.1. Aggregate function COUNT()
How many products  (COUNT) of each color (GROUP BY) we have? - sorted (ORDER BY)

In [9]:
# Native SQL query direct to the table in the DF
q_1 = ''' SELECT Color, COUNT(Color) as Number
        FROM Production.Product
        GROUP BY Color
        ORDER BY Number DESC'''
df1 = df_from_query(q_1)
display(df1)

# Using Pandas value_counts() - get a sorted series as result
val_count_serie = prods_df.Color.value_counts()
print('-' * 40, '\n', val_count_serie, sep='')

# Using Pandas group by + count() - get unsorted series as result
group_by_serie = prods_df.groupby('Color').Color.count()
print('-' * 40, '\n', group_by_serie, sep='')

# %timeit shows ops to DF are faster but took a time early to read full tbl

Unnamed: 0,Color,Number
0,Black,93
1,Silver,43
2,Red,38
3,Yellow,36
4,Blue,26
5,Multi,8
6,Silver/Black,7
7,White,4
8,Grey,1
9,,0


----------------------------------------
Black           93
Silver          43
Red             38
Yellow          36
Blue            26
Multi            8
Silver/Black     7
White            4
Grey             1
Name: Color, dtype: int64
----------------------------------------
Color
Black           93
Blue            26
Grey             1
Multi            8
Red             38
Silver          43
Silver/Black     7
White            4
Yellow          36
Name: Color, dtype: int64


## 3.2. Many aggregates functions MIN, MAX, AVG, COUNT

In [10]:
## Want to know the min amount of total Order by product
# -- use other table: Purchasing.PurchaseOrderDetail
# 1st. get the all table in a DF to see it and later to use pandas on it
query2 = ''' SELECT * FROM Purchasing.PurchaseOrderDetail '''
orders_df = df_from_query(query2)
orders_df.iloc[[0, 9, -9, -1]]

Unnamed: 0,PurchaseOrderID,PurchaseOrderDetailID,DueDate,OrderQty,ProductID,UnitPrice,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate
0,1,1,2011-04-30,4,1,50.26,201.04,3.0,0.0,3.0,2011-04-23 00:00:00.000
9,7,10,2011-05-14,550,319,46.0635,25334.925,550.0,0.0,550.0,2011-05-07 00:00:00.000
8836,4011,8837,2014-07-24,250,876,33.88,8470.0,250.0,0.0,250.0,2015-08-12 12:25:46.470
8844,4012,8845,2014-07-24,6000,884,41.57,249420.0,6000.0,0.0,6000.0,2015-08-12 12:25:46.483


In [11]:
# List of pandas aggregate functions
display(orders_df.describe())
# Others using map() or apply()? + https://datascientyst.com/list-aggregation-functions-aggfunc-groupby-pandas/
orders_df.info()
# List of SQL Srv aggregate functions
# - https://learn.microsoft.com/en-us/sql/t-sql/functions/aggregate-functions-transact-sql?view=sql-server-ver16
# Others: https://learn.microsoft.com/en-us/sql/t-sql/functions/functions?view=sql-server-ver16

Unnamed: 0,PurchaseOrderID,PurchaseOrderDetailID,OrderQty,ProductID
count,8845.0,8845.0,8845.0,8845.0
mean,1992.321425,4423.0,265.53273,527.507518
std,1163.023779,2553.475899,355.926589,228.05066
min,1.0,1.0,3.0,1.0
25%,994.0,2212.0,3.0,367.0
50%,1978.0,4423.0,60.0,456.0
75%,3005.0,6634.0,550.0,527.0
max,4012.0,8845.0,8000.0,952.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8845 entries, 0 to 8844
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PurchaseOrderID        8845 non-null   int64         
 1   PurchaseOrderDetailID  8845 non-null   int64         
 2   DueDate                8845 non-null   datetime64[ns]
 3   OrderQty               8845 non-null   int64         
 4   ProductID              8845 non-null   int64         
 5   UnitPrice              8845 non-null   object        
 6   LineTotal              8845 non-null   object        
 7   ReceivedQty            8845 non-null   object        
 8   RejectedQty            8845 non-null   object        
 9   StockedQty             8845 non-null   object        
 10  ModifiedDate           8845 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(4), object(5)
memory usage: 760.2+ KB


In [12]:
# Make the query in SQL and Pandas
q_2 = ''' SELECT ProductID, COUNT(LineTotal) AS COUNT_LT,
                AVG(LineTotal) AS AVG_LT, STDEV(LineTotal) AS STD_LT,
                MIN(LineTotal) AS MIN_LT, MAX(LineTotal) AS MAX_LT
            FROM Purchasing.PurchaseOrderDetail
            GROUP BY ProductID
            ORDER BY ProductID '''
df2 = df_from_query(q_2)
display(df2)

group_by_df1 = orders_df.groupby('ProductID').LineTotal.agg([min, max, len])
display(group_by_df1)

group_by_df2 = orders_df.groupby('ProductID').LineTotal.describe()
display(group_by_df2)

## !! orders.LineTotal.dtype is Object !!
#orders_df.groupby('ProductID').LineTotal.mean()  - but.. is dtype Object
# Convert to float
print('orders_df.LineTotal.dtype:', orders_df.LineTotal.dtype )

Unnamed: 0,ProductID,COUNT_LT,AVG_LT,STD_LT,MIN_LT,MAX_LT
0,1,51,151.7757,7.036337,150.7905,201.0400
1,2,50,125.7480,0.000006,125.7480,125.7480
2,4,51,171.0765,0.000002,171.0765,171.0765
3,317,80,15575.1750,697.372283,14882.1750,16268.1750
4,318,80,19161.4500,697.372283,18468.4500,19854.4500
...,...,...,...,...,...,...
260,939,90,26559.2250,0.000000,26559.2250,26559.2250
261,940,42,34644.2250,0.000000,34644.2250,34644.2250
262,941,51,34644.2250,0.000000,34644.2250,34644.2250
263,948,50,45558.9750,0.000000,45558.9750,45558.9750


Unnamed: 0_level_0,min,max,len
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,150.7905,201.0400,51
2,125.7480,125.7480,50
4,171.0765,171.0765,51
317,14882.1750,16268.1750,80
318,18468.4500,19854.4500,80
...,...,...,...
939,26559.2250,26559.2250,90
940,34644.2250,34644.2250,42
941,34644.2250,34644.2250,51
948,45558.9750,45558.9750,50


Unnamed: 0_level_0,count,unique,top,freq
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,51,2,150.7905,50
2,50,1,125.7480,50
4,51,1,171.0765,51
317,80,2,14882.1750,40
318,80,2,18468.4500,40
...,...,...,...,...
939,90,1,26559.2250,90
940,42,1,34644.2250,42
941,51,1,34644.2250,51
948,50,1,45558.9750,50


orders_df.LineTotal.dtype: object


In [13]:
## Convert LineTotal to float:
# - astype(), to_numeric() w/downcast, infer_objects(), convert_dtypes()
print(orders_df.LineTotal.dtype)
#orders_df.LineTotal = pd.to_numeric(orders_df.LineTotal, downcast='float')
orders_df.LineTotal = orders_df.LineTotal.astype('float32')
print(orders_df.LineTotal.dtype, '\n')
orders_df.info()

object
float32 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8845 entries, 0 to 8844
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   PurchaseOrderID        8845 non-null   int64         
 1   PurchaseOrderDetailID  8845 non-null   int64         
 2   DueDate                8845 non-null   datetime64[ns]
 3   OrderQty               8845 non-null   int64         
 4   ProductID              8845 non-null   int64         
 5   UnitPrice              8845 non-null   object        
 6   LineTotal              8845 non-null   float32       
 7   ReceivedQty            8845 non-null   object        
 8   RejectedQty            8845 non-null   object        
 9   StockedQty             8845 non-null   object        
 10  ModifiedDate           8845 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float32(1), int64(4), object(4)
memory usage: 725.7+ KB


In [14]:
## And now we can use statistical aggregate functions
orders_df.groupby('ProductID').LineTotal.describe()
#orders_df.groupby('ProductID').OrderQty.agg([count, mean]) !?!?

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,51.0,151.775787,7.036337,150.790497,150.790497,150.790497,150.790497,201.039993
2,50.0,125.748001,0.000000,125.748001,125.748001,125.748001,125.748001,125.748001
4,51.0,171.076492,0.000000,171.076508,171.076508,171.076508,171.076508,171.076508
317,80.0,15575.173828,697.372253,14882.174805,14882.174805,15575.174805,16268.174805,16268.174805
318,80.0,19161.451172,697.372253,18468.449219,18468.449219,19161.449219,19854.449219,19854.449219
...,...,...,...,...,...,...,...,...
939,90.0,26559.222656,0.000000,26559.224609,26559.224609,26559.224609,26559.224609,26559.224609
940,42.0,34644.226562,0.000000,34644.226562,34644.226562,34644.226562,34644.226562,34644.226562
941,51.0,34644.226562,0.000000,34644.226562,34644.226562,34644.226562,34644.226562,34644.226562
948,50.0,45558.976562,0.000000,45558.976562,45558.976562,45558.976562,45558.976562,45558.976562


## 4. Aggregate function w/condition (GROUP BY... HAVING)

In [15]:
## What chip product (<10) have more than 80 orders?
q_3 = ''' SELECT ProductID, COUNT(1) AS count_col
            FROM Purchasing.PurchaseOrderDetail
            WHERE UnitPrice < 10
            GROUP BY ProductID
            HAVING COUNT(1) > 80
            ORDER BY ProductID'''
df3 = df_from_query(q_3)
df3

Unnamed: 0,ProductID,count_col
0,351,84
1,352,84
2,356,101
3,357,101
4,679,83


In [16]:
## Same in pure Pandas - orders_df is the entire table
# in to steps first select in the table rows w/UnitPrice < 10
df_less10 = orders_df.loc[orders_df.UnitPrice < 10]
#df_less10
# And now group by ProductID and count
df_l10_gbPID = df_less10.groupby('ProductID').count()
#df_l10_gbPID
# And now select whichever col > 80
#df_r = df_l10_gbPID[df_l10_gbPID.DueDate > 80]
df_r2 = df_l10_gbPID.loc[df_l10_gbPID.DueDate > 80]
display(df_r2)
# In one line - use query cause .loc need all the DF.
df_r1 = orders_df.loc[orders_df.UnitPrice < 10].groupby(
    'ProductID').count().query('DueDate > 80')
#df_r
### BUT.. COLS different see NEXT

Unnamed: 0_level_0,PurchaseOrderID,PurchaseOrderDetailID,DueDate,OrderQty,UnitPrice,LineTotal,ReceivedQty,RejectedQty,StockedQty,ModifiedDate
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
351,84,84,84,84,84,84,84,84,84,84
352,84,84,84,84,84,84,84,84,84,84
356,101,101,101,101,101,101,101,101,101,101
357,101,101,101,101,101,101,101,101,101,101
679,83,83,83,83,83,83,83,83,83,83


In [17]:
## Only one count_col like direct query
df_2l10 = orders_df.loc[orders_df.UnitPrice < 10]
df_l10_bg2 = df_2l10.groupby('ProductID').agg(
    count_col=pd.NamedAgg(column='ProductID', aggfunc='count'))
df_l10_bg2.loc[df_l10_bg2.count_col > 80]
df_r3 = df_l10_bg2.loc[df_l10_bg2.count_col > 80]
df_r3.reset_index(inplace=True)
# df_r4 = df_r3.count_col.to_frame().reset_index()
# print(df_r3.index)
# print(df_r3.count_col)
# print(type(df_r3))
# df_r3.index = range(len(df_r3))
df_r3

Unnamed: 0,ProductID,count_col
0,351,84
1,352,84
2,356,101
3,357,101
4,679,83
