In [1]:
# Upgrade your NumPy and pandas packages if needed (uncomment to run).
# %pip install --upgrade pandas

In [2]:
# %pip install --upgrade numpy

In [3]:
import numpy as np
import pandas as pd

# Report the installed library versions so readers can reproduce the outputs below.
for label, module in (("Numpy Version is: ", np), ("Pandas Version is: ", pd)):
    print(label, module.__version__)

Numpy Version is:  1.26.4
Pandas Version is:  2.2.3


In [4]:
# Seed NumPy's global RNG so the random integers below are reproducible.
np.random.seed(123)

# Row labels as a FLAT list. Wrapping them in an extra list (a list of lists)
# makes pandas build a one-level MultiIndex instead of a plain Index.
data = ['row-1', 'row-2', 'row-3', 'row-4', 'row-5', 'row-6']

# 6x4 frame of random integers in 1..4 (randint's upper bound is exclusive).
df = pd.DataFrame(np.random.randint(1, 5, (6, 4)), index=data,
                  columns=['Col-1', 'Col-2', 'Col-3', 'Col-4'])
df

Unnamed: 0,Col-1,Col-2,Col-3,Col-4
row-1,3,2,3,3
row-2,1,3,3,2
row-3,4,3,4,2
row-4,3,2,1,2
row-5,3,4,2,1
row-6,3,1,4,2


# Add a New Column

In [5]:
# Add a new column filled with ones (float64). Assigning a scalar lets pandas
# broadcast it to every row, so the row count is not hard-coded.
df['Col-5'] = 1.0
df

Unnamed: 0,Col-1,Col-2,Col-3,Col-4,Col-5
row-1,3,2,3,3,1.0
row-2,1,3,3,2,1.0
row-3,4,3,4,2,1.0
row-4,3,2,1,2,1.0
row-5,3,4,2,1,1.0
row-6,3,1,4,2,1.0


# GroupBy Columns as grouping keys

In [6]:
# Count how many rows fall into each Col-1 group.
# size() returns a pandas Series; call .to_frame() on it to get a DataFrame.
df.groupby("Col-1").size()

Col-1
1    1
3    4
4    1
dtype: int64

In [7]:
# Group by Col-1 and calculate the mean of every other column per group.
col1_groups = df.groupby(["Col-1"])
col1_groups.mean()

Unnamed: 0_level_0,Col-2,Col-3,Col-4,Col-5
Col-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.0,3.0,2.0,1.0
3,2.25,2.5,2.0,1.0
4,3.0,4.0,2.0,1.0


In [8]:
# Group by Col-1 and total every other column within each group.
df.groupby(by="Col-1").sum()

Unnamed: 0_level_0,Col-2,Col-3,Col-4,Col-5
Col-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,3,2,1.0
3,9,10,8,4.0
4,3,4,2,1.0


In [9]:
# Group by two columns and calculate the mean of the remaining ones.
# Each distinct (Col-1, Col-2) pair acts as one combined key, so the result
# is indexed by both columns.
group_cols = ["Col-1", "Col-2"]
df.groupby(group_cols).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Col-3,Col-4,Col-5
Col-1,Col-2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,3.0,2.0,1.0
3,1,4.0,2.0,1.0
3,2,2.0,2.5,1.0
3,4,2.0,1.0,1.0
4,3,4.0,2.0,1.0


# GroupBy and Apply

In [10]:
def share_of_group_total(group):
    """Divide each value by its column's total within the group."""
    return group / group.sum()


# include_groups=False keeps the grouping column (Col-1) out of the frames
# passed to the function; group_keys=True adds Col-1 as the outer index level.
df.groupby(["Col-1"], group_keys=True).apply(share_of_group_total, include_groups=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Col-2,Col-3,Col-4,Col-5
Col-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,row-2,1.0,1.0,1.0,1.0
3,row-1,0.222222,0.3,0.375,0.25
3,row-4,0.222222,0.1,0.25,0.25
3,row-5,0.444444,0.2,0.125,0.25
3,row-6,0.111111,0.4,0.25,0.25
4,row-3,1.0,1.0,1.0,1.0


## GroupBy and Apply: the effect of `group_keys`

In [11]:
# Build a second, smaller DataFrame: one string key column and two integer columns.
columns_by_name = {
    'A': 'a a b'.split(),  # -> ['a', 'a', 'b']
    'B': [1, 2, 3],
    'C': [4, 6, 5],
}
df2 = pd.DataFrame(columns_by_name)
df2

Unnamed: 0,A,B,C
0,a,1,4
1,a,2,6
2,b,3,5


In [12]:
# group_keys=False: the result keeps df2's original row index; the group
# labels from column 'A' are not added as an index level.
g1 = df2.groupby('A', group_keys=False)
g1[['B', 'C']].apply(lambda grp: grp.div(grp.sum()))

Unnamed: 0,B,C
0,0.333333,0.4
1,0.666667,0.6
2,1.0,1.0


In [13]:
# group_keys=True: the group labels from column 'A' become the outer level
# of the result's index, above df2's original row index.
g2 = df2.groupby('A', group_keys=True)
g2[['B', 'C']].apply(lambda grp: grp.div(grp.sum()))

Unnamed: 0_level_0,Unnamed: 1_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0,0.333333,0.4
a,1,0.666667,0.6
b,2,1.0,1.0
