# Part 3
1. Descriptive Functions
2. Function Application and Iteration
3. Aggregation Techniques

## 1. Descriptive Function
- head / tail
- shape
- info
- accessing raw data

In [2]:
# Setup default imports here
import pandas as pd
import numpy as np

In [7]:
# example df
df_len = 50
example_df = pd.DataFrame({
    'A': np.random.randint(0, 100, df_len),
    'B': range(0, df_len),
    'C': np.random.randint(0, 100, df_len),
    'D': np.random.randint(0, 100, df_len),
    'E': np.random.choice(["E1", "E2", "E3", "E4", "E5"], df_len),
    'F': np.random.choice(["F1", "F2", "F3", "F4", "F5"], df_len),
})

In [8]:
# head / tail
example_df.head()

Unnamed: 0,A,B,C,D,E,F
0,53,0,59,57,E5,F5
1,14,1,24,5,E5,F2
2,4,2,19,14,E2,F5
3,68,3,13,82,E1,F3
4,12,4,66,73,E2,F1


In [9]:
example_df.tail()

Unnamed: 0,A,B,C,D,E,F
45,18,45,55,66,E2,F4
46,53,46,82,90,E4,F2
47,69,47,33,96,E5,F4
48,85,48,14,88,E3,F5
49,54,49,63,55,E2,F2


In [13]:
# shape (rows x columns)
example_df.shape

(50, 6)

In [14]:
# info
example_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       50 non-null     int32 
 1   B       50 non-null     int64 
 2   C       50 non-null     int32 
 3   D       50 non-null     int32 
 4   E       50 non-null     object
 5   F       50 non-null     object
dtypes: int32(3), int64(1), object(2)
memory usage: 1.9+ KB


## accessing raw data
use .to_numpy(), avoid using .values where possible
- .array returns ExtensionArray and never copies data
- .to_numpy() returns a numpy array

Why?
.values return type can be ambiguous as it could possibly resolve to either .array or .to_numpy()

In [21]:
# to_numpy()
example_df.to_numpy()

array([[53, 0, 59, 57, 'E5', 'F5'],
       [14, 1, 24, 5, 'E5', 'F2'],
       [4, 2, 19, 14, 'E2', 'F5'],
       [68, 3, 13, 82, 'E1', 'F3'],
       [12, 4, 66, 73, 'E2', 'F1'],
       [79, 5, 19, 43, 'E2', 'F4'],
       [99, 6, 57, 99, 'E4', 'F2'],
       [27, 7, 54, 29, 'E5', 'F5'],
       [66, 8, 5, 96, 'E4', 'F4'],
       [8, 9, 45, 57, 'E3', 'F2'],
       [32, 10, 53, 19, 'E5', 'F3'],
       [96, 11, 81, 20, 'E4', 'F1'],
       [73, 12, 51, 19, 'E4', 'F2'],
       [27, 13, 51, 87, 'E5', 'F3'],
       [22, 14, 23, 66, 'E1', 'F1'],
       [32, 15, 23, 17, 'E2', 'F4'],
       [53, 16, 26, 17, 'E2', 'F1'],
       [87, 17, 81, 30, 'E5', 'F1'],
       [24, 18, 2, 31, 'E2', 'F5'],
       [94, 19, 74, 49, 'E5', 'F4'],
       [95, 20, 13, 8, 'E3', 'F5'],
       [75, 21, 29, 71, 'E2', 'F3'],
       [69, 22, 57, 31, 'E3', 'F3'],
       [10, 23, 72, 12, 'E5', 'F1'],
       [93, 24, 93, 52, 'E1', 'F1'],
       [28, 25, 40, 58, 'E1', 'F1'],
       [97, 26, 93, 16, 'E2', 'F5'],
       [66, 27, 3

## 2. Function Application and Iteration
### table-wise: pipe()
### row/column: apply()
### element-wise: applymap()
### iteration

## table-wise: pipe()

In [28]:
simple_df = pd.DataFrame({
    'A': pd.Series(1, index=range(0,10)),
    'B': pd.Series(1, index=range(0,10))
})

def add_n(input_df, n):
    return input_df + n

def multiply_by_n(input_df, n):
    return input_df * n

# should return 20
simple_df.pipe(add_n, 9).pipe(multiply_by_n, 2).head()

Unnamed: 0,A,B
0,20,20
1,20,20
2,20,20
3,20,20
4,20,20


## row/column: apply()

In [42]:
def add_9_multiply_by_2(row):
    return (row + 9) * 2

simple_df.apply(add_9_multiply_by_2).head()

Unnamed: 0,A,B
0,20,20
1,20,20
2,20,20
3,20,20
4,20,20


## element-wise: applymap()

In [38]:
def add_to_element_9_multiply_by_2(row):
    return (row + 9) * 2

simple_df.applymap(add_to_element_9_multiply_by_2).head()

Unnamed: 0,A,B
0,20,20
1,20,20
2,20,20
3,20,20
4,20,20


## iteration

## 3. GroupBy / Aggregation Data

In [44]:
# Create a DataFrame
data = {
    'Name': ['John', 'Jane', 'John', 'Alice', 'Jane'],
    'Age': [25, 30, 28, 32, 27],
    'Salary': [50000, 60000, 55000, 70000, 65000]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary
0,John,25,50000
1,Jane,30,60000
2,John,28,55000
3,Alice,32,70000
4,Jane,27,65000


In [46]:
# Basic grouping with aggregation
# Group by 'Name' and calculate the average salary
grouped_df = df.groupby('Name')['Salary'].mean()

print(grouped_df)

Name
Alice    70000.0
Jane     62500.0
John     52500.0
Name: Salary, dtype: float64


In [47]:
# Grouping with Multiple Columns and Aggregations:
# Group by 'Name' and 'Age', calculate the average salary and maximum age
grouped_df = df.groupby(['Name', 'Age']).agg({'Salary': 'mean', 'Age': 'max'})
print(grouped_df)

            Salary  Age
Name  Age              
Alice 32   70000.0   32
Jane  27   65000.0   27
      30   60000.0   30
John  25   50000.0   25
      28   55000.0   28


In [48]:
# Grouping with Custom Aggregation Functions:
# Define a custom aggregation function
def salary_range(series):
    return series.max() - series.min()


# Group by 'Name' and calculate the salary range using the custom function
grouped_df = df.groupby('Name').agg({'Salary': salary_range})
print(grouped_df)

       Salary
Name         
Alice       0
Jane     5000
John     5000
