In [749]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_wine

In [750]:
# Load the wine dataset's feature table as a DataFrame, then preview the first
# rows and the (rows, columns) shape.
# NOTE(review): both `wine.head()` and `wine.shape` are rendered below, so this
# notebook presumably runs with `InteractiveShell.ast_node_interactivity = "all"`
# configured outside the visible cells — TODO confirm and set it explicitly,
# otherwise a fresh kernel shows only the last expression of each cell.
wine = load_wine(as_frame=True).data  # Target column is not included
wine.head()
wine.shape

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


(178, 13)

# Selecting and accessing data

The three main accessors are:

- `[]` (The Indexing Operator): Primarily for selecting columns.

- `.loc` (Label-based): Selects by the name of the index/column.

- `.iloc` (Integer-based): Selects by the position (0-based index).

In [751]:
# Series
wine['alcohol']  # or `df.alcohol`

# Dataframe (list of cols)
wine[['alcohol']]

0      14.23
1      13.20
2      13.16
3      14.37
4      13.24
       ...  
173    13.71
174    13.40
175    13.27
176    13.17
177    14.13
Name: alcohol, Length: 178, dtype: float64

Unnamed: 0,alcohol
0,14.23
1,13.20
2,13.16
3,14.37
4,13.24
...,...
173,13.71
174,13.40
175,13.27
176,13.17


In [752]:
wine[['alcohol', 'ash']]  # Select many columns

Unnamed: 0,alcohol,ash
0,14.23,2.43
1,13.20,2.14
2,13.16,2.67
3,14.37,2.50
4,13.24,2.87
...,...,...
173,13.71,2.45
174,13.40,2.48
175,13.27,2.26
176,13.17,2.37


## By Label (`.loc`)

Uses the actual name of the index/column. Note: `.loc` slicing is inclusive of the stop bound.

In [753]:
wine.loc[0]  # Series
wine.loc[[0]]  # Dataframe

alcohol                           14.23
malic_acid                         1.71
ash                                2.43
alcalinity_of_ash                 15.60
magnesium                        127.00
total_phenols                      2.80
flavanoids                         3.06
nonflavanoid_phenols               0.28
proanthocyanins                    2.29
color_intensity                    5.64
hue                                1.04
od280/od315_of_diluted_wines       3.92
proline                         1065.00
Name: 0, dtype: float64

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0


In [754]:
wine.loc[[0, 1, 2]]  # Select many rows

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0


In [755]:
wine.loc[4:9]  # Select range (start:stop:step)
wine.loc[4:9:2]  # With step too

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0
5,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0
6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0
7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0
9,13.86,1.35,2.27,16.0,98.0,2.98,3.15,0.22,1.85,7.22,1.01,3.55,1045.0


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0
6,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0


**Simultaneous Row & Column Access**

In [756]:
# Get col values for just 1 row

wine.loc[0, 'alcohol']  # Scalar (single value)
wine.loc[0, ['alcohol']]  # Series (of length 1, having value for just 1 col)
wine.loc[0, ['alcohol', 'ash', 'magnesium']]  # Series (length > 1, having values for many cols)

np.float64(14.23)

alcohol    14.23
Name: 0, dtype: float64

alcohol       14.23
ash            2.43
magnesium    127.00
Name: 0, dtype: float64

In [757]:
# Get col values for many rows

wine.loc[0:4, 'alcohol']  # Series
wine.loc[0:4, ['alcohol']]  # Dataframe

0    14.23
1    13.20
2    13.16
3    14.37
4    13.24
Name: alcohol, dtype: float64

Unnamed: 0,alcohol
0,14.23
1,13.2
2,13.16
3,14.37
4,13.24


In [758]:
wine.loc[0:4, 'alcohol':'flavanoids']  # Range of cols (includes last col too)
wine.loc[[10, 11, 12], 'alcohol':'flavanoids']  # Pick specific rows

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids
10,14.1,2.16,2.3,18.0,105.0,2.95,3.32
11,14.12,1.48,2.32,16.8,95.0,2.2,2.43
12,13.75,1.73,2.41,16.0,89.0,2.6,2.76


In [759]:
wine.loc[:, ['alcohol', 'ash']]  # All rows

Unnamed: 0,alcohol,ash
0,14.23,2.43
1,13.20,2.14
2,13.16,2.67
3,14.37,2.50
4,13.24,2.87
...,...,...
173,13.71,2.45
174,13.40,2.48
175,13.27,2.26
176,13.17,2.37


## By Position (`.iloc`)

In [760]:
wine.iloc[0]  # Series
wine.iloc[[0, 1, 2]]  # Dataframe

alcohol                           14.23
malic_acid                         1.71
ash                                2.43
alcalinity_of_ash                 15.60
magnesium                        127.00
total_phenols                      2.80
flavanoids                         3.06
nonflavanoid_phenols               0.28
proanthocyanins                    2.29
color_intensity                    5.64
hue                                1.04
od280/od315_of_diluted_wines       3.92
proline                         1065.00
Name: 0, dtype: float64

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0


In [761]:
wine.iloc[0:5]  # Last index not included

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [762]:
wine.iloc[-1]  # Last row (Series)

alcohol                          14.13
malic_acid                        4.10
ash                               2.74
alcalinity_of_ash                24.50
magnesium                        96.00
total_phenols                     2.05
flavanoids                        0.76
nonflavanoid_phenols              0.56
proanthocyanins                   1.35
color_intensity                   9.20
hue                               0.61
od280/od315_of_diluted_wines      1.60
proline                         560.00
Name: 177, dtype: float64

**Simultaneous Row & Column Access**

In [763]:
wine.iloc[0, 1]  # Scalar
wine.iloc[0:5, 0:2]  # First 5 rows, first 2 cols
wine.iloc[[0, 2], [1, 3]]  # Rows 0 & 2, Cols 1 & 3

np.float64(1.71)

Unnamed: 0,alcohol,malic_acid
0,14.23,1.71
1,13.2,1.78
2,13.16,2.36
3,14.37,1.95
4,13.24,2.59


Unnamed: 0,malic_acid,alcalinity_of_ash
0,1.71,15.6
2,2.36,18.6


## Fast Scalar Access (Single Value)

In [764]:
wine.at[0, 'alcohol']  # Label-based scalar lookup
wine.iat[0, 0]  # Integer-position scalar lookup

np.float64(14.23)

np.float64(14.23)

# Boolean indexing - to filter rows

You create a "mask" — a Series of `True`/`False` values — and pass it to the DataFrame.

The DataFrame keeps only the rows where the mask is `True`.

In [765]:
mask = wine['alcohol'] > 14.5  # Series, T/F for each row
mask.head()
mask.sum()  # How many `True`s we have

0    False
1    False
2    False
3    False
4    False
Name: alcohol, dtype: bool

np.int64(2)

In [766]:
wine[mask]  # Extract rows where `True`

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
8,14.83,1.64,2.17,14.0,97.0,2.8,2.98,0.29,1.98,5.2,1.08,2.85,1045.0
13,14.75,1.73,2.39,11.4,91.0,3.1,3.69,0.43,2.81,5.4,1.25,2.73,1150.0


**Multiple Conditions**

You must use bitwise operators and wrap every condition in parentheses.

- AND (`&`)
- OR (`|`)
- NOT (`~`)

In [767]:
wine[(wine['malic_acid'] > 4) & (wine['alcohol'] < 12)]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
124,11.87,4.31,2.39,21.0,82.0,2.86,3.03,0.21,2.91,2.8,0.75,3.64,380.0


In [768]:
wine.shape
wine[(wine['alcohol'] > 14.5)].shape
wine[~(wine['alcohol'] > 14.5)].shape

(178, 13)

(2, 13)

(176, 13)

**Handy Filtering Methods**

Pandas provides built-in methods that return boolean masks, making your code cleaner.

In [769]:
mask = wine['magnesium'].isin([120, 70, 95])  # If a value exists in a list
wine[mask]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
11,14.12,1.48,2.32,16.8,95.0,2.2,2.43,0.26,1.57,5.0,1.17,2.82,1280.0
16,14.3,1.92,2.72,20.0,120.0,2.8,3.14,0.33,1.97,6.2,1.07,2.65,1280.0
23,12.85,1.6,2.52,17.8,95.0,2.48,2.37,0.26,1.46,3.93,1.09,3.63,1015.0
89,12.08,1.33,2.3,23.6,70.0,2.2,1.59,0.42,1.38,1.74,1.07,3.21,625.0
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.7,0.64,1.74,740.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0


In [770]:
mask = wine['total_phenols'].between(1.50, 1.55)  # Inclusive of the boundaries by default
wine[mask]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
130,12.86,1.35,2.32,18.0,122.0,1.51,1.25,0.21,0.94,4.1,0.76,1.29,630.0
140,12.93,2.81,2.7,21.0,96.0,1.54,0.5,0.53,0.75,4.6,0.77,2.31,600.0
142,13.52,3.17,2.72,23.5,97.0,1.55,0.52,0.5,0.55,4.35,0.89,2.06,520.0
145,13.16,3.57,2.15,21.0,102.0,1.5,0.55,0.43,1.3,4.0,0.6,1.68,830.0
168,13.58,2.58,2.69,24.5,105.0,1.55,0.84,0.39,1.54,8.66,0.74,1.8,750.0


# String Methods (`.str`)

If you have text data, you can use vectorised string methods.

In [771]:
# Small demo table used by the string, null-check, rename, drop, map and apply
# examples: one text column, two integer columns, and a `marks` column with
# missing values for two rows.
students = pd.DataFrame(
    dict(
        name=['rahul', 'harikesh', 'vishnu', 'vidu'],
        age=[22, 23, 10, 21],
        salary=[1000, 950, 1100, 5000],
        marks=[100, None, None, 90],
    )
)

In [772]:
students['name'].str.upper()

0       RAHUL
1    HARIKESH
2      VISHNU
3        VIDU
Name: name, dtype: object

In [773]:
mask = students['name'].str.startswith('vi')
students[mask]

Unnamed: 0,name,age,salary,marks
2,vishnu,10,1100,
3,vidu,21,5000,90.0


In [774]:
mask = students['name'].str.contains('sh')
students[mask]

Unnamed: 0,name,age,salary,marks
1,harikesh,23,950,
2,vishnu,10,1100,


# Null Checks (`.isna`/`.notna`)

In [775]:
mask = students['marks'].isna()
students[mask]

mask = students['marks'].notna()
students[mask]

Unnamed: 0,name,age,salary,marks
1,harikesh,23,950,
2,vishnu,10,1100,


Unnamed: 0,name,age,salary,marks
0,rahul,22,1000,100.0
3,vidu,21,5000,90.0


# Modifying

**What is a View vs. a Copy?**

DataFrame values are stored in memory. To be efficient, Pandas tries not to duplicate data unless it has to.

- View: This is a "window" into the original data. It shares the same memory buffer.
  - If you change the View, you change the Original.

- Copy: This is a completely new object with its own memory.
  - If you change the Copy, the Original is untouched.

**The Problem:** When you filter or slice data, Pandas doesn't always tell you explicitly if it returned a View or a Copy.

It decides based on memory layout and performance. This ambiguity causes the `SettingWithCopyWarning`.

In [776]:
# Build a 20-row demo table of job statuses and priorities in shuffled order.
# A seeded generator is used so the shuffle, and therefore every cell that
# filters or updates `jobs`, produces the same rows on each
# Restart Kernel -> Run All.
rng = np.random.default_rng(42)

jobs = pd.DataFrame({
    # 10 'pending' + 10 'processing', shuffled
    "status": rng.permutation(['pending'] * 10 + ['processing'] * 10),
    # 10 'low' + 5 'medium' + 5 'high', shuffled independently of `status`
    "priority": rng.permutation(['low'] * 10 + ['medium'] * 5 + ['high'] * 5),
})

jobs.head()

Unnamed: 0,status,priority
0,processing,high
1,pending,medium
2,processing,high
3,pending,low
4,pending,low


## The "SettingWithCopyWarning"

This warning usually happens when you perform Chained Assignment.

In [777]:
# This is a Copy (a temporary subset of original dataframe)
jobs[jobs['status'] == 'pending']

Unnamed: 0,status,priority
1,pending,medium
3,pending,low
4,pending,low
5,pending,low
6,pending,high
7,pending,low
9,pending,medium
12,pending,medium
14,pending,low
17,pending,low


In [778]:
# Updates column 'priority' inside that temporary copy
jobs[jobs['status'] == 'pending']['priority'] = 'high'

# The temporary copy is thrown away. The original dataframe is not updated.
# Pandas sees this and warns.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jobs[jobs['status'] == 'pending']['priority'] = 'high'


In [779]:
jobs[jobs['status'] == 'pending']  # Original dataframe is NOT updated

Unnamed: 0,status,priority
1,pending,medium
3,pending,low
4,pending,low
5,pending,low
6,pending,high
7,pending,low
9,pending,medium
12,pending,medium
14,pending,low
17,pending,low


**How to Update Data Correctly**

**Scenario A: You want to update the ORIGINAL DataFrame**

Use `.loc`. This is a single operation, not two steps, so it guarantees you are operating on the original data.

In [780]:
jobs.loc[jobs['status'] == 'pending', 'priority'] = 'high'

In [781]:
jobs[jobs['status'] == 'pending']  # Original dataframe is updated

Unnamed: 0,status,priority
1,pending,high
3,pending,high
4,pending,high
5,pending,high
6,pending,high
7,pending,high
9,pending,high
12,pending,high
14,pending,high
17,pending,high


**Scenario B: You want a SEPARATE COPY**

You intentionally create a subset and want to work on it separately without affecting the main dataframe.

Explicitly tell Pandas to make a copy using `.copy()`.

In [782]:
jobs_processing = jobs[jobs['status'] == 'processing'].copy()
jobs_processing  # Explicitly create a copy

Unnamed: 0,status,priority
0,processing,high
2,processing,high
8,processing,high
10,processing,low
11,processing,low
13,processing,high
15,processing,medium
16,processing,low
18,processing,low
19,processing,medium


In [783]:
jobs_processing['priority'] = 'low'  # No warning. Pandas knows this is a standalone object
jobs_processing  # Your copied subset is updated

Unnamed: 0,status,priority
0,processing,low
2,processing,low
8,processing,low
10,processing,low
11,processing,low
13,processing,low
15,processing,low
16,processing,low
18,processing,low
19,processing,low


In [784]:
jobs[jobs['status'] == 'processing']  # Original data is NOT updated

Unnamed: 0,status,priority
0,processing,high
2,processing,high
8,processing,high
10,processing,low
11,processing,low
13,processing,high
15,processing,medium
16,processing,low
18,processing,low
19,processing,medium


# Renaming Columns & Rows

The primary method is `.rename()`, which accepts a dictionary mapping `{ 'Old Name' : 'New Name' }`.

⚠️ By default, `.rename()`/`.drop()` do not change the original DataFrame. They return a new copy with the changes.

To save changes:

- Reassign the variable (`df = df.drop(...)`)
- To change in-place: Use the argument `inplace=True` (not preferred).

In [785]:
students.rename(columns={'age': 'years'})

Unnamed: 0,name,years,salary,marks
0,rahul,22,1000,100.0
1,harikesh,23,950,
2,vishnu,10,1100,
3,vidu,21,5000,90.0


In [786]:
_students = students.copy()  # Work on a copy so `students` itself stays unchanged
_students.columns = ['first_name', 'years', "income", "score"]  # Bulk rename: assigning to `.columns` modifies `_students` in place
_students

Unnamed: 0,first_name,years,income,score
0,rahul,22,1000,100.0
1,harikesh,23,950,
2,vishnu,10,1100,
3,vidu,21,5000,90.0


# Dropping Columns

⭐ **Axis 0: Points DOWN (↓). It follows the Rows.**

⭐ **Axis 1: Points ACROSS (→). It follows the Columns.**

When you do math, the axis parameter tells Pandas **which dimension to squash (collapse)**.

* **`df.sum(axis=0)`**: The arrow goes **DOWN** the rows. It squashes all the rows together.
  * Result: You get totals for each Column.


* **`df.sum(axis=1)`**: The arrow goes **ACROSS** the columns. It squashes all the columns together.
  * Result: You get a total for each Row.

When you use `drop`, the axis parameter tells Pandas **where to look for the label**.

* **`df.drop(..., axis=0)`**: Look at the **Row Labels** (Index).
  * Result: You delete a Row.


* **`df.drop(..., axis=1)`**: Look at the **Column Names** (Headers).
  * Result: You delete a Column.

In [787]:
# Drop a single column
students.drop('age', axis=1)

# Drop multiple columns
students.drop(['age', 'salary'], axis=1)

Unnamed: 0,name,salary,marks
0,rahul,1000,100.0
1,harikesh,950,
2,vishnu,1100,
3,vidu,5000,90.0


Unnamed: 0,name,marks
0,rahul,100.0
1,harikesh,
2,vishnu,
3,vidu,90.0


In [788]:
# Drop & Return
popped_col = _students.pop('first_name')  # modifies inplace
popped_col

0       rahul
1    harikesh
2      vishnu
3        vidu
Name: first_name, dtype: object

# Dropping Rows

In [789]:
# Drop row with index label 0
students.drop(0)

# Drop rows by position (e.g., first 5 rows)
# You must get the labels of those positions first
jobs.drop(jobs.index[0:5])

Unnamed: 0,name,age,salary,marks
1,harikesh,23,950,
2,vishnu,10,1100,
3,vidu,21,5000,90.0


Unnamed: 0,status,priority
5,pending,high
6,pending,high
7,pending,high
8,processing,high
9,pending,high
10,processing,low
11,processing,low
12,pending,high
13,processing,high
14,pending,high


**By Condition (Filtering)**

⭐ Usually, we don't "drop" rows; we just "keep" the ones we want. This is often faster and more readable.

In [790]:
jobs[jobs['status'] == 'pending']  # Dropped rows where status was 'processing'

Unnamed: 0,status,priority
1,pending,high
3,pending,high
4,pending,high
5,pending,high
6,pending,high
7,pending,high
9,pending,high
12,pending,high
14,pending,high
17,pending,high


# `.map()` - Substitution

Scope: Works only on a Series (a single column).

Best For: Mapping values using a dictionary (lookup) or a simple function.

In [791]:
name_map = {
    "rahul": "me",
    # "harikesh": Not in dictionary,
    "vidu": "bro",
    "vishnu": "monkey",
}

# Note: If a value is NOT in the dictionary, .map() converts it to NaN
students['name'].map(name_map)  # Reassign to update

0        me
1       NaN
2    monkey
3       bro
Name: name, dtype: object

In [792]:
# Syntax 2: Function

# Calculate length of every string in 'name'
students['name'].map(len)

0    5
1    8
2    6
3    4
Name: name, dtype: int64

# `.apply()` - Substitution

Similar to `.map()`, but it ONLY accepts functions.

Best For: applying complex functions that can't be vectorized.

⚠️ `.apply()` is essentially a Python for loop under the hood. It is slow compared to native Pandas/NumPy vectorization.

**Always try to use direct math first.**

| Task | **Bad (Slow)** using `.apply()` | **Good (Fast)** Vectorized |
| --- | --- | --- |
| **Add 2 cols** | `df.apply(lambda x: x['A'] + x['B'], axis=1)` | `df['A'] + df['B']` |
| **Multiply col** | `df['A'].apply(lambda x: x * 5)` | `df['A'] * 5` |
| **String Slice** | `df['S'].apply(lambda x: x[:3])` | `df['S'].str[:3]` |

In [793]:
# A. On a Series (Element-wise)

def classify_salary(salary):
    """Label a salary as "Rich" when it is at least 5000, otherwise "Poor"."""
    if salary >= 5000:
        return "Rich"
    return "Poor"


# Apply function to every cell in 'salary'
students['salary'].apply(classify_salary)  # Reassign to update

0    Poor
1    Poor
2    Poor
3    Rich
Name: salary, dtype: object

B. On a DataFrame (Row-wise or Column-wise)

axis=1 (Row-wise):
- The function receives each row as a Series.
- You can access multiple columns inside the function.

axis=0 (Column-wise):
- The function receives each column.
- Used for aggregates like sum or mean.


In [798]:
def calculate_power(row):
    """Return the length of the row's 'name' multiplied by the row's 'salary'.

    `row` is indexed by the keys 'name' and 'salary'; with
    `DataFrame.apply(..., axis=1)` each row is passed in one at a time.
    """
    name_length = len(row['name'])
    return name_length * row['salary']


# axis=1: passes all rows to the function, one by one
students.apply(calculate_power, axis=1)

0     5000
1     7600
2     6600
3    20000
dtype: int64

In [799]:
def calculate_mean(col):
    """Return the arithmetic mean of `col`: its built-in sum divided by its length.

    Raises ZeroDivisionError when `col` is empty.
    """
    total = sum(col)
    count = len(col)
    return total / count


# axis=0: passes all cols to the function, one by one
wine.apply(calculate_mean, axis=0)

alcohol                          13.000618
malic_acid                        2.336348
ash                               2.366517
alcalinity_of_ash                19.494944
magnesium                        99.741573
total_phenols                     2.295112
flavanoids                        2.029270
nonflavanoid_phenols              0.361854
proanthocyanins                   1.590899
color_intensity                   5.058090
hue                               0.957449
od280/od315_of_diluted_wines      2.611685
proline                         746.893258
dtype: float64