In [17]:
import numpy as np
import pandas as pd

In [18]:
# Raw student records. NaN marks a missing value; the Marks entry of 300
# is the out-of-range value that the outlier cells further down flag.
data = {
    "Student_ID": [f"S{n:02d}" for n in range(1, 9)],
    "Age":        [20, 21, 22, np.nan, 20, 23, 21, 22],
    "Marks":      [85, np.nan, 92, 45, 300, 78, 65, np.nan],
    "Attendance": [90, 75, 95, 40, 85, np.nan, 70, 60],
    "Gender":     ["Male", "Female"] * 4,
    "Department": ["CS", "IT", "CS", "IT", "CS", "CS", "IT", "CS"],
}
df = pd.DataFrame.from_dict(data)
print(df)

  Student_ID   Age  Marks  Attendance  Gender Department
0        S01  20.0   85.0        90.0    Male         CS
1        S02  21.0    NaN        75.0  Female         IT
2        S03  22.0   92.0        95.0    Male         CS
3        S04   NaN   45.0        40.0  Female         IT
4        S05  20.0  300.0        85.0    Male         CS
5        S06  23.0   78.0         NaN  Female         CS
6        S07  21.0   65.0        70.0    Male         IT
7        S08  22.0    NaN        60.0  Female         CS


In [19]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(8, 6)

In [20]:
# Per-column dtypes. Age, Marks and Attendance contain NaN, so they are
# stored as float64 even though the non-missing entries are whole numbers.
df.dtypes

Unnamed: 0,0
Student_ID,object
Age,float64
Marks,float64
Attendance,float64
Gender,object
Department,object


In [21]:
# Preview the first five rows (five is the default for head()).
print(df.head())

  Student_ID   Age  Marks  Attendance  Gender Department
0        S01  20.0   85.0        90.0    Male         CS
1        S02  21.0    NaN        75.0  Female         IT
2        S03  22.0   92.0        95.0    Male         CS
3        S04   NaN   45.0        40.0  Female         IT
4        S05  20.0  300.0        85.0    Male         CS


In [22]:
# Preview the last five rows (five is the default for tail()).
print(df.tail())

  Student_ID   Age  Marks  Attendance  Gender Department
3        S04   NaN   45.0        40.0  Female         IT
4        S05  20.0  300.0        85.0    Male         CS
5        S06  23.0   78.0         NaN  Female         CS
6        S07  21.0   65.0        70.0    Male         IT
7        S08  22.0    NaN        60.0  Female         CS


In [23]:
# Number of missing values in each column.
print(df.isna().sum())

Student_ID    0
Age           1
Marks         2
Attendance    1
Gender        0
Department    0
dtype: int64


In [24]:
# Number of missing values in each row (summing across the columns).
print(df.isna().sum(axis="columns"))

0    0
1    1
2    0
3    1
4    0
5    1
6    0
7    1
dtype: int64


In [25]:
# Same per-column count with the axis spelled out; axis 0 ("index") is the
# default, so this matches the plain .sum() call.
print(df.isna().sum(axis="index"))

Student_ID    0
Age           1
Marks         2
Attendance    1
Gender        0
Department    0
dtype: int64


In [26]:
# Fill the missing Age with the column mean (mean() skips NaN by default).
# NOTE(review): the mean is fractional (21.285714), so the imputed age is not
# a whole number; confirm that is acceptable for downstream use.
df.loc[df['Age'].isna(), 'Age'] = df['Age'].mean()
print(df)

  Student_ID        Age  Marks  Attendance  Gender Department
0        S01  20.000000   85.0        90.0    Male         CS
1        S02  21.000000    NaN        75.0  Female         IT
2        S03  22.000000   92.0        95.0    Male         CS
3        S04  21.285714   45.0        40.0  Female         IT
4        S05  20.000000  300.0        85.0    Male         CS
5        S06  23.000000   78.0         NaN  Female         CS
6        S07  21.000000   65.0        70.0    Male         IT
7        S08  22.000000    NaN        60.0  Female         CS


In [27]:
# Mean of the non-missing Marks. The 300 entry pulls it above every other
# mark in the column, which is why the median is used for imputation instead.
df['Marks'].mean()

np.float64(110.83333333333333)

In [28]:
# Fill the missing Marks with the column median, which is not dragged upward
# by the extreme 300 entry the way the mean is.
df.loc[df['Marks'].isna(), 'Marks'] = df['Marks'].median()
print(df)

  Student_ID        Age  Marks  Attendance  Gender Department
0        S01  20.000000   85.0        90.0    Male         CS
1        S02  21.000000   81.5        75.0  Female         IT
2        S03  22.000000   92.0        95.0    Male         CS
3        S04  21.285714   45.0        40.0  Female         IT
4        S05  20.000000  300.0        85.0    Male         CS
5        S06  23.000000   78.0         NaN  Female         CS
6        S07  21.000000   65.0        70.0    Male         IT
7        S08  22.000000   81.5        60.0  Female         CS


In [32]:

# Fill the missing Attendance with the column mean (mean() skips NaN by default).
df.loc[df['Attendance'].isna(), 'Attendance'] = df['Attendance'].mean()

In [None]:
# ==============================
# OUTLIERS - IMPORTANT POINTS
# ==============================

# 1. Outlier = Data point that is very far from other observations.

# 2. Outliers can:
#    - Distort mean
#    - Affect standard deviation
#    - Change regression line slope
#    - Reduce ML model accuracy

# 3. Mean is sensitive to outliers.
#    Median is more robust (less affected).

# 4. Common Methods to Detect Outliers:
#    - IQR Method (Most Important for interviews)
#    - Z-Score Method
#    - Boxplot Visualization

# 5. IQR Formula:
#    IQR = Q3 - Q1
#    Lower Bound = Q1 - 1.5 * IQR
#    Upper Bound = Q3 + 1.5 * IQR

# 6. Z-Score Formula:
#    Z = (x - mean) / standard_deviation
#    If |Z| > 3 → Possible Outlier

# 7. When to REMOVE Outliers:
#    - Data entry error (e.g., Marks = 300 when max is 100)
#    - Measurement mistake

# 8. When NOT to Remove Outliers:
#    - Fraud detection
#    - Anomaly detection
#    - Rare but valid real-world events

# 9. Ways to Handle Outliers:
#    - Remove rows
#    - Replace with mean/median
#    - Cap values (Winsorization)
#    - Use robust models (Tree-based models)

# 10. Linear Regression is sensitive to outliers.
#     Decision Trees & Random Forest are less sensitive.

# Always visualize data before removing outliers.

IQR DETECTION

In [33]:
# First and third quartiles of Marks and the spread between them (the IQR).
# NOTE(review): Marks has already been median-imputed at this point; the two
# filled values sit mid-distribution and narrow the IQR compared with the
# original non-missing marks. Consider computing the quartiles before imputing.
Q1 = df['Marks'].quantile(q=0.25)
Q3 = df['Marks'].quantile(q=0.75)
IQR = Q3 - Q1
print(Q1, Q3)
print(IQR)

74.75 86.75
12.0


In [34]:
# Outlier fences: anything more than 1.5 * IQR below Q1 or above Q3.
lower = Q1 - IQR * 1.5
upper = Q3 + IQR * 1.5
print(lower, upper, sep="\n")

56.75
104.75


In [35]:
# Keep only the rows whose Marks fall strictly outside the [lower, upper] fences.
outlier = df.loc[df['Marks'].lt(lower) | df['Marks'].gt(upper)]
print(outlier)

  Student_ID        Age  Marks  Attendance  Gender Department
3        S04  21.285714   45.0        40.0  Female         IT
4        S05  20.000000  300.0        85.0    Male         CS


In [36]:
# Common Methods to Handle Outliers:

# (A) Remove Outliers
#     Use when they are errors or unrealistic values.
#     Risk: May reduce dataset size.

# (B) Replace with Mean
#     Suitable when data is normally distributed.
#     But mean is sensitive to extreme values.

# (C) Replace with Median
#     Better option when data is skewed.
#     Median is robust to outliers.

# (D) Capping / Winsorization
#     Replace extreme values with upper/lower limit.
#     Keeps data size same.
#     Very useful in ML preprocessing.

| Feature    | Noise                                 | Outlier                       |
| ---------- | ------------------------------------- | ----------------------------- |
| Definition | Random error or disturbance           | Extreme value far from others |
| Nature     | Random / unwanted variation           | Abnormally high or low value  |
| Position   | May not be extreme                    | Usually extreme               |
| Cause      | Measurement error, sensor issue       | Data entry error, rare event  |
| Effect     | Increases variance                    | Shifts mean heavily           |
| Detection  | Harder to detect                      | Easier using IQR / Z-score    |
| Example    | Attendance = 87.2345 (rounding error) | Marks = 300 out of 100        |

