# Handling Outliers

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Load the practice dataset from its location on the local machine
csv_path = "C:\\Users\\Kashish\\Downloads\\outliers_practice.csv"
df = pd.read_csv(csv_path)

# Preview the first 5 rows (last expression, so the notebook renders it)
df.head()

Unnamed: 0,Age,Income,Units_Sold,Rating
0,25,42000,15,4.2
1,27,46000,18,4.5
2,29,50000,22,4.1
3,31,52000,20,4.3
4,33,55000,25,4.0


In [5]:
# STEP 2: Basic Exploration
# -----------------------------------------------------------

# Dimensions of the loaded frame as (rows, columns)
frame_shape = df.shape
print("Shape:", frame_shape)

# Per-column dtype and non-null count (info() prints its own report)
df.info()

# Summary statistics (count, mean, std, min, quartiles, max);
# left as the final expression so the notebook renders the table
df.describe()

Shape: (48, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Age         48 non-null     int64  
 1   Income      48 non-null     int64  
 2   Units_Sold  48 non-null     int64  
 3   Rating      48 non-null     float64
dtypes: float64(1), int64(3)
memory usage: 1.6 KB


Unnamed: 0,Age,Income,Units_Sold,Rating
count,48.0,48.0,48.0,48.0
mean,46.5625,111187.5,46.395833,4.21875
std,16.837024,147154.377795,45.182721,0.589863
min,19.0,15000.0,5.0,1.5
25%,33.75,54750.0,25.75,4.1
50%,45.5,74000.0,36.5,4.3
75%,57.25,93250.0,48.25,4.4
max,90.0,800000.0,250.0,5.0


In [6]:
# STEP 3: Identify Outliers using IQR Method
# -----------------------------------------------------------
# IQR = Interquartile Range
# Q1 = 25th percentile
# Q3 = 75th percentile
# IQR = Q3 - Q1
# Outlier Rule = values < (Q1 - 1.5*IQR) OR > (Q3 + 1.5*IQR)

# Work on numeric columns only: quantiles and the < / > comparisons are only
# meaningful for numbers, and the handling cells further down iterate over
# this same numeric selection when they index the limits and the mask.
numeric_df = df.select_dtypes(include='number')

Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

print("IQR Values:")
print(IQR)

# Calculate lower and upper limits (one value per numeric column)
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

print("\nLower Limits:")
print(lower_limit)
print("\nUpper Limits:")
print(upper_limit)

# Detect outliers (True = outlier); the limits are aligned to columns by label
outlier_mask = (numeric_df < lower_limit) | (numeric_df > upper_limit)

# Show rows containing an outlier in at least one column
df[outlier_mask.any(axis=1)]

IQR Values:
Age              23.5
Income        38500.0
Units_Sold       22.5
Rating            0.3
dtype: float64

Lower Limits:
Age             -1.50
Income       -3000.00
Units_Sold      -8.00
Rating           3.65
dtype: float64

Upper Limits:
Age               92.50
Income        151000.00
Units_Sold        82.00
Rating             4.85
dtype: float64


Unnamed: 0,Age,Income,Units_Sold,Rating
42,19,15000,5,1.5
43,20,17000,7,2.0
44,75,350000,120,5.0
45,80,500000,150,5.0
46,85,600000,200,5.0
47,90,800000,250,5.0


### OUTLIER HANDLING METHODS

In [10]:
# METHOD 1 → REMOVE OUTLIERS
# -----------------------------------------------------------
# Drop every row that has an outlier in at least one column.
# Appropriate when the extreme values are mistakes or noise.

# A row is kept only when none of its cells are flagged in outlier_mask
row_is_clean = (~outlier_mask).all(axis=1)
df_removed = df[row_is_clean]
df_removed.head()


Unnamed: 0,Age,Income,Units_Sold,Rating
0,25,42000,15,4.2
1,27,46000,18,4.5
2,29,50000,22,4.1
3,31,52000,20,4.3
4,33,55000,25,4.0


In [11]:
# METHOD 2 → CAP OUTLIERS (WINSORIZATION)
# -----------------------------------------------------------
# Pull extreme values back to the nearest acceptable limit:
# anything above upper_limit becomes upper_limit, and
# anything below lower_limit becomes lower_limit.

df_capped = df.copy()

numeric_columns = df.select_dtypes(include='number').columns
for name in numeric_columns:
    floor, ceiling = lower_limit[name], upper_limit[name]
    values = df_capped[name]
    # Upper bound is checked first, then the lower bound for everything else
    df_capped[name] = np.where(
        values > ceiling,
        ceiling,
        np.where(values < floor, floor, values),
    )

df_capped.head()


Unnamed: 0,Age,Income,Units_Sold,Rating
0,25.0,42000.0,15.0,4.2
1,27.0,46000.0,18.0,4.5
2,29.0,50000.0,22.0,4.1
3,31.0,52000.0,20.0,4.3
4,33.0,55000.0,25.0,4.0


In [13]:
# METHOD 3 → Z-SCORE METHOD
# -----------------------------------------------------------
# Z-score = (value - mean) / std
# A value with |Z-score| > 3 is usually treated as an outlier.

from scipy import stats

df_z = df.copy()

# Absolute z-score of every cell in the numeric columns
numeric_part = df_z.select_dtypes(include='number')
z_scores = np.abs(stats.zscore(numeric_part))

# Keep the rows where at least one column exceeds the threshold of 3
row_is_extreme = (z_scores > 3).any(axis=1)
outlier_rows_z = df_z[row_is_extreme]
outlier_rows_z.head()


Unnamed: 0,Age,Income,Units_Sold,Rating
42,19,15000,5,1.5
43,20,17000,7,2.0
46,85,600000,200,5.0
47,90,800000,250,5.0


In [15]:
# METHOD 4 → REPLACE OUTLIERS WITH MEAN
# -----------------------------------------------------------
# Every value flagged in outlier_mask is overwritten with its column mean.
# The mean is taken over the whole column, flagged values included.

df_mean = df.copy()

numeric_columns = df_mean.select_dtypes(include='number').columns
# Means are computed up front, one column at a time, from the untouched copy
column_means = {name: df_mean[name].mean() for name in numeric_columns}

for name, fill_value in column_means.items():
    flagged = outlier_mask[name]
    df_mean[name] = np.where(flagged, fill_value, df_mean[name])

df_mean.head()


Unnamed: 0,Age,Income,Units_Sold,Rating
0,25.0,42000.0,15.0,4.2
1,27.0,46000.0,18.0,4.5
2,29.0,50000.0,22.0,4.1
3,31.0,52000.0,20.0,4.3
4,33.0,55000.0,25.0,4.0


In [17]:
# METHOD 5 → REPLACE OUTLIERS WITH MEDIAN
# -----------------------------------------------------------
# The median is more robust than the mean when the data is skewed,
# so using it as the replacement limits the pull of extreme values.

df_median = df.copy()

for name in df_median.select_dtypes(include='number').columns:
    current = df_median[name]
    # Flagged cells take the column median; all other cells keep their value
    df_median[name] = np.where(outlier_mask[name], current.median(), current)

df_median.head()

Unnamed: 0,Age,Income,Units_Sold,Rating
0,25.0,42000.0,15.0,4.2
1,27.0,46000.0,18.0,4.5
2,29.0,50000.0,22.0,4.1
3,31.0,52000.0,20.0,4.3
4,33.0,55000.0,25.0,4.0


In [20]:
# METHOD 6 → LOG TRANSFORMATION
# -----------------------------------------------------------
# Definition:
# Compresses large values so the gap between typical and extreme
# observations shrinks. The outliers are kept, only their impact is reduced.

df_log = df.copy()

for col in ["Income", "Units_Sold"]:
    # log1p(x) computes log(1 + x). The +1 shift keeps zeros valid,
    # since log(0) is undefined, and the dedicated function is more
    # accurate than np.log(x + 1) for values close to zero.
    df_log[col] = np.log1p(df_log[col])

df_log.head()

Unnamed: 0,Age,Income,Units_Sold,Rating
0,25,10.645449,2.772589,4.2
1,27,10.736418,2.944439,4.5
2,29,10.819798,3.135494,4.1
3,31,10.859018,3.044522,4.3
4,33,10.915107,3.258097,4.0


In [22]:
# Write each outlier-handled frame to its own CSV (row index omitted).
# Each entry pairs a frame with its destination file.
exports = [
    # 1) Remove outliers
    (df_removed, r"C:\Users\Kashish\Downloads\df_removed_outliers.csv"),
    # 2) Cap outliers (winsorization)
    (df_capped, r"C:\Users\Kashish\Downloads\df_capped_outliers.csv"),
    # 3) Z-score outlier rows
    (outlier_rows_z, r"C:\Users\Kashish\Downloads\df_zscore_outliers.csv"),
    # 4) Replace outliers with mean
    (df_mean, r"C:\Users\Kashish\Downloads\df_outliers_mean.csv"),
    # 5) Replace outliers with median
    (df_median, r"C:\Users\Kashish\Downloads\df_outliers_median.csv"),
    # 6) Log transformation file
    (df_log, r"C:\Users\Kashish\Downloads\df_log_transformed.csv"),
]

for frame, target in exports:
    frame.to_csv(target, index=False)

print("All outlier-handled files saved to Downloads folder.")


All outlier-handled files saved to Downloads folder.
