In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import matplotlib.pyplot as plt

print("SCALING IN MACHINE LEARNING")
print("="*50)

SCALING IN MACHINE LEARNING


In [2]:
# Example: Why scaling matters
data_unscaled = pd.DataFrame({
    'age': [25, 30, 35, 40, 45],
    'salary': [30000, 50000, 75000, 90000, 120000],
    'experience': [2, 5, 8, 12, 15]
})

print("Original Data (Unscaled):")
print(data_unscaled)
print(f"\nRanges:")
print(f"Age: {data_unscaled['age'].min()} - {data_unscaled['age'].max()}")
print(f"Salary: {data_unscaled['salary'].min()} - {data_unscaled['salary'].max()}")
print(f"Experience: {data_unscaled['experience'].min()} - {data_unscaled['experience'].max()}")

print("\nPROBLEM: Salary dominates due to large scale!")
print("Machine learning algorithms will focus on salary and ignore age/experience")

Original Data (Unscaled):
   age  salary  experience
0   25   30000           2
1   30   50000           5
2   35   75000           8
3   40   90000          12
4   45  120000          15

Ranges:
Age: 25 - 45
Salary: 30000 - 120000
Experience: 2 - 15

PROBLEM: Salary dominates due to large scale!
Machine learning algorithms will focus on salary and ignore age/experience


In [3]:
# Standard Scaler - most common scaling method
scaler = StandardScaler()

# Apply Standard Scaling
data_scaled = data_unscaled.copy()
data_scaled[['age', 'salary', 'experience']] = scaler.fit_transform(data_unscaled[['age', 'salary', 'experience']])

print("\n" + "="*50)
print("STANDARD SCALER RESULTS:")
print("="*50)
print("Scaled Data:")
print(data_scaled)

print(f"\nAfter scaling - all features have:")
print(f"Mean ‚âà 0, Standard Deviation ‚âà 1")
print(f"Age mean: {data_scaled['age'].mean():.6f}")
print(f"Salary mean: {data_scaled['salary'].mean():.6f}")
print(f"Experience mean: {data_scaled['experience'].mean():.6f}")


STANDARD SCALER RESULTS:
Scaled Data:
        age    salary  experience
0 -1.414214 -1.376396   -1.369474
1 -0.707107 -0.736212   -0.727533
2  0.000000  0.064018   -0.085592
3  0.707107  0.544157    0.770329
4  1.414214  1.504433    1.412270

After scaling - all features have:
Mean ‚âà 0, Standard Deviation ‚âà 1
Age mean: 0.000000
Salary mean: 0.000000
Experience mean: -0.000000


In [4]:
print("\nHOW STANDARD SCALER WORKS:")
print("="*35)
print("Formula: z = (x - mean) / standard_deviation")
print("\nStep by step for 'age' column:")

age_mean = data_unscaled['age'].mean()
age_std = data_unscaled['age'].std()

print(f"Original ages: {data_unscaled['age'].tolist()}")
print(f"Mean: {age_mean}")
print(f"Std: {age_std}")

print("\nTransformation:")
for i, age in enumerate(data_unscaled['age']):
    scaled_age = (age - age_mean) / age_std
    print(f"({age} - {age_mean}) / {age_std} = {scaled_age:.3f}")


HOW STANDARD SCALER WORKS:
Formula: z = (x - mean) / standard_deviation

Step by step for 'age' column:
Original ages: [25, 30, 35, 40, 45]
Mean: 35.0
Std: 7.905694150420948

Transformation:
(25 - 35.0) / 7.905694150420948 = -1.265
(30 - 35.0) / 7.905694150420948 = -0.632
(35 - 35.0) / 7.905694150420948 = 0.000
(40 - 35.0) / 7.905694150420948 = 0.632
(45 - 35.0) / 7.905694150420948 = 1.265


In [6]:
scaling_guide = pd.DataFrame({
    'Scaler': ['StandardScaler', 'MinMaxScaler', 'RobustScaler'],
    'Formula': [
        '(x - mean) / std',
        '(x - min) / (max - min)', 
        '(x - median) / IQR'
    ],
    'Output Range': [
        'Mean=0, Std=1',
        '[0, 1]',
        'Median=0, robust to outliers'
    ],
    'Best For': [
        'Normal distribution, most ML algorithms',
        'Bounded range needed (0-1)',
        'Data with outliers'
    ],
    'Algorithms': [
        'SVM, Neural Networks, KNN, PCA',
        'Neural Networks, Image processing',
        'When outliers present'
    ]
})

print("\n" + "="*80)
print("SCALING METHODS GUIDE:")
print("="*80)

for idx, row in scaling_guide.iterrows():
    print(f"\n{row['Scaler']}:")
    print(f"  Formula: {row['Formula']}")
    print(f"  Output: {row['Output Range']}")
    print(f"  Best For: {row['Best For']}")
    print(f"  Algorithms: {row['Algorithms']}")


SCALING METHODS GUIDE:

StandardScaler:
  Formula: (x - mean) / std
  Output: Mean=0, Std=1
  Best For: Normal distribution, most ML algorithms
  Algorithms: SVM, Neural Networks, KNN, PCA

MinMaxScaler:
  Formula: (x - min) / (max - min)
  Output: [0, 1]
  Best For: Bounded range needed (0-1)
  Algorithms: Neural Networks, Image processing

RobustScaler:
  Formula: (x - median) / IQR
  Output: Median=0, robust to outliers
  Best For: Data with outliers
  Algorithms: When outliers present


In [7]:
# Real-world example with different scales
real_data = pd.DataFrame({
    'house_price': [200000, 350000, 500000, 750000, 1200000],
    'bedrooms': [2, 3, 4, 4, 5],
    'square_feet': [1200, 1800, 2500, 3200, 4500],
    'lot_size': [0.25, 0.5, 0.75, 1.0, 2.0]
})

print("\n" + "="*50)
print("REAL WORLD EXAMPLE:")
print("="*50)
print("House Data (Unscaled):")
print(real_data)

# Apply Standard Scaling
scaler_real = StandardScaler()
real_scaled = scaler_real.fit_transform(real_data)
real_scaled_df = pd.DataFrame(real_scaled, columns=real_data.columns)

print("\nHouse Data (Standard Scaled):")
print(real_scaled_df.round(3))

print("\nNow all features contribute equally to ML algorithms!")


REAL WORLD EXAMPLE:
House Data (Unscaled):
   house_price  bedrooms  square_feet  lot_size
0       200000         2         1200      0.25
1       350000         3         1800      0.50
2       500000         4         2500      0.75
3       750000         4         3200      1.00
4      1200000         5         4500      2.00

House Data (Standard Scaled):
   house_price  bedrooms  square_feet  lot_size
0       -1.141    -1.569       -1.256    -1.076
1       -0.713    -0.588       -0.733    -0.662
2       -0.285     0.392       -0.122    -0.248
3        0.428     0.392        0.488     0.166
4        1.711     1.373        1.622     1.821

Now all features contribute equally to ML algorithms!


In [9]:
# Create a decision flowchart
print("\nDECISION FLOWCHART:")
print("="*30)

decision_tree = """
üìä YOUR DATA
    ‚îÇ
    ‚îú‚îÄ Has OUTLIERS? 
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îú‚îÄ YES ‚Üí Use RobustScaler
    ‚îÇ   ‚îÇ         (Uses median & IQR, ignores outliers)
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îî‚îÄ NO ‚Üí Continue...
    ‚îÇ
    ‚îú‚îÄ Need values in [0,1] range?
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îú‚îÄ YES ‚Üí Use MinMaxScaler
    ‚îÇ   ‚îÇ         (Neural networks, image data)
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îî‚îÄ NO ‚Üí Use StandardScaler
    ‚îÇ             (Most common choice)
"""

print(decision_tree)


DECISION FLOWCHART:

üìä YOUR DATA
    ‚îÇ
    ‚îú‚îÄ Has OUTLIERS? 
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îú‚îÄ YES ‚Üí Use RobustScaler
    ‚îÇ   ‚îÇ         (Uses median & IQR, ignores outliers)
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îî‚îÄ NO ‚Üí Continue...
    ‚îÇ
    ‚îú‚îÄ Need values in [0,1] range?
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îú‚îÄ YES ‚Üí Use MinMaxScaler
    ‚îÇ   ‚îÇ         (Neural networks, image data)
    ‚îÇ   ‚îÇ
    ‚îÇ   ‚îî‚îÄ NO ‚Üí Use StandardScaler
    ‚îÇ             (Most common choice)



In [10]:
# When to use RobustScaler
print("\n2. ROBUST SCALER - Use When:")
print("="*40)

robust_examples = [
    "‚úÖ Data has outliers",
    "‚úÖ Skewed distributions", 
    "‚úÖ When median is more representative than mean",
    "‚úÖ Financial data, real estate prices",
    "‚úÖ When StandardScaler gives poor results"
]

for example in robust_examples:
    print(example)

# Example with outliers
outlier_demo = pd.DataFrame({
    'income': [30000, 35000, 40000, 45000, 50000, 2000000],  # One millionaire!
    'age': [25, 28, 30, 32, 35, 40]
})

print("\nData with outlier:")
print(outlier_demo)

# Compare StandardScaler vs RobustScaler
scaler_robust = RobustScaler()
scaled_robust = scaler_robust.fit_transform(outlier_demo)

scaler_standard = StandardScaler() 
scaled_standard = scaler_standard.fit_transform(outlier_demo)

print(f"\nStandard Scaler range: {scaled_standard.min():.2f} to {scaled_standard.max():.2f}")
print(f"Robust Scaler range: {scaled_robust.min():.2f} to {scaled_robust.max():.2f}")
print("‚Üí RobustScaler handles outlier better!")


2. ROBUST SCALER - Use When:
‚úÖ Data has outliers
‚úÖ Skewed distributions
‚úÖ When median is more representative than mean
‚úÖ Financial data, real estate prices
‚úÖ When StandardScaler gives poor results

Data with outlier:
    income  age
0    30000   25
1    35000   28
2    40000   30
3    45000   32
4    50000   35
5  2000000   40

Standard Scaler range: -1.37 to 2.24
Robust Scaler range: -1.04 to 156.60
‚Üí RobustScaler handles outlier better!


*************Dimentionality Reduction*****************************


Dimensionality Reduction is a technique used to reduce the number of features (dimensions) in a dataset while preserving the most important information.

WHY DIMENSIONALITY REDUCTION?
===================================
üêå COMPUTATIONAL COST: More features = slower processing
üíæ STORAGE: More memory needed to store data
üìä VISUALIZATION: Hard to plot >3 dimensions
üéØ CURSE OF DIMENSIONALITY: Data becomes sparse in high dimensions
üîç OVERFITTING: More features can lead to overfitting
üßπ NOISE: Some features may be irrelevant or noisy


In [18]:
# Add this cell after your dimensionality reduction markdown

print("DIMENSIONALITY REDUCTION: CORRELATION & VARIANCE THRESHOLD")
print("="*65)

import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt

print("Three simple but powerful feature selection methods:")
print("1. Variance Threshold - Remove low variance features")
print("2. Correlation Threshold - Remove highly correlated features")
print("2. Exhaustive feature Selection - tries all combinations of features")

DIMENSIONALITY REDUCTION: CORRELATION & VARIANCE THRESHOLD
Three simple but powerful feature selection methods:
1. Variance Threshold - Remove low variance features
2. Correlation Threshold - Remove highly correlated features
2. Exhaustive feature Selection - tries all combinations of features


In [20]:
# Add this cell to demonstrate correlation-based dimensionality reduction

print("CORRELATION-BASED DIMENSIONALITY REDUCTION EXAMPLES")
print("="*60)

import pandas as pd
import numpy as np


CORRELATION-BASED DIMENSIONALITY REDUCTION EXAMPLES


In [None]:
print("\n1. PHYSICAL MEASUREMENTS EXAMPLE:")
print("="*40)

# Height in different units - highly correlated
physical_data = pd.DataFrame({
    'height_cm': [170, 175, 165, 180, 160, 185, 172, 168, 177, 162],
    'height_inches': [66.9, 68.9, 65.0, 70.9, 63.0, 72.8, 67.7, 66.1, 69.7, 63.8],
    'height_feet': [5.57, 5.74, 5.41, 5.91, 5.25, 6.07, 5.64, 5.51, 5.81, 5.31],
    'weight_kg': [70, 80, 60, 90, 55, 95, 75, 65, 85, 58],
    'weight_pounds': [154, 176, 132, 198, 121, 209, 165, 143, 187, 128],
    'bmi': [24.2, 26.1, 22.0, 27.8, 21.5, 27.7, 25.3, 23.0, 27.1, 22.1]
})

print("Physical measurements dataset:")
print(physical_data.round(2))

# Calculate correlation matrix
corr_matrix = physical_data.corr()
print(f"\nCorrelation Matrix:")
print(corr_matrix.round(3))

# Identify highly correlated pairs
def find_high_correlations(df, threshold=0.9):
    corr_matrix = df.corr().abs()
    high_corr_pairs = []
    
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            correlation = corr_matrix.iloc[i, j]
            if correlation > threshold:
                feature1 = corr_matrix.columns[i]
                feature2 = corr_matrix.columns[j]
                high_corr_pairs.append((feature1, feature2, correlation))
    
    return high_corr_pairs

high_corr = find_high_correlations(physical_data, 0.95)
print(f"\nHighly correlated pairs (>0.95):")
for f1, f2, corr in high_corr:
    print(f"  {f1} ‚Üî {f2}: {corr:.3f}")


1. PHYSICAL MEASUREMENTS EXAMPLE:
Physical measurements dataset:
   height_cm  height_inches  height_feet  weight_kg  weight_pounds   bmi
0        170           66.9         5.57         70            154  24.2
1        175           68.9         5.74         80            176  26.1
2        165           65.0         5.41         60            132  22.0
3        180           70.9         5.91         90            198  27.8
4        160           63.0         5.25         55            121  21.5
5        185           72.8         6.07         95            209  27.7
6        172           67.7         5.64         75            165  25.3
7        168           66.1         5.51         65            143  23.0
8        177           69.7         5.81         85            187  27.1
9        162           63.8         5.31         58            128  22.1

Correlation Matrix:
               height_cm  height_inches  height_feet  weight_kg  \
height_cm          1.000          1.000    

In [28]:
# Add this cell to demonstrate variance threshold dimensionality reduction

print("VARIANCE THRESHOLD DIMENSIONALITY REDUCTION EXAMPLES")
print("="*60)

import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt

print("Remove features with low variance (little predictive information)")

VARIANCE THRESHOLD DIMENSIONALITY REDUCTION EXAMPLES
Remove features with low variance (little predictive information)


In [33]:
print("\n1. BASIC VARIANCE THRESHOLD EXAMPLE:")
print("="*40)

# Create sample data with different variance levels
sample_data = pd.DataFrame({
    'constant_feature': [1, 1, 1, 1, 1, 1, 1, 1],        # Zero variance
    'low_variance': [1, 1, 1, 1, 2, 1, 1, 1],            # Very low variance
    'medium_variance': [1, 2, 3, 2, 4, 3, 2, 1],         # Medium variance
    'high_variance': [1, 10, 5, 15, 8, 20, 3, 12],       # High variance
    'useful_feature': [10, 20, 30, 40, 50, 60, 70, 80]   # Good variance
})

print("Sample data with different variance levels:")
print(sample_data)

# Calculate variance for each feature
variances = sample_data.var()
print(f"\nVariance of each feature:")
for col, var in variances.items():
    print(f"  {col}: {var:.3f}")

# Apply Variance Threshold
print(f"\nApplying Variance Threshold (threshold = 1.0):")

selector = VarianceThreshold(threshold=1.0)
selected_features = selector.fit_transform(sample_data)

# Get feature names that were selected
feature_names = sample_data.columns[selector.get_support()]
result_df = pd.DataFrame(selected_features, columns=feature_names)

print(f"Original features: {list(sample_data.columns)}")
print(f"Selected features: {list(feature_names)}")
print(f"Removed features: {[col for col in sample_data.columns if col not in feature_names]}")

print(f"\nBefore: {sample_data.shape[1]} features")
print(f"After: {result_df.shape[1]} features")
print(f"Reduction: {sample_data.shape[1] - result_df.shape[1]} features removed")

print(f"\nFiltered dataset:")
print(result_df)


1. BASIC VARIANCE THRESHOLD EXAMPLE:
Sample data with different variance levels:
   constant_feature  low_variance  medium_variance  high_variance  \
0                 1             1                1              1   
1                 1             1                2             10   
2                 1             1                3              5   
3                 1             1                2             15   
4                 1             2                4              8   
5                 1             1                3             20   
6                 1             1                2              3   
7                 1             1                1             12   

   useful_feature  
0              10  
1              20  
2              30  
3              40  
4              50  
5              60  
6              70  
7              80  

Variance of each feature:
  constant_feature: 0.000
  low_variance: 0.125
  medium_variance: 1.071
  high_variance:

WHAT IS EXHAUSTIVE FEATURE SELECTION?
=============================================

üîç EXHAUSTIVE SEARCH: Tests every possible combination of features
üìä PERFORMANCE-BASED: Selects features that give best model performance  
üéØ OPTIMAL SOLUTION: Guaranteed to find the best feature subset
