In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

# Sample dataset
data = {
    "Name": ["Ali", "Sara", "Omar", "Hina", None],
    "Gender": ["Male", "Female", "Male", "Female", "Female"],
    "Age": [25, 30, 22, np.nan, 28],
    "Salary": [50000, 60000, 55000, 65000, 70000]
}
df = pd.DataFrame(data)

print("Original Data:\n", df)

# 1. Handling Missing Values
df["Name"].fillna("Unknown", inplace=True)
df["Age"].fillna(df["Age"].mean(), inplace=True)

# 2. Encoding Categorical Data
label_encoder = LabelEncoder()
df["Gender_Label"] = label_encoder.fit_transform(df["Gender"])

# 3. Scaling
scaler = MinMaxScaler()
df["Salary_Scaled"] = scaler.fit_transform(df[["Salary"]])

# 4. Feature Creation (Age Group)
df["Age_Group"] = pd.cut(df["Age"], bins=[0,18,30,60], labels=["Child","Young","Adult"])

# 5. Binning (Salary into Low/High)
df["Salary_Bin"] = pd.cut(df["Salary"], bins=[0,55000,70000], labels=["Low","High"])

print("\nAfter Feature Engineering:\n", df)


Original Data:
    Name  Gender   Age  Salary
0   Ali    Male  25.0   50000
1  Sara  Female  30.0   60000
2  Omar    Male  22.0   55000
3  Hina  Female   NaN   65000
4  None  Female  28.0   70000

After Feature Engineering:
       Name  Gender    Age  Salary  Gender_Label  Salary_Scaled Age_Group  \
0      Ali    Male  25.00   50000             1           0.00     Young   
1     Sara  Female  30.00   60000             0           0.50     Young   
2     Omar    Male  22.00   55000             1           0.25     Young   
3     Hina  Female  26.25   65000             0           0.75     Young   
4  Unknown  Female  28.00   70000             0           1.00     Young   

  Salary_Bin  
0        Low  
1       High  
2        Low  
3       High  
4       High  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Name"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)


In [2]:
import pandas as pd
import numpy as np

# Example dataset with an outlier
data = {"Salary": [50000, 52000, 51000, 53000, 120000]}  # 120000 is an outlier
df = pd.DataFrame(data)
print("Original:\n", df)

# Calculate IQR
Q1 = df["Salary"].quantile(0.25)
Q3 = df["Salary"].quantile(0.75)
IQR = Q3 - Q1

# Filter out outliers
df_no_outliers = df[(df["Salary"] >= Q1 - 1.5 * IQR) & (df["Salary"] <= Q3 + 1.5 * IQR)]
print("\nAfter Removing Outliers:\n", df_no_outliers)


Original:
    Salary
0   50000
1   52000
2   51000
3   53000
4  120000

After Removing Outliers:
    Salary
0   50000
1   52000
2   51000
3   53000


In [3]:
import pandas as pd

# Example dataset
data = {
    "Feature1": [1,2,3,4,5],
    "Feature2": [2,4,6,8,10],
    "Feature3": [5,4,3,2,1],
    "Target":   [10,20,30,40,50]
}
df = pd.DataFrame(data)

# Find correlation
corr = df.corr()

print("Correlation Table:\n", corr)

# Select features with correlation > 0.8 with Target
selected_features = corr["Target"][abs(corr["Target"]) > 0.8].index
print("\nSelected Features:\n", selected_features)


Correlation Table:
           Feature1  Feature2  Feature3  Target
Feature1       1.0       1.0      -1.0     1.0
Feature2       1.0       1.0      -1.0     1.0
Feature3      -1.0      -1.0       1.0    -1.0
Target         1.0       1.0      -1.0     1.0

Selected Features:
 Index(['Feature1', 'Feature2', 'Feature3', 'Target'], dtype='object')


In [4]:
import pandas as pd
import numpy as np

# Example skewed dataset
data = {"Income": [1000, 2000, 3000, 4000, 100000]}
df = pd.DataFrame(data)
print("Before Transformation:\n", df)

# Apply log transformation
df["Income_Log"] = np.log1p(df["Income"])  # log1p handles zero safely
print("\nAfter Log Transformation:\n", df)


Before Transformation:
    Income
0    1000
1    2000
2    3000
3    4000
4  100000

After Log Transformation:
    Income  Income_Log
0    1000    6.908755
1    2000    7.601402
2    3000    8.006701
3    4000    8.294300
4  100000   11.512935
