<a href="https://colab.research.google.com/github/hemchan-cyber/M.Tech-AI-Machine-Learning-Practical/blob/main/Q6_Data_Formatting_and_Data_Binning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Q6. Python program to load (or create) a dataset with missing values, count missing values in each column, and handle them by replacing with NaN, the column mean, and the most frequent value. Finally, display the dataset before and after performing the replacements.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the car dataset from a file on Google Drive (this cell reads an existing
# file; it does not generate synthetic data).
# NOTE: the path lives under /content/drive, so Google Drive is assumed to be
# mounted before this cell runs -- the mount step is not part of this cell.
DATA_PATH = '/content/drive/MyDrive/ML Lab practice/6.Data Formatting & Data Binning/car_dataset.data'

# Column names are supplied explicitly through `names=` instead of being read
# from the file.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]

# Missing entries appear as the literal string "?" in the raw data; they are
# left untouched here so the raw frame can be displayed as-is.
df = pd.read_csv(DATA_PATH, names=headers)

In [None]:
# Display the first rows of the raw frame, then a separator rule.
# A single print call with a newline separator emits the same text as three
# consecutive print calls.
print(
    "--- 1. Raw Dataset (Head) ---",
    df.head(),
    "\n" + "=" * 50 + "\n",
    sep="\n",
)

--- 1. Raw Dataset (Head) ---
   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3                 ?  alfa-romero       gas        std          two   
1          3                 ?  alfa-romero       gas        std          two   
2          1                 ?  alfa-romero       gas        std          two   
3          2               164         audi       gas        std         four   
4          2               164         audi       gas        std         four   

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  convertible          rwd           front        88.6  ...          130   
1  convertible          rwd           front        88.6  ...          130   
2    hatchback          rwd           front        94.5  ...          152   
3        sedan          fwd           front        99.8  ...          109   
4        sedan          4wd           front        99.4  ...          136   

   fuel-system  bore

In [None]:
# ---------------------------------------------------------
# STEP 1: Turn the "?" placeholder into a real missing value (NaN)
# ---------------------------------------------------------
# Every cell whose value is exactly "?" becomes NaN so that pandas'
# missing-data tools (isnull, dropna, ...) can see it.
df = df.replace(to_replace="?", value=np.nan)

print(
    "--- 2. Dataset after replacing '?' with NaN ---",
    df.head(),
    "\n" + "=" * 50 + "\n",
    sep="\n",
)

--- 2. Dataset after replacing '?' with NaN ---
   symboling normalized-losses         make fuel-type aspiration num-of-doors  \
0          3               NaN  alfa-romero       gas        std          two   
1          3               NaN  alfa-romero       gas        std          two   
2          1               NaN  alfa-romero       gas        std          two   
3          2               164         audi       gas        std         four   
4          2               164         audi       gas        std         four   

    body-style drive-wheels engine-location  wheel-base  ...  engine-size  \
0  convertible          rwd           front        88.6  ...          130   
1  convertible          rwd           front        88.6  ...          130   
2    hatchback          rwd           front        94.5  ...          152   
3        sedan          fwd           front        99.8  ...          109   
4        sedan          4wd           front        99.4  ...          136   

  

In [None]:
# ---------------------------------------------------------
# STEP 2: Count the missing entries in every column
# ---------------------------------------------------------
# Boolean mask with the same shape as df: True where a value is missing.
missing_data = df.isnull()

print("--- 3. Count of Missing Values per Column ---")
# Summing a boolean column counts its True entries, i.e. the missing values.
for column, n_missing in missing_data.sum().items():
    print(f"{column}: {n_missing} missing")
print("\n" + "=" * 50 + "\n")

--- 3. Count of Missing Values per Column ---
symboling: 0 missing
normalized-losses: 41 missing
make: 0 missing
fuel-type: 0 missing
aspiration: 0 missing
num-of-doors: 2 missing
body-style: 0 missing
drive-wheels: 0 missing
engine-location: 0 missing
wheel-base: 0 missing
length: 0 missing
width: 0 missing
height: 0 missing
curb-weight: 0 missing
engine-type: 0 missing
num-of-cylinders: 0 missing
engine-size: 0 missing
fuel-system: 0 missing
bore: 4 missing
stroke: 4 missing
compression-ratio: 0 missing
horsepower: 2 missing
peak-rpm: 2 missing
city-mpg: 0 missing
highway-mpg: 0 missing
price: 4 missing




In [None]:
# ---------------------------------------------------------
# STEP 3: Handle Missing Data (Mean Imputation)
# ---------------------------------------------------------
# Continuous columns whose missing entries are replaced by the column mean.
numeric_cols = ["normalized-losses", "bore", "stroke", "horsepower", "peak-rpm"]

print("--- 4. Imputing Numeric Columns with Mean ---")
for col in numeric_cols:
    # Cast to float once and reuse the result for both the mean and the fill.
    # (Presumably these columns are still non-numeric at this point because
    # the raw data used "?" placeholders -- hence the explicit cast.)
    col_as_float = df[col].astype("float")

    # Series.mean skips NaN by default, so the average is taken over the
    # observed values only.
    avg_val = col_as_float.mean()
    print(f"Average of {col}: {avg_val:.2f}")

    # fillna on the already-float series yields a float column directly,
    # avoiding replace() on a mixed string/float column and a second cast.
    df[col] = col_as_float.fillna(avg_val)

--- 4. Imputing Numeric Columns with Mean ---
Average of normalized-losses: 122.00
Average of bore: 3.33
Average of stroke: 3.26
Average of horsepower: 104.26
Average of peak-rpm: 5125.37


In [None]:
# ---------------------------------------------------------
# STEP 4: Handle Missing Data (Mode Imputation)
# ---------------------------------------------------------
print("\n--- 5. Imputing 'num-of-doors' with Mode ---")

# Tally each distinct value, then take the label with the highest count.
door_counts = df["num-of-doors"].value_counts()
most_freq_door = door_counts.idxmax()
print(f"Most frequent num-of-doors: {most_freq_door}")


--- 5. Imputing 'num-of-doors' with Mode ---
Most frequent num-of-doors: four


In [None]:
# Fill the missing 'num-of-doors' entries with the most frequent value
# computed in the previous cell.
df["num-of-doors"] = df["num-of-doors"].fillna(most_freq_door)

In [None]:
# ---------------------------------------------------------
# STEP 5: Remove every row that has no 'price'
# ---------------------------------------------------------
print("\n--- 6. Dropping rows with missing Price ---")
# Only the 'price' column is inspected; rows with NaN there are discarded.
# The surviving rows keep their original index labels.
df = df.dropna(axis="index", subset=["price"])


--- 6. Dropping rows with missing Price ---


In [None]:
# Renumber the rows from 0 (discarding the old index labels) and cast 'price'
# to float in one chained expression.
df = df.reset_index(drop=True).astype({"price": "float"})

print("\n" + "=" * 50 + "\n")





In [None]:
# ---------------------------------------------------------
# FINAL CHECK
# ---------------------------------------------------------
print("--- 7. Final Cleaned Dataset ---")
print(df)

# Total number of missing cells across the whole frame
# (per-column counts summed again into a single number).
remaining_missing = df.isnull().sum().sum()
print("\nRemaining missing values:", remaining_missing)

--- 7. Final Cleaned Dataset ---
     symboling  normalized-losses         make fuel-type aspiration  \
0            3              122.0  alfa-romero       gas        std   
1            3              122.0  alfa-romero       gas        std   
2            1              122.0  alfa-romero       gas        std   
3            2              164.0         audi       gas        std   
4            2              164.0         audi       gas        std   
..         ...                ...          ...       ...        ...   
196         -1               95.0        volvo       gas        std   
197         -1               95.0        volvo       gas      turbo   
198         -1               95.0        volvo       gas        std   
199         -1               95.0        volvo    diesel      turbo   
200         -1               95.0        volvo       gas      turbo   

    num-of-doors   body-style drive-wheels engine-location  wheel-base  ...  \
0            two  convertible      