# 01 Exploration

In [6]:
from pathlib import Path

import numpy as np
import pandas as pd

In [7]:
# STEP 1: Load the dataset
# The CSV is read from the current user's Downloads folder.
# Building the path from Path.home() avoids hard-coding one machine's
# user name and drive letter, so the cell runs for any user who has the
# file in their own Downloads folder.
DATA_PATH = Path.home() / "Downloads" / "practice_data.csv"
df = pd.read_csv(DATA_PATH)

# Show the first 5 rows to confirm the file loaded as expected
df.head()

Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,,41000.0,Female,Bangalore,1100.0


In [8]:
# STEP 2: Basic exploration (understanding the data)
# 1) Shape: a (rows, columns) tuple giving the size of the DataFrame
df.shape

(50, 6)

In [9]:
# 2) Per-column overview: dtype and non-null count for every column.
#    A non-null count below the total number of entries means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               50 non-null     int64  
 1   Age              44 non-null     float64
 2   Income           45 non-null     float64
 3   Gender           47 non-null     object 
 4   City             44 non-null     object 
 5   Purchase_Amount  43 non-null     float64
dtypes: float64(3), int64(1), object(2)
memory usage: 2.5+ KB


In [10]:
# 3) Summary statistics for the numeric columns: count, mean, std, min, quartiles and max.
#    "count" only counts non-missing values, so it can differ from column to column.
df.describe()

Unnamed: 0,ID,Age,Income,Purchase_Amount
count,50.0,44.0,45.0,43.0
mean,25.5,30.795455,52411.111111,1558.139535
std,14.57738,4.91121,10717.068422,494.3647
min,1.0,22.0,36000.0,800.0
25%,13.25,27.0,43000.0,1200.0
50%,25.5,30.0,51000.0,1500.0
75%,37.75,34.25,61000.0,1950.0
max,50.0,41.0,72000.0,2600.0


In [11]:
# 4) Missing values per column: flag every empty cell, then total the flags column by column
missing_per_column = df.isna().sum()
missing_per_column

ID                 0
Age                6
Income             5
Gender             3
City               6
Purchase_Amount    7
dtype: int64

In [12]:
# 5) Missing values across the whole dataset: add the per-column counts into one grand total
missing_per_column_counts = df.isna().sum()
missing_per_column_counts.sum()

np.int64(27)

In [13]:
# 6) Duplicate rows: flag each row that exactly repeats an earlier row, then count the flags
is_repeat_row = df.duplicated()
is_repeat_row.sum()

np.int64(0)

# 02 Handling Missing Values

In [14]:
# Method 1: Row-wise deletion
# A row is removed as soon as ANY of its columns is missing.
# Every dropped row also throws away its valid values, so this is best
# kept for datasets where only a small share of rows have gaps.

df_drop_rows = df.dropna(axis="index", how="any")

df_drop_rows.head()


Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
3,4,28.0,52000.0,Female,Delhi,1600.0
6,7,40.0,70000.0,Male,Mumbai,2500.0
7,8,29.0,48000.0,Female,Delhi,1400.0
9,10,26.0,39500.0,Male,Bangalore,1000.0


In [15]:
# Method 2: Column-wise deletion
# A column is removed as soon as it contains ANY missing value.
# One gap is enough to lose the whole column, so in this dataset only the
# fully populated ID column survives.

df_drop_cols = df.dropna(axis="columns", how="any")

df_drop_cols.head()


Unnamed: 0,ID
0,1
1,2
2,3
3,4
4,5


In [16]:
# Method 3: Mean imputation for the numeric columns
# Each gap in Age, Income and Purchase_Amount is replaced by that column's
# own average (missing values are skipped when the average is computed).

numeric_cols = ["Age", "Income", "Purchase_Amount"]
# fillna() given a Series keyed by column name fills only those columns,
# so ID, Gender and City are left exactly as they were.
df_mean = df.fillna(df[numeric_cols].mean())

df_mean.head()


Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,52411.111111,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,30.795455,41000.0,Female,Bangalore,1100.0


In [17]:
# Method 4: Median imputation for the numeric columns
# The median is the middle value of the sorted column; it is less affected
# by extreme values than the mean, which makes it the safer choice when
# outliers are present.

numeric_cols = ["Age", "Income", "Purchase_Amount"]
# fillna() given a Series keyed by column name fills only those columns,
# so ID, Gender and City are left exactly as they were.
df_median = df.fillna(df[numeric_cols].median())

df_median.head()


Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,51000.0,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,30.0,41000.0,Female,Bangalore,1100.0


In [18]:
# Method 5: Mode imputation for the categorical columns
# Gaps in Gender and City are replaced by the most frequent value of the
# respective column.

categorical_cols = ["Gender", "City"]
# mode() returns every value tied for most frequent; [0] picks the first one.
most_common_values = {col: df[col].mode()[0] for col in categorical_cols}
# Columns that are not keys of the dict are not filled.
df_mode = df.fillna(most_common_values)

df_mode.head()


Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,Delhi,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,,41000.0,Female,Bangalore,1100.0


In [19]:
# Method 6: Constant imputation (one fixed placeholder per column)
# Income gaps become 0 (numeric placeholder) and City gaps become "Unknown"
# (categorical placeholder). Columns not listed in the dict are not filled.
placeholder_by_column = {"Income": 0, "City": "Unknown"}

df_constant = df.fillna(placeholder_by_column)

df_constant.head()


Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,0.0,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,Unknown,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,,41000.0,Female,Bangalore,1100.0


In [20]:
# Method 7: Forward fill
# Every gap takes the value from the closest earlier row in the same column.
# A gap in the very first row has nothing above it and therefore stays missing.

# ffill() already returns a new DataFrame, so df itself is not modified.
# It is the current spelling of the older fillna(method="ffill") call.
df_ffill = df.ffill()

df_ffill.head()



Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,45000.0,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,Mumbai,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,28.0,41000.0,Female,Bangalore,1100.0


In [21]:

# Method 8: Backward fill
# Every gap takes the value from the closest later row in the same column.
# A gap in the very last row has nothing below it and therefore stays missing.

# bfill() already returns a new DataFrame, so df itself is not modified.
# It is the current spelling of the older fillna(method="bfill") call.
df_bfill = df.bfill()

df_bfill.head()



Unnamed: 0,ID,Age,Income,Gender,City,Purchase_Amount
0,1,25.0,45000.0,Male,Delhi,1200.0
1,2,30.0,38000.0,Female,Mumbai,1500.0
2,3,22.0,38000.0,Male,Delhi,900.0
3,4,28.0,52000.0,Female,Delhi,1600.0
4,5,35.0,41000.0,Female,Bangalore,1100.0


In [25]:
# Save every cleaned variant as a CSV in the current user's Downloads folder.
# A single output directory plus one mapping and one loop replaces eight
# copies of the same machine-specific absolute path.
OUTPUT_DIR = Path.home() / "Downloads"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # no-op when the folder already exists

# File stem -> DataFrame; each frame is written to "<stem>.csv".
cleaned_frames = {
    "df_drop_rows": df_drop_rows,  # 1. rows with missing values dropped
    "df_drop_cols": df_drop_cols,  # 2. columns with missing values dropped
    "df_mean": df_mean,            # 3. numeric gaps filled with the mean
    "df_median": df_median,        # 4. numeric gaps filled with the median
    "df_mode": df_mode,            # 5. categorical gaps filled with the mode
    "df_constant": df_constant,    # 6. gaps filled with fixed placeholders
    "df_ffill": df_ffill,          # 7. forward fill
    "df_bfill": df_bfill,          # 8. backward fill
}

for file_stem, frame in cleaned_frames.items():
    # index=False keeps the pandas row index out of the saved file
    frame.to_csv(OUTPUT_DIR / f"{file_stem}.csv", index=False)

print("All files saved in Downloads folder.")



All files saved in Downloads folder.
