<a href="https://colab.research.google.com/github/khshohelrana/Python_Data-Mining/blob/main/Data_mining_2ndReport.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Load the Melbourne housing data ---------------------------------------
file_path = '/kaggle/input/data-processing/melb_data.csv'
df = pd.read_csv(file_path)

# --- Quick look at the raw data --------------------------------------------
print("First 5 rows of the dataset:")
print(df.head())

# DataFrame.info() writes its report directly to stdout, so it is called
# on its own rather than being wrapped in print().
print("\nDataset Info:")
df.info()

print("\nSummary Statistics:")
print(df.describe())

# --- Columns that contain missing values -----------------------------------
# Count the nulls per column, keep only columns with at least one null,
# and list the worst offenders first.
print("\nMissing values in each column:")
missing_values = (
    df.isnull()
    .sum()
    .loc[lambda null_counts: null_counts > 0]
    .sort_values(ascending=False)
)
print(missing_values)

# Handling NULL values
# Each column listed below is imputed with a single replacement value:
#   - BuildingArea and Car  -> column median
#   - YearBuilt, CouncilArea -> most frequent value (first entry of mode())
#
# The filled Series is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`. The in-place form mutates an
# intermediate object obtained by chained indexing: pandas 2.x emits a
# FutureWarning for it, and with Copy-on-Write enabled (the default from
# pandas 3.0) it leaves `df` unchanged, so the nulls would silently remain.
imputation_values = {
    'BuildingArea': df['BuildingArea'].median(),
    'YearBuilt': df['YearBuilt'].mode()[0],
    'CouncilArea': df['CouncilArea'].mode()[0],
    'Car': df['Car'].median(),
}

for column_name, fill_value in imputation_values.items():
    df[column_name] = df[column_name].fillna(fill_value)

# Per-column null counts after imputation, to confirm the fill took effect.
print("\nMissing values after imputation:")
print(df.isnull().sum())

# Outlier detection and removal using the 1.5 * IQR fence rule
def _iqr_summary(values, whisker=1.5):
    """Return (q1, q3, iqr, lower_fence, upper_fence) for a numeric Series.

    The fences are q1 - whisker * iqr and q3 + whisker * iqr; values strictly
    outside them are treated as outliers by the code below.
    """
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_fence = first_quartile - whisker * spread
    upper_fence = third_quartile + whisker * spread
    return first_quartile, third_quartile, spread, lower_fence, upper_fence


Q1_price, Q3_price, IQR_price, price_low, price_high = _iqr_summary(df['Price'])
Q1_building, Q3_building, IQR_building, building_low, building_high = _iqr_summary(df['BuildingArea'])

# Boolean masks over `df`: True where the value lies strictly outside the
# fences. Comparisons with NaN evaluate to False, so NaN rows are never
# flagged as outliers.
price_is_outlier = (df['Price'] < price_low) | (df['Price'] > price_high)
building_is_outlier = (df['BuildingArea'] < building_low) | (df['BuildingArea'] > building_high)

price_outliers = df[price_is_outlier]
building_outliers = df[building_is_outlier]

print(f"\nNumber of Price outliers: {len(price_outliers)}")
print(f"Number of BuildingArea outliers: {len(building_outliers)}")

# Keep only the rows that are outliers in neither column. Because both sets
# of fences are computed on the full `df`, filtering in one step selects the
# same rows as filtering on Price first and BuildingArea second.
df_cleaned = df[~price_is_outlier & ~building_is_outlier]

# Report fully duplicated rows, drop them, then confirm none remain
duplicates_before = df_cleaned.duplicated().sum()
print("\nNumber of duplicate rows before removal:", duplicates_before)

df_cleaned = df_cleaned.drop_duplicates()

duplicates_after = df_cleaned.duplicated().sum()
print("Number of duplicate rows after removal:", duplicates_after)

# Heatmap of the null mask of the cleaned frame: one cell per (row, column),
# coloured by whether the value is missing.
null_mask = df_cleaned.isnull()
heatmap_axes = sns.heatmap(null_mask, cbar=False, cmap='viridis')
heatmap_axes.set_title("Missing Values Heatmap (After Processing)")
plt.show()

# Summary of the cleaned dataset: shape followed by a preview of the rows
print("\nFinal Cleaned Dataset Shape:", df_cleaned.shape)
print("\nFirst 5 rows of the cleaned dataset:")
print(df_cleaned.head())