# Outline: Data Cleaning and EDA in Python
## Importing necessary libraries
pandas
numpy
matplotlib
seaborn
datacleaner (optional)
dora (optional)

## Loading the dataset
Read CSV or other file formats using pandas

## Initial exploration
Check the shape of the dataset
Display the first few rows using the head() function
Use the info() function to get basic information about the dataset

## Handling missing values
Identify missing values using isnull() or isna()
Impute missing values using mean, median, or mode
Drop missing values using dropna()

## Handling duplicates
Identify duplicate rows using duplicated()
Remove duplicate rows using drop_duplicates()

## Data type conversion
Convert data types using astype()

## Renaming columns
Rename columns using the rename() function

## Handling outliers
Identify outliers using box plots or IQR method
Remove or transform outliers

## Feature scaling
Normalize or standardize features using scikit-learn's MinMaxScaler or StandardScaler

## Exploratory Data Analysis (EDA)

### Univariate analysis
Histograms
Box plots
Density plots

### Bivariate analysis
Scatter plots
Pair plots
Correlation matrix and heatmap

### Multivariate analysis
Parallel coordinates
Andrews curves
RadViz

## Data transformation
Log transformation
Square root transformation
Box-Cox transformation (scipy.stats.boxcox; requires strictly positive values)

## Feature engineering
Create new features based on existing ones
Perform one-hot encoding or label encoding for categorical variables

## Final dataset preparation
Split the dataset into training and testing sets

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datacleaner import autoclean
from dora import Dora

# Load the dataset
df = pd.read_csv('dataset.csv')

# Initial exploration
print(df.shape)   # (number of rows, number of columns)
print(df.head())  # first rows, as a quick sanity check of the parsed data
# info() writes its summary (columns, non-null counts, dtypes) to stdout
# itself and returns None, so it is called directly; wrapping it in print()
# would emit a stray "None" after the summary.
df.info()

# Handling missing values
# Number of missing entries in each column
print(df.isnull().sum())
# Impute the gaps in 'column_name' with the column mean. The filled Series is
# assigned back to the frame: calling fillna(inplace=True) on the selection
# df['column_name'] is a chained in-place update, which is deprecated in
# pandas 2.x and leaves df unchanged when Copy-on-Write is enabled.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())
# Drop the rows that still contain a missing value in any other column
df.dropna(inplace=True)

# Handling duplicates
# Report how many rows repeat an earlier row, then keep only the first
# occurrence of each distinct row.
print(df.duplicated(keep='first').sum())
df = df.drop_duplicates(keep='first')

# Data type conversion
# Cast 'column_name' to integers; every other column keeps its dtype.
df = df.astype({'column_name': 'int'})

# Renaming columns
# Map each old column label to its replacement; unlisted columns are untouched.
df = df.rename(columns={'old_name': 'new_name'})

# Handling outliers
# Draw the box plot on a figure of its own and render it, so the plots made
# later in the script do not end up on the same axes.
plt.figure()
sns.boxplot(x=df['column_name'])
plt.show()

# IQR rule: values further than 1.5 * IQR outside the quartiles are outliers
Q1 = df['column_name'].quantile(0.25)
Q3 = df['column_name'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# between() is inclusive, so values lying exactly on a fence are kept. With
# strict comparisons a column whose IQR is 0 would lose every row equal to
# the quartiles. .copy() makes the result an independent frame, so assigning
# to its columns afterwards does not raise SettingWithCopyWarning.
df = df[df['column_name'].between(lower_fence, upper_fence)].copy()

# Feature scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn the column's minimum and maximum, then rescale it with them
# (MinMaxScaler's default target range is [0, 1]). The double brackets pass
# a one-column DataFrame, since the scaler expects 2-D input.
# NOTE(review): the scaler is fitted on every row, before the train/test
# split at the end of the script; consider fitting on the training rows only.
scaler = MinMaxScaler()
scaler.fit(df[['column_name']])
df['column_name'] = scaler.transform(df[['column_name']])

# Exploratory Data Analysis (EDA)
# histplot, boxplot, kdeplot, scatterplot and heatmap all draw on the current
# matplotlib axes, so each one gets a fresh figure; otherwise they would be
# painted on top of each other.

# Univariate analysis: distribution of a single column
plt.figure()
sns.histplot(df['column_name'])

plt.figure()
sns.kdeplot(df['column_name'])

# Bivariate analysis: the column against the target
plt.figure()
sns.boxplot(x=df['column_name'], y=df['target_column'])

plt.figure()
sns.scatterplot(x=df['column_name'], y=df['target_column'])

# pairplot builds its own figure (a grid of pairwise plots)
sns.pairplot(df)

# Correlation matrix of the numeric columns, shown as an annotated heatmap
plt.figure()
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

# Render all figures created above
plt.show()

# Data transformation
# scipy.stats provides boxcox; it is imported here, next to its only use,
# because the script's import section does not bring `stats` into scope.
from scipy import stats

# Log, square-root and Box-Cox are alternative ways to reduce skew, so each
# result is stored in its own column instead of being stacked on top of the
# previous transform. Keep only the one you need before modelling.
# The column is shifted so that its minimum is 1: log and Box-Cox require
# strictly positive input and sqrt requires non-negative input, while a
# min-max scaled column contains 0.
shifted = df['column_name'] - df['column_name'].min() + 1
df['column_name_log'] = np.log(shifted)
df['column_name_sqrt'] = np.sqrt(shifted)
# boxcox returns (transformed values, fitted lambda); lambda is not needed here
df['column_name_boxcox'], _ = stats.boxcox(shifted)

# Feature engineering
# Derive a new feature as the element-wise sum of two existing columns
df['new_column'] = df['column1'].add(df['column2'])
# One-hot encode the categorical column: it is replaced by one indicator
# column per category.
df = pd.get_dummies(data=df, columns=['categorical_column'])

# Final dataset preparation
# train_test_split is imported here, next to its only use, because the
# script's import section does not bring it into scope.
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is the label vector
X = df.drop(columns='target_column')
y = df['target_column']
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
