In [None]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.
import pandas as pd

# Read the CSV into a DataFrame.
df = pd.read_csv('data.csv')

# Boolean mask shaped like df: True marks a missing cell.
missing_values = df.isna()
# Summing the mask column-wise counts the True cells, i.e. the missing
# entries in each column.
missing_count = missing_values.sum()

# Each label is followed by its object on the next line.
print("Missing values in DataFrame (True indicates missing):", missing_values, sep="\n")
print("\nCount of missing values per column:", missing_count, sep="\n")



In [None]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.
import pandas as pd

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Row-wise deletion: a row is discarded as soon as any one of its cells is
# missing. The original frame `df` is left untouched.
df_cleaned = df.dropna(axis=0, how='any')

print(df_cleaned)


In [None]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.

import pandas as pd

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Column-wise deletion: every column holding at least one missing value is
# removed; all rows are kept. The original frame `df` is left untouched.
df_cleaned = df.dropna(axis='columns', how='any')

print(df_cleaned)


In [None]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.

import pandas as pd

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Work on the target column through a short alias.
numeric_series = df['numerical_column']

# Average of the column; this is the replacement value for the gaps.
mean_value = numeric_series.mean()

# Write the gap-free version of the column back into the frame.
df['numerical_column'] = numeric_series.fillna(mean_value)

print(df)


In [None]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.

import pandas as pd

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# mode() returns a Series of the most frequent value(s). It is empty when the
# column has no observed values at all, so it must be checked before indexing.
modes = df['categorical_column'].mode()

if modes.empty:
    # Entirely missing column: there is no category to impute with, so the
    # column is left as-is instead of failing on modes[0].
    mode_value = None
else:
    # Take the first mode by position; with ties this is the first entry of
    # the Series that mode() returned.
    mode_value = modes.iloc[0]
    df['categorical_column'] = df['categorical_column'].fillna(mode_value)

print(df)


In [None]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.

import pandas as pd

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# The median is used as the replacement value for this column.
skewed_series = df['skewed_column']
median_value = skewed_series.median()

# Replace the gaps and store the completed column back in the frame.
filled_series = skewed_series.fillna(median_value)
df['skewed_column'] = filled_series

print(df)


In [None]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install scikit-learn if not already installed (the package is imported as sklearn).
# 2. KNN Imputer: Use KNNImputer to fill in missing values.
import pandas as pd
from sklearn.impute import KNNImputer

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# KNNImputer works on numeric input only, so restrict it to numeric columns.
# Columns with no observed value are skipped as well: the imputer discards
# such columns from its output, which would break the column alignment below.
numeric_cols = df.select_dtypes(include='number').columns
imputable_cols = [col for col in numeric_cols if df[col].notna().any()]

imputer = KNNImputer(n_neighbors=3)

# Start from a copy so non-numeric columns and the original index survive.
df_imputed = df.copy()

if imputable_cols:
    # Each gap is filled from the 3 nearest rows, measured on the imputable
    # numeric columns. Rebuild a frame with the original index so the
    # assignment aligns row by row.
    imputed_numeric = pd.DataFrame(
        imputer.fit_transform(df[imputable_cols]),
        columns=imputable_cols,
        index=df.index,
    )
    df_imputed[imputable_cols] = imputed_numeric

print(df_imputed)



In [None]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the most frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with the most frequent category: Use the mode() method to find the most frequent category.

import pandas as pd

# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Columns pandas stores as text (object) or as an explicit categorical dtype.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    # Nothing to do for columns without gaps.
    if not df[col].isnull().any():
        continue

    # mode() is empty when every value in the column is missing; in that case
    # there is no category to impute with, so the column is left unchanged.
    modes = df[col].mode()
    if modes.empty:
        continue

    mode_val = modes.iloc[0]
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: the in-place call acts on an intermediate object and does not
    # update df when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(mode_val)

print(df)


In [None]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load dataset
df = pd.read_csv('data.csv')

# Feature with missing values to impute
target_col = 'feature_with_missing'

# Compute the missing-value mask once and reuse it for the split and for the
# final write-back, so all three always refer to the same rows.
missing_mask = df[target_col].isnull()

# Rows with an observed target train the model; rows without one get predicted.
df_train = df[~missing_mask]
df_test = df[missing_mask]

# Predictors: every numeric column except the target. select_dtypes('number')
# covers all integer/float widths, not only int64/float64.
feature_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col != target_col
]

X_train = df_train[feature_cols]
y_train = df_train[target_col]

X_test = df_test[feature_cols]

if df_test.empty:
    # Predicting on zero rows raises inside scikit-learn; there is simply
    # nothing to impute in this case.
    print(f"No missing values in '{target_col}'; nothing to impute.")
elif df_train.empty or not feature_cols:
    # Fail with an explicit message instead of an opaque estimator error.
    raise ValueError(
        f"Cannot impute '{target_col}': need at least one row with an observed "
        "target value and at least one numeric feature column."
    )
else:
    # NOTE(review): the feature columns may themselves contain NaN; whether
    # RandomForestRegressor accepts that depends on the installed
    # scikit-learn version - confirm, or impute the features first.

    # Train regression model (fixed seed for reproducible predictions)
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)

    # Predict missing values
    predicted_values = model.predict(X_test)

    # Impute missing values with predictions; the mask selects the same rows,
    # in the same order, as df_test.
    df.loc[missing_mask, target_col] = predicted_values

    # Optional: error on the rows the model was fitted on. This is an
    # optimistic figure, not an estimate of imputation accuracy.
    train_preds = model.predict(X_train)
    mse = mean_squared_error(y_train, train_preds)
    print(f'Training Mean Squared Error: {mse}')

print(df)



In [None]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Use ffill() and bfill(): Apply forward fill and backward fill (the older fillna(method=...) form is deprecated in recent pandas).

import pandas as pd

# Sample time series data with missing values
data = {
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'],
    'value': [10, None, None, 20, None]
}
df = pd.DataFrame(data)

# Convert 'date' column to datetime and sort, so "previous" and "next"
# observation mean earlier and later in time.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Forward fill: each gap takes the last observed value before it.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated and no
# longer accepted by newer pandas releases.
df['value_ffill'] = df['value'].ffill()

# Backward fill: each gap takes the next observed value after it. A gap at
# the very end has no later observation and therefore stays missing.
df['value_bfill'] = df['value'].bfill()

print(df)
