In [None]:
# Task 1: Load a CSV Dataset
# Description: Load a CSV file into a Pandas DataFrame and print the first five rows to understand the structure of the dataset.


import pandas as pd

# Task 1: Load a CSV Dataset
# Description: Load a CSV file into a Pandas DataFrame and print the first five rows.

def load_and_preview_csv(file_path):
    """Read a CSV into a DataFrame, print its first five rows, and return it.

    Args:
        file_path: Location of the CSV data; handed directly to
            ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(file_path)

    # Show a short preview so the column layout is visible at a glance.
    preview = frame.head()
    print("First 5 rows of the dataset:")
    print(preview)

    return frame

# Example usage:
if __name__ == "__main__":
    # 'your_dataset.csv' is a placeholder: replace it with your actual CSV file path.
    # The call prints a five-row preview and the loaded DataFrame is kept in `df`.
    df = load_and_preview_csv('your_dataset.csv')



In [None]:
# Task 2: Check for Missing Values
# Description: Identify and list the columns with missing values and the number of missing values in each.

import pandas as pd

# Task 2: Check for Missing Values
# Description: Identify and list columns with missing values and the count of missing values in each.

def check_missing_values(df):
    """Print every column that has missing values together with its count.

    When no column contains a missing value, a single notice is printed
    instead. Nothing is returned.

    Args:
        df: DataFrame to inspect.
    """
    null_totals = df.isnull().sum()
    affected = null_totals[null_totals > 0]

    if not affected.empty:
        print("Columns with missing values and their counts:")
        print(affected)
        return

    print("No missing values found in any column.")

# Example usage:
if __name__ == "__main__":
    # The dataset is read from disk here, so this cell does not rely on a
    # DataFrame loaded by an earlier task.
    df = pd.read_csv('your_dataset.csv')  # Replace with actual path
    check_missing_values(df)


In [None]:
# Task 3: Visualize Missing Data
# Description: Use a heatmap to visualize the missing values in the dataset.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Task 3: Visualize Missing Data
# Description: Use a heatmap to visualize the missing values in the dataset.

def visualize_missing_data(df):
    """Show a heatmap of the missing-value mask of ``df``.

    The boolean result of ``df.isnull()`` is drawn with seaborn on a
    10x6 inch figure using the 'viridis' colormap and no colour bar.

    Args:
        df: DataFrame whose missing values should be visualised.
    """
    null_mask = df.isnull()

    plt.figure(figsize=(10, 6))
    sns.heatmap(null_mask, cbar=False, cmap='viridis')
    plt.title('Missing Data Heatmap')
    plt.show()

# Example usage:
if __name__ == "__main__":
    # Load the dataset from the placeholder path, then display the heatmap.
    df = pd.read_csv('your_dataset.csv')  # Replace with your dataset path
    visualize_missing_data(df)





In [None]:
# Task 4: Remove Columns with Many Missing Values
# Description: Drop columns that have more than 50% missing values.

import pandas as pd

# Task 4: Remove Columns with Many Missing Values
# Description: Drop columns that have more than 50% missing values.

def drop_columns_with_many_missing(df, threshold=0.5):
    """Return ``df`` without the columns whose missing share exceeds ``threshold``.

    Args:
        df: DataFrame to filter; it is not modified.
        threshold: Fraction of missing values (default 0.5) that a column
            must strictly exceed to be dropped.

    Returns:
        A new DataFrame with the sparse columns removed.
    """
    # The mean of the boolean null mask is the per-column missing fraction.
    null_share = df.isnull().mean()
    too_sparse = null_share.loc[null_share > threshold].index
    return df.drop(columns=too_sparse)

# Example usage:
if __name__ == "__main__":
    df = pd.read_csv('your_dataset.csv')  # Replace with your dataset path
    # Uses the default threshold of 0.5 declared by the function.
    df_cleaned = drop_columns_with_many_missing(df)
    # The dropped columns are those present in `df` but absent from `df_cleaned`.
    print("Columns dropped:", df.columns.difference(df_cleaned.columns).tolist())
    print(df_cleaned.head())



In [None]:
# Task 5: Identify Duplicate Rows
# Description: Check for and display any duplicate rows in the dataset.


import pandas as pd

# Task 5: Identify Duplicate Rows
# Description: Check for and display any duplicate rows in the dataset.

def find_duplicate_rows(df):
    """Return the rows of ``df`` that repeat an earlier row.

    The first occurrence of each repeated row is not part of the result.

    Args:
        df: DataFrame to scan.

    Returns:
        A DataFrame holding only the repeated rows.
    """
    is_repeat = df.duplicated(keep='first')
    return df[is_repeat]

# Example usage:
if __name__ == "__main__":
    # Placeholder path: replace with your dataset path
    df = pd.read_csv('your_dataset.csv')
    duplicate_rows = find_duplicate_rows(df)

    # Report the outcome; the repeated rows are printed only when some exist.
    if duplicate_rows.empty:
        print("No duplicate rows found.")
    else:
        print("Duplicate rows found:")
        print(duplicate_rows)


In [None]:
# Task 6: Remove Duplicate Rows
# Description: Remove duplicate rows from the dataset and verify that they have been removed.
import pandas as pd

# Task 6: Remove Duplicate Rows
# Description: Remove duplicate rows from the dataset and verify that they have been removed.

def remove_duplicate_rows(df):
    """Return ``df`` with repeated rows removed, keeping each first occurrence.

    Args:
        df: DataFrame to de-duplicate.

    Returns:
        The result of ``df.drop_duplicates(keep='first')``.
    """
    return df.drop_duplicates(keep='first')

# Example usage:
if __name__ == "__main__":
    # Placeholder path: replace with your dataset path
    df = pd.read_csv('your_dataset.csv')
    print(f"Original dataset shape: {df.shape}")

    df_cleaned = remove_duplicate_rows(df)
    print(f"Dataset shape after removing duplicates: {df_cleaned.shape}")

    # Run duplicate detection again on the cleaned frame to confirm nothing is left.
    duplicates_after = df_cleaned[df_cleaned.duplicated()]
    if not duplicates_after.empty:
        print("Duplicates still exist in the dataset.")
    else:
        print("All duplicate rows removed successfully.")




In [None]:
# Task 7: Check Data Inconsistencies
# Description: Identify inconsistencies in categorical columns, such as differing text cases or trailing spaces.


import pandas as pd

# Task 7: Check Data Inconsistencies
# Description: Identify inconsistencies in categorical columns, such as differing text cases or trailing spaces.

def check_categorical_inconsistencies(df, categorical_columns):
    """Flag columns whose distinct values collide after lowercasing and stripping.

    Each listed column that exists in ``df`` has its non-null values cast to
    strings. If lowercasing and stripping whitespace yields fewer distinct
    values than the raw data has, the column is reported.

    Args:
        df: DataFrame to inspect.
        categorical_columns: Column names to check; names that are not in
            ``df`` are skipped.

    Returns:
        A dict mapping each flagged column name to a dict holding its raw
        unique values and an explanatory note. Empty when nothing is flagged.
    """
    findings = {}
    present = (name for name in categorical_columns if name in df.columns)
    for name in present:
        distinct = df[name].dropna().astype(str).unique()
        normalized = {text.lower().strip() for text in distinct}
        # Equal sizes mean no two raw values collapse onto the same normalized form.
        if len(normalized) == len(distinct):
            continue
        findings[name] = {
            "original_unique_values": distinct,
            "note": "Inconsistencies detected (case or whitespace differences).",
        }
    return findings

# Example usage:
if __name__ == "__main__":
    # Placeholder path: replace with your dataset path
    df = pd.read_csv('your_dataset.csv')
    # Placeholder column names: replace with your categorical columns
    categorical_cols = ['Category', 'Status', 'Type']

    issues = check_categorical_inconsistencies(df, categorical_cols)
    if not issues:
        print("No inconsistencies found in specified categorical columns.")
    else:
        print("Inconsistencies found in categorical columns:")
        # One three-line entry per flagged column: name, raw values, note.
        for col, detail in issues.items():
            print(f"Column: {col}")
            print(f"Unique Values: {detail['original_unique_values']}")
            print(detail["note"])


In [None]:
# Task 8: Get Summary of Data Quality
# Description: Generate a summary of data quality including total records, number of duplicate rows, and columns with missing values.


import pandas as pd

# Task 8: Get Summary of Data Quality
# Description: Generate a summary of data quality including total records, number of duplicate rows, and columns with missing values.

def data_quality_summary(df):
    """Summarise record count, duplicate rows, and columns with missing values.

    Args:
        df: DataFrame to summarise.

    Returns:
        A dict with three keys: "Total Records" (row count), "Duplicate Rows"
        (number of rows flagged by ``df.duplicated()``), and "Columns with
        Missing Values" (column name -> missing count, only for columns that
        have at least one missing value).
    """
    null_counts = df.isnull().sum()
    return {
        "Total Records": len(df),
        "Duplicate Rows": df.duplicated().sum(),
        "Columns with Missing Values": null_counts[null_counts > 0].to_dict(),
    }

# Example usage:
if __name__ == "__main__":
    df = pd.read_csv('your_dataset.csv')  # Replace with your dataset path
    
    summary = data_quality_summary(df)
    print("Data Quality Summary:")
    # Print one "name: value" line per entry of the summary dict.
    for key, value in summary.items():
        print(f"{key}: {value}")


In [None]:
# Task 9: Generate a Data Quality Report
# Description: Create a comprehensive data quality report that includes not only missing values but also basic statistics for numerical columns and the distribution of categorical columns.


import pandas as pd

# Task 9: Generate a Data Quality Report
# Description: Create a comprehensive data quality report that includes missing values,
# basic statistics for numerical columns, and distribution of categorical columns.

def generate_data_quality_report(df):
    """Build a data quality report for ``df``.

    Args:
        df: DataFrame to analyse.

    Returns:
        A dict with three entries:

        * 'Missing Values': column name -> missing count, only for columns
          with at least one missing value.
        * 'Numerical Summary Statistics': ``describe()`` output for the
          numeric columns as a nested dict (column -> statistic -> value);
          an empty dict when ``df`` has no numeric columns.
        * 'Categorical Distributions': for each object/category column, a
          dict of value -> count that also counts missing values.
    """
    report = {}

    # Missing values per column
    missing_values = df.isnull().sum()
    report['Missing Values'] = missing_values[missing_values > 0].to_dict()

    # Select numeric columns explicitly. A bare ``df.describe()`` falls back to
    # summarising object/categorical columns when no numeric column exists
    # (putting text statistics under the "numerical" heading) and raises
    # ValueError when the frame has no columns at all.
    numeric_df = df.select_dtypes(include=['number'])
    if numeric_df.shape[1] > 0:
        report['Numerical Summary Statistics'] = numeric_df.describe().to_dict()
    else:
        report['Numerical Summary Statistics'] = {}

    # Distribution of categorical columns (dropna=False keeps the missing bucket)
    report['Categorical Distributions'] = {
        col: df[col].value_counts(dropna=False).to_dict()
        for col in df.select_dtypes(include=['object', 'category']).columns
    }

    return report

# Example usage:
if __name__ == "__main__":
    df = pd.read_csv('your_dataset.csv')  # Replace with your dataset path

    dq_report = generate_data_quality_report(df)
    
    print("Data Quality Report:\n")
    
    # Section 1: one line per column that has missing values.
    print("Missing Values:")
    for col, count in dq_report['Missing Values'].items():
        print(f"  {col}: {count}")

    # Section 2: nested listing of the describe() output. The outer key is
    # named `stat` and the inner key `col`, matching the loop variables below.
    print("\nNumerical Summary Statistics:")
    for stat, values in dq_report['Numerical Summary Statistics'].items():
        print(f"  {stat}:")
        for col, val in values.items():
            print(f"    {col}: {val}")

    # Section 3: value counts for every categorical column.
    print("\nCategorical Distributions:")
    for col, dist in dq_report['Categorical Distributions'].items():
        print(f"  {col}:")
        for val, count in dist.items():
            print(f"    {val}: {count}")


In [None]:
# Task 10: Advanced Data Imputation
# Description: Perform advanced data imputation by replacing missing values in numerical columns with the mean and categorical columns with the mode.


import pandas as pd

# Task 10: Advanced Data Imputation
# Description: Replace missing values in numerical columns with mean and in categorical columns with mode.

def advanced_data_imputation(df):
    """Return a copy of ``df`` with missing values filled in.

    Numeric columns are filled with the column mean; object and category
    columns are filled with the first mode of the column. The input frame
    is not modified.

    Args:
        df: DataFrame to impute.

    Returns:
        A new DataFrame with the imputed values. A column that is entirely
        missing has no usable mean or mode and is left as it is.
    """
    df_imputed = df.copy()

    # Assign the filled column back instead of calling
    # ``df_imputed[col].fillna(..., inplace=True)``: the in-place call acts on
    # a temporary column selection, which warns on pandas 2.x and leaves the
    # frame unchanged under Copy-on-Write.
    for col in df_imputed.select_dtypes(include=['number']).columns:
        mean_value = df_imputed[col].mean()
        df_imputed[col] = df_imputed[col].fillna(mean_value)

    for col in df_imputed.select_dtypes(include=['object', 'category']).columns:
        mode_value = df_imputed[col].mode()
        # mode() is empty when the column has no non-missing value at all.
        if not mode_value.empty:
            df_imputed[col] = df_imputed[col].fillna(mode_value.iloc[0])

    return df_imputed


# Example usage:
if __name__ == "__main__":
    df = pd.read_csv('your_dataset.csv')  # Replace with your dataset path

    # `df` is left as loaded; the imputed result is stored separately in `df_imputed`.
    df_imputed = advanced_data_imputation(df)
    print(df_imputed.head())

