In [2]:
!pip install faker


Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.1.0


In [7]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize the Faker instance
fake = Faker()

# Step 1: Generate Messy Data (using Faker to simulate real-world data issues)
def generate_messy_data(num_records=100):
    """Generate a messy dataset with random missing values, inconsistent formats, and duplicates."""
    records = []
    for _ in range(num_records):
        records.append({
            'ID': random.choice([fake.unique.random_int(min=1, max=100), np.nan, ""]),
            'Name': random.choice([fake.name(), None, " "]),
            'Age': random.choice([fake.random_int(min=18, max=60), np.nan, "Unknown"]),
            'Join Date': random.choice([fake.date(), fake.date(pattern='%d-%m-%Y'), fake.date(pattern='%m/%d/%Y'), np.nan, "Invalid"])
        })
    return pd.DataFrame(records)

# Step 2: Identify issues (Missing, Duplicates, Invalid Formats)
def identify_issues(df):
    print("\nIdentifying issues in the dataset:")
    print("\nMissing or Empty Values:")
    print(df.replace({"": np.nan, " ": np.nan, "Unknown": np.nan}).isnull().sum())
    print("\nDuplicate Entries:")
    print(df.duplicated().sum())
    print("\nData Types of Columns:")
    print(df.dtypes)

# Step 3: Standardize Date Formats
def standardize_date(date):
    """Standardizes date formats to a uniform format."""
    try:
        return pd.to_datetime(date, errors='coerce', dayfirst=True)
    except Exception as e:
        return np.nan

# Step 4: Convert columns to numeric values (and handle errors)
def enforce_numeric(column):
    """Converts column values to numeric, coercing errors to NaN."""
    return pd.to_numeric(column, errors='coerce')

# Step 5: Clean the data (automated)
def clean_data(df):
    """Automatically cleans the dataset by fixing common issues."""

    # Replace empty strings, spaces, and 'Unknown' with NaN
    df.replace({"": np.nan, " ": np.nan, "Unknown": np.nan}, inplace=True)

    # Convert 'Age' column to numeric and fill missing values with median
    df['Age'] = enforce_numeric(df['Age'])
    df['Age'] = df['Age'].fillna(df['Age'].median())

    # Drop rows where 'Name' is missing
    df.dropna(subset=['Name'], inplace=True)

    # Drop duplicate rows
    df.drop_duplicates(inplace=True)

    # Standardize 'Join Date' column to datetime format
    df['Join Date'] = df['Join Date'].apply(standardize_date)

    # Convert 'ID' column to numeric (nullable integer type)
    df['ID'] = enforce_numeric(df['ID']).astype('Int64')

    return df

# Step 6: Automate the whole process
def automate_data_cleaning(df):
    """Runs the entire cleaning process automatically."""
    print("Original Data Sample:")
    print(df.head())  # Show original data
    identify_issues(df)  # Identify initial issues
    df_cleaned = clean_data(df)  # Clean the data
    print("\nCleaned Data Sample:")
    print(df_cleaned.head())  # Show cleaned data

    # Optionally, save the cleaned data to a CSV file
    df_cleaned.to_csv("cleaned_data.csv", index=False)
    print("\nCleaned data saved to 'cleaned_data.csv'")

    return df_cleaned

# Example usage (automatically generate messy data using Faker)
if __name__ == "__main__":
    # Generate a messy dataset
    df = generate_messy_data(num_records=100)

    # Run the automated data cleaning pipeline
    automate_data_cleaning(df)


Original Data Sample:
    ID         Name      Age   Join Date
0   15         None       20  01-07-1983
1       Ashley Ruiz       34  30-10-1974
2  NaN               Unknown  31-08-1985
3                        NaN     Invalid
4  NaN         None      NaN  2023-10-21

Identifying issues in the dataset:

Missing or Empty Values:
ID           68
Name         60
Age          67
Join Date    20
dtype: int64

Duplicate Entries:
4

Data Types of Columns:
ID           object
Name         object
Age          object
Join Date    object
dtype: object

Cleaned Data Sample:
      ID            Name   Age  Join Date
1   <NA>     Ashley Ruiz  34.0 1974-10-30
5   <NA>     John Dennis  40.0 1999-02-27
8     44     Alison Hall  40.0        NaT
9   <NA>   Steven Carter  40.0        NaT
10  <NA>  Michelle Lopez  29.0        NaT

Cleaned data saved to 'cleaned_data.csv'


  print(df.replace({"": np.nan, " ": np.nan, "Unknown": np.nan}).isnull().sum())
  df.replace({"": np.nan, " ": np.nan, "Unknown": np.nan}, inplace=True)
  return pd.to_datetime(date, errors='coerce', dayfirst=True)
  return pd.to_datetime(date, errors='coerce', dayfirst=True)


In [None]:
from google.colab import files
files.download("cleaned_data.csv")
