In [3]:
# Activity 3: Data Standardization & Validation

# Task A: Enforcing Data Formats & Constraints

# 13. Date Format Standardization:
# - Convert all date entries into a uniform format (e.g., YYYY-MM-DD).
import pandas as pd

# Sample data with mixed date formats
data = {'raw_date': ['01/02/2023', '2023-03-04', 'March 5, 2023', '2023.06.07', '07-08-2023']}
df = pd.DataFrame(data)

# Convert to datetime and standardize format
df['standard_date'] = pd.to_datetime(df['raw_date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Show the result
print(df)






# 14. Numeric Constraints Enforcement:
# - Check and enforce numeric constraints (e.g., age > 0).

import pandas as pd

# Sample data
data = {'name': ['Alice', 'Bob', 'Charlie', 'David'],
        'age': [25, -5, 0, 32]}  # -5 and 0 violate the constraint

df = pd.DataFrame(data)

# Identify invalid rows
invalid_ages = df[df['age'] <= 0]
print("Invalid age entries:")
print(invalid_ages)

# Option 1: Remove invalid rows
df_valid = df[df['age'] > 0]

# Option 2: (Alternative) Set invalid ages to NaN for later handling
# df['age'] = df['age'].apply(lambda x: x if x > 0 else pd.NA)

print("\nCleaned DataFrame:")
print(df_valid)





# 15. String Format Checks:
# - Ensure text fields meet certain constraints (e.g., valid email format).

        raw_date standard_date
0     01/02/2023    2023-01-02
1     2023-03-04           NaN
2  March 5, 2023           NaN
3     2023.06.07           NaN
4     07-08-2023           NaN
Invalid age entries:
      name  age
1      Bob   -5
2  Charlie    0

Cleaned DataFrame:
    name  age
0  Alice   25
3  David   32


In [4]:
# Task B: Addressing Inconsistent Representations

# 16. Standardizing Date Formats:
# - Identify and correct inconsistent date formats within the dataset.

import pandas as pd

# Sample data with inconsistent date formats
data = {
    'event_date': ['2023-05-01', '05/02/2023', 'May 3, 2023', '2023.04.30', '03-05-2023', 'InvalidDate']
}
df = pd.DataFrame(data)

# Convert to datetime with coercion for unparseable values
df['standardized_date'] = pd.to_datetime(df['event_date'], errors='coerce')

# Format as YYYY-MM-DD
df['standardized_date'] = df['standardized_date'].dt.strftime('%Y-%m-%d')

print(df)







# 17. Pattern Matching for Consistency:
# - Standardize phone numbers to a specific pattern (e.g., (123) 456-7890).





# 18. Handling Mixed Case Text:
# - Convert all text entries to a consistent case (e.g., all uppercase).









