In [None]:
import pandas as pd

# Load the cafe sales CSV into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv('cafe_sales.csv')

# 3. Data Cleaning and Type Corrections

In [None]:
# Task 3.1: Convert invalid placeholders (e.g., "UNKNOWN", "ERROR") into NaN.

import numpy as np

# Strings that are treated as invalid placeholders rather than real values.
invalid_placeholders = ['UNKNOWN', 'ERROR']

# Replace every cell equal to one of the placeholders with NaN.
# inplace=True mutates the existing df object instead of returning a copy.
df.replace(to_replace=invalid_placeholders, value=np.nan, inplace=True)

# Summarise how many missing values each column holds after the replacement.
print("Invalid placeholders replaced. Remaining NaN counts per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# Task 3.2: Ensure all the columns have correct data types. If not, then convert columns to correct data types.
# Handle invalid entries appropriately.

# Show the column types as they are before any conversion (followed by a blank line).
print("--- dtypes before conversion ---", df.dtypes, "", sep="\n")

# Quantity holds whole numbers. Parse it to numeric first (errors='coerce' maps
# anything unparseable to NaN), then cast to the nullable 'Int64' dtype so the
# column can be integer-typed while still containing missing values.
quantity_numeric = pd.to_numeric(df['Quantity'], errors='coerce')
df['Quantity'] = quantity_numeric.astype('Int64')

# Price Per Unit and Total Spent hold decimals; the coerced result is kept as-is.
for decimal_column in ('Price Per Unit', 'Total Spent'):
    df[decimal_column] = pd.to_numeric(df[decimal_column], errors='coerce')

# Transaction Date: parse to datetime; values that cannot be parsed become NaT.
date_column = 'Transaction Date'
df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Show the resulting column types (followed by a blank line), then a preview
# of the first rows of the converted frame.
print("--- dtypes after conversion ---", df.dtypes, "", sep="\n")
print("--- sample of converted data ---", df.head(), sep="\n")


In [None]:
# Task 3.3: For each data type conversion answer following:
# How did you handle invalid entries?

# Quantity (str -> Int64):
#   In Task 3.1, "UNKNOWN" and "ERROR" were replaced with NaN.
#   Then pd.to_numeric with errors='coerce' was used to convert the column.
#   I used Int64 instead of int64 because regular int64 can't hold NaN values.

# Price Per Unit (str -> float64):
#   Same as Quantity — placeholders were replaced with NaN first,
#   then pd.to_numeric with errors='coerce' converted it to float64.
#   Invalid entries became NaN.

# Total Spent (str -> float64):
#   Same approach as Price Per Unit.
#   "UNKNOWN" and "ERROR" values became NaN in Task 3.1, and errors='coerce'
#   turned any other unparseable value into NaN.

# Transaction Date (str -> datetime64):
#   "UNKNOWN" and "ERROR" values were replaced with NaN in Task 3.1.
#   pd.to_datetime with errors='coerce' was used so that any remaining bad dates
#   became NaT instead of raising an error.
