<a href="https://colab.research.google.com/github/krauseannelize/nb-py-ms-exercises/blob/sprint04/notebooks/s04_pandas_data_wrangling/38_assessing_cleaning_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 38 | Assessing & Cleaning Data

## Data Assessment

In [219]:
import pandas as pd
import numpy as np

# Creating a deliberately messy dataset with correct lengths
data = {
    'Name': [
        'John Doe', 'john doe', ' John Doe ', 'John Doe', 'Jane SMITH',
        'Jane Smith', 'JANE SMITH', 'Jack Brown', 'jack brown', 'Sam Wilson',
        'Sam Wilson', np.nan, 'John Doe', 'John Doe', 'Jane Smith'
    ],
    'Age': [
        '28', 'Thirty-two', 45, 'NaN', 33, 36, '29', 40, 'forty-one', np.nan,
        60, 30, '28', '30', 'unknown'
    ],
    'Date Joined': [
        '2020-01-15', '2021-02-20', '2019-03-10', '2020-01-01', '2020-04-25',
        '2020-03-15', '2020-02-25', '2019-10-02', '2018-11-11', '2021-01-15',
        np.nan, '2020-06-15', '2019-07-01', '2020-06-15', '2019-03-10'
    ],
    'Salary': [
        '50000', '60000', '75000', 'Eighty Thousand', 52000, 75000, 100000,
        '85000', '90K', 'None', 110000, 88000, 'Sixty Thousand', 102000, '70,000'
    ],
    'Remarks': [
        'Good employee', 'N/A', 'Excellent', 'GOOD Employee', 'n/a',
        '', 'Great!', 'Needs improvement', 'none', None,
        'Fair', 'excellent', 'Fair', '', 'Promising'
    ]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Duplicating some rows intentionally to check for duplicates
df = pd.concat([df, df.iloc[2:4]], ignore_index=True)
df.head()

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
0,John Doe,28,2020-01-15,50000,Good employee
1,john doe,Thirty-two,2021-02-20,60000,
2,John Doe,45,2019-03-10,75000,Excellent
3,John Doe,,2020-01-01,Eighty Thousand,GOOD Employee
4,Jane SMITH,33,2020-04-25,52000,


In [220]:
# Check the first 2 rows of the DataFrame
print("First 2 Rows:")
print(df.head(2))

First 2 Rows:
       Name         Age Date Joined Salary        Remarks
0  John Doe          28  2020-01-15  50000  Good employee
1  john doe  Thirty-two  2021-02-20  60000            N/A


In [221]:
# Check the last 2 rows of the DataFrame
print("Last 2 Rows:")
print(df.tail(2))

Last 2 Rows:
          Name  Age Date Joined           Salary        Remarks
15   John Doe    45  2019-03-10            75000      Excellent
16    John Doe  NaN  2020-01-01  Eighty Thousand  GOOD Employee


In [222]:
# View a summary of the DataFrame including column types, non-null counts, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         16 non-null     object
 1   Age          16 non-null     object
 2   Date Joined  16 non-null     object
 3   Salary       17 non-null     object
 4   Remarks      16 non-null     object
dtypes: object(5)
memory usage: 812.0+ bytes


In [223]:
# Check the number of rows and columns in the DataFrame
print("Shape of the DataFrame (Rows, Columns):", df.shape)

Shape of the DataFrame (Rows, Columns): (17, 5)


In [224]:
# Check the data types of each column
print("Data Types of Each Column:")
print(df.dtypes)

Data Types of Each Column:
Name           object
Age            object
Date Joined    object
Salary         object
Remarks        object
dtype: object


In [225]:
# Check the number of missing values in the DataFrame
print("Missing Values Count:")
print(df.isnull().sum())

Missing Values Count:
Name           1
Age            1
Date Joined    1
Salary         0
Remarks        1
dtype: int64


In [226]:
# Check how many duplicated rows are in the DataFrame
print("Number of Duplicate Rows:", df.duplicated().sum())

Number of Duplicate Rows: 2


In [227]:
# Returns a Boolean Series indicating which rows are duplicates of previous ones
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,False
9,False


In [228]:
# .duplicated() → creates a Boolean Series (condition)
# .loc[] → applies that condition to select matching rows
df.loc[df.Age.duplicated()]

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
12,John Doe,28.0,2019-07-01,Sixty Thousand,Fair
15,John Doe,45.0,2019-03-10,75000,Excellent
16,John Doe,,2020-01-01,Eighty Thousand,GOOD Employee


In [229]:
# Check inconsistencies in the unique values of Remarks Column
print("Unique Values in 'Remarks' Column:")
print(df['Remarks'].unique())

Unique Values in 'Remarks' Column:
['Good employee' 'N/A' 'Excellent' 'GOOD Employee' 'n/a' '' 'Great!'
 'Needs improvement' 'none' None 'Fair' 'excellent' 'Promising']


In [230]:
# Check for inconsistent date formats
print("Unique Date Formats in 'Date Joined':")
print(df['Date Joined'].unique())

Unique Date Formats in 'Date Joined':
['2020-01-15' '2021-02-20' '2019-03-10' '2020-01-01' '2020-04-25'
 '2020-03-15' '2020-02-25' '2019-10-02' '2018-11-11' '2021-01-15' nan
 '2020-06-15' '2019-07-01']


In [231]:
# .astype(str) → convert values in 'Age' column to strings
# .str.replace('NaN', '') → remove literal 'NaN' strings
# .str.isnumeric() → return True for strings that contain only numeric characters
df.Age.astype(str).str.replace('NaN', '').str.isnumeric()

Unnamed: 0,Age
0,True
1,False
2,True
3,False
4,True
5,True
6,True
7,True
8,False
9,False


In [232]:
# .astype(str) → convert 'Age' values to strings
# .str.replace('NaN', '') → remove literal 'NaN' strings
# .str.isnumeric() → return True for strings that contain only numeric characters
# ~(...) → invert the Boolean Series to get rows that are NOT numeric
invalid_ages = df[~(df['Age'].astype(str).str.replace('NaN', '').str.isnumeric())]

# Print the names and age values that failed the numeric check
print("Invalid Age Values (Non-Numeric):")
print(invalid_ages[['Name', 'Age']])

Invalid Age Values (Non-Numeric):
          Name         Age
1     john doe  Thirty-two
3     John Doe         NaN
8   jack brown   forty-one
9   Sam Wilson         NaN
14  Jane Smith     unknown
16    John Doe         NaN


## Data Cleaning

In [233]:
# .isna() → returns True where data is missing (NaN)
# .sum() → sums the True values column-wise (True = 1)
df.isna().sum()

Unnamed: 0,0
Name,1
Age,1
Date Joined,1
Salary,0
Remarks,1


In [234]:
# .isna() → returns True where 'Age' is missing (NaN)
# df.loc[...] → selects rows based on that condition
# , : → selects all columns for those rows
df.loc[df.Age.isna(), :]

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
9,Sam Wilson,,2021-01-15,,


In [235]:
# : → select all rows
# ['Age', 'Salary'] → select only the 'Age' and 'Salary' columns
# .loc[] → label-based selection for rows and columns
df.loc[:, ['Age', 'Salary']]

Unnamed: 0,Age,Salary
0,28,50000
1,Thirty-two,60000
2,45,75000
3,,Eighty Thousand
4,33,52000
5,36,75000
6,29,100000
7,40,85000
8,forty-one,90K
9,,


In [236]:
# .dropna() → remove rows that contain any NaN (missing) values
# inplace=True → apply the change directly to the original DataFrame
df.dropna(inplace=True)

In [237]:
# Recheck the number of missing values
df.isna().sum()

Unnamed: 0,0
Name,0
Age,0
Date Joined,0
Salary,0
Remarks,0


In [238]:
# Check unique values in Name column
pd.DataFrame(df['Name'].unique(), columns=['Name'])

Unnamed: 0,Name
0,John Doe
1,john doe
2,John Doe
3,Jane SMITH
4,Jane Smith
5,JANE SMITH
6,Jack Brown
7,jack brown


In [239]:
# Standardize Name (Remove spaces & fix capitalization)
df['Name'] = df['Name'].str.strip().str.title()
df['Name']

Unnamed: 0,Name
0,John Doe
1,John Doe
2,John Doe
3,John Doe
4,Jane Smith
5,Jane Smith
6,Jane Smith
7,Jack Brown
8,Jack Brown
12,John Doe


In [240]:
# Find duplicate names after standardization
df.Name.duplicated()

Unnamed: 0,Name
0,False
1,True
2,True
3,True
4,False
5,True
6,True
7,False
8,True
12,True


In [241]:
# .dtype → returns the data type of the 'Age' column
# dtype('O') = object/str
df['Age'].dtype

dtype('O')

In [242]:
# pd.to_numeric() → attempts to convert values in 'Age' column to numeric type
# errors='coerce' → replaces any invalid (non-numeric) entries with NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

In [243]:
# Recheck data type of the 'Age' column
df['Age'].dtype

dtype('float64')

In [244]:
# Check unique values in 'Salary' column
df['Salary'].unique()

array(['50000', '60000', '75000', 'Eighty Thousand', 52000, 75000, 100000,
       '85000', '90K', 'Sixty Thousand', 102000, '70,000'], dtype=object)

In [245]:
# Replace inconsistent salary formats with standardized numeric values
df['Salary'] = df['Salary'].replace({
    'Eighty Thousand': 80000, 'Sixty Thousand': 60000, '90K': 90000, '70,000': 70000})

In [246]:
# pd.to_numeric() → attempts to convert values in 'Salary' column to numeric type
# errors='coerce' → replaces any invalid (non-numeric) entries with NaN
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

In [247]:
# Recheck unique values in 'Salary' column
df['Salary'].unique()

array([ 50000,  60000,  75000,  80000,  52000, 100000,  85000,  90000,
       102000,  70000])

In [248]:
# Confirm data type of 'Salary' column
df['Salary'].dtype

dtype('int64')

In [249]:
# Check the number of missing values in the Salary column
df.Salary.isna().sum()

np.int64(0)

In [250]:
# Convert Remarks column to lowercase and find unique values
df['Remarks'].str.lower().unique()

array(['good employee', 'n/a', 'excellent', '', 'great!',
       'needs improvement', 'none', 'fair', 'promising'], dtype=object)

In [251]:
# Standardize 'Remarks' column
# Convert all text to lowercase
# Replace common placeholders using a mapping dictionary
remarks_dict = {'n/a': 'unknown', 'none': 'unknown', '': 'unknown'}
df['Remarks'] = df['Remarks'].str.lower().replace(remarks_dict)

# Display the cleaned 'Remarks' column
df['Remarks']

Unnamed: 0,Remarks
0,good employee
1,unknown
2,excellent
3,good employee
4,unknown
5,unknown
6,great!
7,needs improvement
8,unknown
12,fair


In [252]:
# Check data type of 'Date Joined' column
# Data type: object/string
df['Date Joined'].dtype

dtype('O')

In [253]:
# Convert 'Date Joined' column to datetime format
# errors='coerce' ensures invalid/missing dates become NaT
# format='%d-%m-%Y' explicitly defines expected date structure (day-month-year)
# dayfirst=True is included for clarity, but ignored when format is specified
df['Date Joined'] = pd.to_datetime(df['Date Joined'], format='%d-%m-%Y', errors='coerce', dayfirst=True)


In [254]:
# Recheck summary of the DataFrame to check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 0 to 16
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Name         14 non-null     object        
 1   Age          9 non-null      float64       
 2   Date Joined  0 non-null      datetime64[ns]
 3   Salary       14 non-null     int64         
 4   Remarks      14 non-null     object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 672.0+ bytes


In [255]:
# Format 'Date Joined' column as string in 'DD-MM-YYYY' format using datetime accessor
df['Date Joined'] = df['Date Joined'].dt.strftime('%d-%m-%Y')
df.head()

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
0,John Doe,28.0,,50000,good employee
1,John Doe,,,60000,unknown
2,John Doe,45.0,,75000,excellent
3,John Doe,,,80000,good employee
4,Jane Smith,33.0,,52000,unknown


In [256]:
# Re-convert 'Date Joined' to datetime format to restore datetime functionality
# Format 'Date Joined' column as string in 'YYYY-MM-DD' format using datetime accessor
df['Date Joined'] = pd.to_datetime(df['Date Joined'], errors='coerce', dayfirst=True)
df['Date Joined'] = df['Date Joined'].dt.strftime('%Y-%m-%d')
df.head()

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
0,John Doe,28.0,,50000,good employee
1,John Doe,,,60000,unknown
2,John Doe,45.0,,75000,excellent
3,John Doe,,,80000,good employee
4,Jane Smith,33.0,,52000,unknown


In [257]:
# Recheck total duplicate rows in DataFrame
df.duplicated().sum()

np.int64(2)

In [258]:
# Display rows that are duplicates of earlier ones
df[df.duplicated()]

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
15,John Doe,45.0,,75000,excellent
16,John Doe,,,80000,good employee


In [259]:
# Drop duplicate rows from the DataFrame and overwrite the original
df = df.drop_duplicates()

In [260]:
# confirm rows 15 and 16 have been removed
df

Unnamed: 0,Name,Age,Date Joined,Salary,Remarks
0,John Doe,28.0,,50000,good employee
1,John Doe,,,60000,unknown
2,John Doe,45.0,,75000,excellent
3,John Doe,,,80000,good employee
4,Jane Smith,33.0,,52000,unknown
5,Jane Smith,36.0,,75000,unknown
6,Jane Smith,29.0,,100000,great!
7,Jack Brown,40.0,,85000,needs improvement
8,Jack Brown,,,90000,unknown
12,John Doe,28.0,,60000,fair


## Data Cleaning: Strings

Most of these methods are available on any string object (`str`) and are chainable for efficient cleaning and transformation.

| Method | Purpose / Use Case | Syntax Example |
|--------|--------------------|----------------|
| `strip()` | Remove leading/trailing whitespace | `s.strip()` |
| `lower()` | Convert all characters to lowercase | `s.lower()` |
| `upper()` | Convert all characters to uppercase | `s.upper()` |
| `title()` | Convert string to title case | `s.title()` |
| `capitalize()` | Capitalize first character | `s.capitalize()` |
| `swapcase()` | Swap case of each character | `s.swapcase()` |
| `replace()` | Replace substring with another | `s.replace("old", "new")` |
| `split()` | Split string into list by delimiter | `s.split(",")` |
| `join()` | Join list into string with delimiter | `",".join(list)` |
| `find()` | Find index of first occurrence of substring | `s.find("sub")` |
| `count()` | Count occurrences of substring | `s.count("sub")` |
| `startswith()` | Check if string starts with substring | `s.startswith("prefix")` |
| `endswith()` | Check if string ends with substring | `s.endswith("suffix")` |
| `isalpha()` | Check if all characters are alphabetic | `s.isalpha()` |
| `isdigit()` | Check if all characters are digits | `s.isdigit()` |
| `isalnum()` | Check if all characters are alphanumeric | `s.isalnum()` |
| `isspace()` | Check if string contains only whitespace | `s.isspace()` |
| `zfill()` | Pad numeric string with leading zeros | `s.zfill(5)` |
| `rjust()` | Right-align string within given width | `s.rjust(10)` |
| `ljust()` | Left-align string within given width | `s.ljust(10)` |
| `format()` | Insert variables into string template | `"Hello {}".format(name)` |

In [266]:
import pandas as pd

# Creating list with random strings
strings = ['  hello world  ', 'PYTHON  ', '  daTa SCIENCE  ']

# Clean each string: strip whitespace, convert to title case, and right-align to width 10
cleaned_strings = [s.strip().title().rjust(10) for s in strings]

print(cleaned_strings)

['Hello World', '    Python', 'Data Science']
