# Demonstrating PyJanitor for Data Cleaning and Transformation

## 1. Install and Import Required Libraries

If running locally, uncomment the next line:
```bash
!pip install pyjanitor pandas
```

In [1]:
import pandas as pd
import janitor

In [2]:
## 2. Create a Sample DataFrame

data = {
    'First Name': ['Alice', 'Bob', 'Charlie', None],
    'Last Name': ['Smith', 'Jones', 'Brown', 'Davis'],
    'Age': [25, 30, None, 22],
    'Income($)': [50000, 60000, 55000, None]
}
df = pd.DataFrame(data)
df

Unnamed: 0,First Name,Last Name,Age,Income($)
0,Alice,Smith,25.0,50000.0
1,Bob,Jones,30.0,60000.0
2,Charlie,Brown,,55000.0
3,,Davis,22.0,


In [3]:
## 3. Clean Column Names

# PyJanitor makes it easy to clean column names for easier access
df_clean = df.clean_names()
df_clean

Unnamed: 0,first_name,last_name,age,income_$_
0,Alice,Smith,25.0,50000.0
1,Bob,Jones,30.0,60000.0
2,Charlie,Brown,,55000.0
3,,Davis,22.0,


In [4]:
## 4. Remove Rows with Missing Values

# Remove rows where any value is missing
df_no_missing = df_clean.dropna()
df_no_missing

Unnamed: 0,first_name,last_name,age,income_$_
0,Alice,Smith,25.0,50000.0
1,Bob,Jones,30.0,60000.0


In [6]:
## 5. Fill Missing Values

# Fill missing ages with the mean age
df_filled = df_clean.fillna({'age': df_clean['age'].mean(), 'income_$_': df_clean['income_$_'].mean()})
df_filled

Unnamed: 0,first_name,last_name,age,income_$_
0,Alice,Smith,25.0,50000.0
1,Bob,Jones,30.0,60000.0
2,Charlie,Brown,25.666667,55000.0
3,,Davis,22.0,55000.0


In [8]:
## 6. Add a New Column Based on Existing Data

# Add a column for income in thousands
df_new = df_filled.add_column('income_k', df_filled['income_$_'] / 1000)
df_new

  return method(self._obj, *args, **kwargs)


Unnamed: 0,first_name,last_name,age,income_$_,income_k
0,Alice,Smith,25.0,50000.0,50.0
1,Bob,Jones,30.0,60000.0,60.0
2,Charlie,Brown,25.666667,55000.0,55.0
3,,Davis,22.0,55000.0,55.0


In [9]:
## 7. Filter Rows Easily

# Filter rows where age is greater than 24
df_filtered = df_new.filter_on('age > 24')
df_filtered

  return method(self._obj, *args, **kwargs)
  return method(self._obj, *args, **kwargs)


Unnamed: 0,first_name,last_name,age,income_$_,income_k
0,Alice,Smith,25.0,50000.0,50.0
1,Bob,Jones,30.0,60000.0,60.0
2,Charlie,Brown,25.666667,55000.0,55.0


In [12]:
## 8. Chaining Multiple Operations

# PyJanitor encourages method chaining for readable code
df_chain = (
    df
    .clean_names()
    .dropna(subset=['age'])
    # Use pandas query instead of filter_on for special characters in column names
    .query('`income_$_` > 50000')
    .add_column('full_name', lambda x: x['first_name'] + ' ' + x['last_name'])
)
df_chain

  return method(self._obj, *args, **kwargs)


Unnamed: 0,first_name,last_name,age,income_$_,full_name
1,Bob,Jones,30.0,60000.0,<function <lambda> at 0x000001B0A2566480>


## 9. Conclusion

PyJanitor simplifies and streamlines data cleaning and transformation tasks, making your pandas workflows more readable and efficient.

For more, see: https://pyjanitor-devs.github.io/pyjanitor/