# Testing Jupyter Notebook in Git

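One, two, three

This notebook builds a small synthetic dataset with `Age` and `Sex` columns that contain missing values and inconsistently cased labels, inspects it, and then cleans it.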

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build the raw sample columns
np.random.seed(0)  # Fixed seed so every run draws the same values

# 'Age': 50 draws from the ages 25..62 plus None as the missing-value marker
age_data = np.random.choice(
    [*range(25, 63), None],
    size=50,
)

# 'Sex': 50 draws from inconsistently cased labels plus None as the missing-value marker
sex_data = np.random.choice(
    ['Male', 'Female', 'Other', 'MALE', 'FEMALE', 'OTHER', None],
    size=50,
)

In [3]:
# Assemble the two sample columns into one DataFrame ('Age' first, then 'Sex')
df = pd.DataFrame(data={'Age': age_data, 'Sex': sex_data})

In [4]:
# Report the dtype of each column before any cleaning
print("Initial Data Types:", df.dtypes, sep="\n")

Initial Data Types:
Age    object
Sex    object
dtype: object


In [5]:
# Preview the first 5 rows of the raw data
print("\nFirst 5 Rows:", df.head(), sep="\n")


First 5 Rows:
  Age     Sex
0  25    Male
1  28  FEMALE
2  28  Female
3  34  FEMALE
4  44  Female


In [6]:
# Tally the missing values in each column
print("\nNull Counts by Column:", df.isnull().sum(), sep="\n")


Null Counts by Column:
Age    2
Sex    5
dtype: int64


In [7]:
# Data cleaning function
def clean_data(df):
    """Return a cleaned copy of ``df`` with no missing 'Age' or 'Sex' values.

    The input DataFrame is left untouched; all changes are made on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Age' column of numeric values (missing entries
        allowed, object dtype accepted) and a 'Sex' column of strings
        (missing entries allowed).

    Returns
    -------
    pandas.DataFrame
        A new DataFrame in which missing 'Age' values are replaced by the
        column median, and 'Sex' is lowercased with missing values replaced
        by 'unknown'.

    Raises
    ------
    ValueError
        If 'Age' contains values that cannot be parsed as numbers.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    cleaned = df.copy()

    # Convert 'Age' to a numeric dtype first (None becomes NaN), so the median
    # and the fill operate on numbers rather than on an object column.
    age = pd.to_numeric(cleaned['Age'])

    # Assign the filled column back instead of using a chained
    # `fillna(..., inplace=True)`, which does not update the frame under
    # pandas Copy-on-Write.
    cleaned['Age'] = age.fillna(age.median())

    # Standardize 'Sex' to lowercase and fill missing values with 'unknown'
    cleaned['Sex'] = cleaned['Sex'].str.lower().fillna('unknown')
    return cleaned

In [8]:
# Clean a copy of the DataFrame so the raw `df` inspected in the earlier cells
# stays unchanged and `df_cleaned` is an independent object, not an alias of `df`
df_cleaned = clean_data(df.copy())

## Final results

In [9]:
# Report dtypes, a 5-row preview, and remaining null counts for the cleaned DataFrame
print("\nFinal Data Types:", df_cleaned.dtypes, sep="\n")

print("\nFirst 5 Rows of Cleaned DataFrame:", df_cleaned.head(), sep="\n")

print("\nNull Counts by Column After Cleaning:", df_cleaned.isnull().sum(), sep="\n")


Final Data Types:
Age    float64
Sex     object
dtype: object

First 5 Rows of Cleaned DataFrame:
    Age     Sex
0  25.0    male
1  28.0  female
2  28.0  female
3  34.0  female
4  44.0  female

Null Counts by Column After Cleaning:
Age    0
Sex    0
dtype: int64
