In [167]:
import pandas as pd
import numpy as np
import re

# Dummy Data

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1w8Bim7BZHVb3baY0hK4YBL-BPTzkZ-0T?usp=sharing)

We'll be using very small, synthetic datasets in this notebook so you can see EXACTLY what these cleaning methods do.

In [168]:
# -------------------------------
# 1. People Applications Dataset
# -------------------------------
people_data = {
    'fake_ssn': ['123-45-6789', '987-65-4321', '555-55-5555', '111-22-3333', '222-33-4444'],
    'first_name': ['John', 'JANE', 'Alice', 'Bob', 'eMILY'],
    'middle_name': ['Q. A.', 'b. C.', 'Marie-Lou', 'D.', 'Ann K.'],
    'last_name': ['Doe', 'Smith', 'O\'Connor', 'McDonald', 'Johnson'],
    'birthday': ['12/31/1990', '31/12/1985', '07/04/1975', '1980-05-15', '15/08/1992'],
    'ethnicity': ['Maroon Manatee', 'Pink Panda', 'Blue Buffalo', 'Golden Giraffe', 'Pink Panda'],
    'zip_code': ['12345-6789', '54321', '67890-1234', '11111-2222', '33333']
}

df_people = pd.DataFrame(people_data)

# -------------------------------
# 2. City Records Dataset
# -------------------------------
city_records_data = {
    'fake_ssn': ['123-45-6789', '987-65-4321', '555-55-5555', '111-22-3333', '222-33-4444'],
    'birthday': ['1990-12-31', '12/31/1985', '04/07/1975', '15-05-1980', '1992/08/15'],
    'income': ['$50,000', '60000', '$75,500', '80000', '$45,000'],
    'address': ['123 main st', '456     Oak Avenue', '789    Pine      Rd', '101  maple street', '202.   Birch     Blvd'],
    'tax_due': ['$1,200', '900', '$1,500', '1,100', '$800'],
    'housing_status': ['Rent', 'own', 'Rent', 'OWN', 'idk']
}

df_city = pd.DataFrame(city_records_data)

# ---------------------------------------------
# 3. Food Assistance Programs Dataset
# ---------------------------------------------
food_assist_data = {
    'fake_ssn': ['123-45-6789', '987-65-4321', '555-55-5555', '111-22-3333', '222-33-4444'],
    'birthday': ['31/12/1990', '1985-12-31', '07-04-1975', '15/05/1980', '08/15/1992'],
    'household_size': [3, 2, None, 4, 1],
    'monthly_expenses': ['$1,200', '950', '1,100', '$1,300', None],
    'program_threshold': ['$2000', '2,500', '$2200', '2500', '$2100'],
    'program_assigned': ['Food Aid A', 'FOOD aid B', 'food Aid A', 'Food aid B', 'None - Housing Status Unknown']
}

df_food = pd.DataFrame(food_assist_data)

# Data Cleaning and Preparation

A great resource for any pandas questions - summarizes all the basic functionality well. Bookmark it!
https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf


# df_people

In [169]:
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,12345-6789
1,987-65-4321,JANE,b. C.,Smith,31/12/1985,Pink Panda,54321
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,67890-1234


In [170]:
# Standardize name casing to Title Case
df_people['first_name'] = df_people['first_name'].str.title()
df_people['middle_name'] = df_people['middle_name'].str.title()
df_people['last_name'] = df_people['last_name'].str.title()
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,12345-6789
1,987-65-4321,Jane,B. C.,Smith,31/12/1985,Pink Panda,54321
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,67890-1234


Many functions return lists that you can manipulate. The most notorious class of functions that do these are string class functions, which require the use of the "str" modifier before calling the function.

In [171]:
df_people['zip_code'] = df_people['zip_code'].str.split('-')
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,"[12345, 6789]"
1,987-65-4321,Jane,B. C.,Smith,31/12/1985,Pink Panda,[54321]
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,"[67890, 1234]"


Refer to documentation for specific quirks of each function: https://pandas.pydata.org/docs/reference/api/pandas.Series.str.split.html

Interestingly, the split function can be accessed as a list! Because it is the output of a string function, it must also be used with the "str" modifier.

In [172]:
df_people['zip_code'] = df_people['zip_code'].str[0]
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,12345
1,987-65-4321,Jane,B. C.,Smith,31/12/1985,Pink Panda,54321
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,67890


Data columns are notoriously difficult to handle. It is absolutely worth it to do preliminary analysis to understand the different formats (if any) present in your data.

In [173]:
dates_df = df_people.birthday.reset_index(drop=True)
dates_df

Unnamed: 0,birthday
0,12/31/1990
1,31/12/1985
2,07/04/1975
3,1980-05-15
4,15/08/1992


Here, we have a decently messy column. Some dates are written in the European format, while others are written according to the "American" norm of putting year last.

Most dates are separated by slashes, but one has **dashes**?!

**Ahhhh!**

In [174]:
df_people.dtypes

Unnamed: 0,0
fake_ssn,object
first_name,object
middle_name,object
last_name,object
birthday,object
ethnicity,object
zip_code,object


When cleaning data, don't be afraid to quarantine difficult columns into their own DataFrame while you experiment on how to clean them.

Date columns should always be cast as **DateTime** type columns, not **Objects**!

In our case, we need to standardize both the formatting (ie: dashes vs. slashes) and format (European vs. American) of our date strings.

It's impossible to differentiate months and days if they are a value that is 12 or below. When there is ambiguity, use your best judgement to extract data you are confident about.

In this case, a good idea would be to extract birth year since it is a rule based column (starts with 19 or 20).

In [176]:
# Function to extract the year from a birthday string by splitting on '-' or '/'
def extract_year(bday_str):
    parts = re.split(r'[-/]', bday_str)

    for part in parts:
        if part.startswith('19') or part.startswith('20'):
            return part
    return np.nan

In [178]:
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,12345
1,987-65-4321,Jane,B. C.,Smith,31/12/1985,Pink Panda,54321
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,67890


In [179]:
df_people['year_str'] = df_people['birthday'].apply(extract_year)
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code,year_str
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,12345,1990
1,987-65-4321,Jane,B. C.,Smith,31/12/1985,Pink Panda,54321,1985
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,67890,1975


In [180]:
# Convert the 'year_str' column to a datetime column using only the year information
df_people['year_str'] = pd.to_datetime(df_people['year_str'], format='%Y', errors='coerce')  # Cast to datetime

In [181]:
df_people['year_str']

Unnamed: 0,year_str
0,1990-01-01
1,1985-01-01
2,1975-01-01
3,1980-01-01
4,1992-01-01


In [182]:
df_people['year_str'].dt.year

Unnamed: 0,year_str
0,1990
1,1985
2,1975
3,1980
4,1992


In [183]:
df_people['year_str'] = df_people['year_str'].dt.year
df_people.head(3)

Unnamed: 0,fake_ssn,first_name,middle_name,last_name,birthday,ethnicity,zip_code,year_str
0,123-45-6789,John,Q. A.,Doe,12/31/1990,Maroon Manatee,12345,1990
1,987-65-4321,Jane,B. C.,Smith,31/12/1985,Pink Panda,54321,1985
2,555-55-5555,Alice,Marie-Lou,O'Connor,07/04/1975,Blue Buffalo,67890,1975


In [184]:
df_people.dtypes

Unnamed: 0,0
fake_ssn,object
first_name,object
middle_name,object
last_name,object
birthday,object
ethnicity,object
zip_code,object
year_str,int32


## df_city

In [185]:
df_city.head(3)

Unnamed: 0,fake_ssn,birthday,income,address,tax_due,housing_status
0,123-45-6789,1990-12-31,"$50,000",123 main st,"$1,200",Rent
1,987-65-4321,12/31/1985,60000,456 Oak Avenue,900,own
2,555-55-5555,04/07/1975,"$75,500",789 Pine Rd,"$1,500",Rent


In [186]:
df_city['housing_status'] = df_city['housing_status'].str.lower()
df_city.head(3)

Unnamed: 0,fake_ssn,birthday,income,address,tax_due,housing_status
0,123-45-6789,1990-12-31,"$50,000",123 main st,"$1,200",rent
1,987-65-4321,12/31/1985,60000,456 Oak Avenue,900,own
2,555-55-5555,04/07/1975,"$75,500",789 Pine Rd,"$1,500",rent


In [187]:
 df_city['housing_status'] = df_city['housing_status'].str.strip()
 df_city.head(3)

Unnamed: 0,fake_ssn,birthday,income,address,tax_due,housing_status
0,123-45-6789,1990-12-31,"$50,000",123 main st,"$1,200",rent
1,987-65-4321,12/31/1985,60000,456 Oak Avenue,900,own
2,555-55-5555,04/07/1975,"$75,500",789 Pine Rd,"$1,500",rent


In [188]:
# Clean income: remove '$' and ',' then convert to numeric
df_city['income'] = df_city['income'].replace({'\$': '', ',': ''}, regex=True)
df_city.head(3)

Unnamed: 0,fake_ssn,birthday,income,address,tax_due,housing_status
0,123-45-6789,1990-12-31,50000,123 main st,"$1,200",rent
1,987-65-4321,12/31/1985,60000,456 Oak Avenue,900,own
2,555-55-5555,04/07/1975,75500,789 Pine Rd,"$1,500",rent


In [189]:
df_city.dtypes

Unnamed: 0,0
fake_ssn,object
birthday,object
income,object
address,object
tax_due,object
housing_status,object


After cleaning numeric columns, always cast them to proper numeric types so they can be used in aggregations, joins, and plotting functions without being treated as categorical variables.

In [190]:
df_city['income'] = pd.to_numeric(df_city['income'], errors='coerce')

In [191]:
df_city.dtypes

Unnamed: 0,0
fake_ssn,object
birthday,object
income,int64
address,object
tax_due,object
housing_status,object


In [192]:
df_city.head(3)

Unnamed: 0,fake_ssn,birthday,income,address,tax_due,housing_status
0,123-45-6789,1990-12-31,50000,123 main st,"$1,200",rent
1,987-65-4321,12/31/1985,60000,456 Oak Avenue,900,own
2,555-55-5555,04/07/1975,75500,789 Pine Rd,"$1,500",rent


In [193]:
# Clean tax_due: remove '$' and ',' then convert to numeric
df_city['tax_due'] = df_city['tax_due'].replace({'\$': '', ',': ''}, regex=True)
df_city['tax_due'] = pd.to_numeric(df_city['tax_due'], errors='coerce')
df_city.head(3)

Unnamed: 0,fake_ssn,birthday,income,address,tax_due,housing_status
0,123-45-6789,1990-12-31,50000,123 main st,1200,rent
1,987-65-4321,12/31/1985,60000,456 Oak Avenue,900,own
2,555-55-5555,04/07/1975,75500,789 Pine Rd,1500,rent


In [194]:
df_city.dtypes

Unnamed: 0,0
fake_ssn,object
birthday,object
income,int64
address,object
tax_due,int64
housing_status,object


# df_food

In [195]:
df_food.head(3)

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned
0,123-45-6789,31/12/1990,3.0,"$1,200",$2000,Food Aid A
1,987-65-4321,1985-12-31,2.0,950,2500,FOOD aid B
2,555-55-5555,07-04-1975,,1100,$2200,food Aid A


In [196]:
df_food['program_assigned'] = df_food['program_assigned'].str.lower().str.strip()
df_food.head(3)

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned
0,123-45-6789,31/12/1990,3.0,"$1,200",$2000,food aid a
1,987-65-4321,1985-12-31,2.0,950,2500,food aid b
2,555-55-5555,07-04-1975,,1100,$2200,food aid a


In [197]:
df_food['birth_year'] = df_food['birthday'].apply(extract_year)
df_food['birth_year'] = pd.to_datetime(df_food['birth_year'], format='%Y', errors='coerce')
df_food['birth_year'] = df_food['birth_year'].dt.year
df_food.head(3)

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned,birth_year
0,123-45-6789,31/12/1990,3.0,"$1,200",$2000,food aid a,1990
1,987-65-4321,1985-12-31,2.0,950,2500,food aid b,1985
2,555-55-5555,07-04-1975,,1100,$2200,food aid a,1975


In [198]:
df_food['monthly_expenses'] = df_food['monthly_expenses'].replace({'\$': '', ',': ''}, regex=True)
df_food['monthly_expenses'] = pd.to_numeric(df_food['monthly_expenses'], errors='coerce')

df_food['program_threshold'] = df_food['program_threshold'].replace({'\$': '', ',': ''}, regex=True)
df_food['program_threshold'] = pd.to_numeric(df_food['program_threshold'], errors='coerce')
df_food.head(3)

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned,birth_year
0,123-45-6789,31/12/1990,3.0,1200.0,2000,food aid a,1990
1,987-65-4321,1985-12-31,2.0,950.0,2500,food aid b,1985
2,555-55-5555,07-04-1975,,1100.0,2200,food aid a,1975


# Advanced Cleaning Functions

Thus far, we haven't worried about missing or blank values.

Let's simulate data as you'll actually find it in real-world problems - not perfecly clean and complete, but full of missing values.

In [199]:
df_food

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned,birth_year
0,123-45-6789,31/12/1990,3.0,1200.0,2000,food aid a,1990
1,987-65-4321,1985-12-31,2.0,950.0,2500,food aid b,1985
2,555-55-5555,07-04-1975,,1100.0,2200,food aid a,1975
3,111-22-3333,15/05/1980,4.0,1300.0,2500,food aid b,1980
4,222-33-4444,08/15/1992,1.0,,2100,none - housing status unknown,1992


# Missing Values and NaNs

In Pandas, missing or blank values are called "Not A Number" values, or NaNs!

In [200]:
nan_list = [np.nan, np.nan, "DS 3001"]
nan_list

[nan, nan, 'DS 3001']

In [201]:
nan_df = pd.DataFrame(nan_list)
nan_df

Unnamed: 0,0
0,
1,
2,DS 3001


In [202]:
nan_df.dtypes

Unnamed: 0,0
0,object


Calling df.isna() will return a dataframe full of booleans, which are (intuitively) True for data point that are null, and False for those which are non-null.

In [203]:
nan_df.isna()

Unnamed: 0,0
0,True
1,True
2,False


You can call sum on isna() to count the NaNs in each column. Checking for missing values is **a fundamental step** of EDA.

In [204]:
nan_df.isna().apply(sum)

Unnamed: 0,0
0,2


You can apply this to a single column or an entire dataframe.

In [205]:
df_food.isna().apply(sum)

Unnamed: 0,0
fake_ssn,0
birthday,0
household_size,1
monthly_expenses,1
program_threshold,0
program_assigned,0
birth_year,0


When applying functions that need DataFrames and not Series objects, make sure to use the double bracket syntax.

In [206]:
df_food[['monthly_expenses']].isna().apply(sum)

Unnamed: 0,0
monthly_expenses,1


In [207]:
df_food[['monthly_expenses', 'household_size']].isna().apply(sum)

Unnamed: 0,0
monthly_expenses,1
household_size,1


We can handle missing data in several ways. The method you choose should be determined by its suitability to the problem at hand—much like machine learning, there isn't a universal solution that you can apply in every case.

In [208]:
nan_df = pd.DataFrame({"col 1": [np.nan, np.nan, 1, 2], "col 2": [np.nan, 5, np.nan, 2], "col 3": [9, 8, 7, 6]})
nan_df

Unnamed: 0,col 1,col 2,col 3
0,,,9
1,,5.0,8
2,1.0,,7
3,2.0,2.0,6


Using .dropna() in its base form will drop **every row that has a missing value**. Ensure you understand what proportion of data been dropped when using this.

In [209]:
nan_df.dropna()

Unnamed: 0,col 1,col 2,col 3
3,2.0,2.0,6


If instead you only wanted to drop rows with NaN vals *in a certain set of columns*, you can specify subset=['listofcolumns']

In [210]:
nan_df.dropna(subset=['col 1'])

Unnamed: 0,col 1,col 2,col 3
2,1.0,,7
3,2.0,2.0,6


In [211]:
nan_df.dropna(subset=['col 2'])

Unnamed: 0,col 1,col 2,col 3
1,,5.0,8
3,2.0,2.0,6


However, dropping every row that contains a missing value is usually pretty costly. In our example, we sacrificed over 50% of our data.

ML algorithms need to be trained on as much data as you can possibly give them to ensure good prective performance, so we need to find a different way to handle NaNs if there are more than a few rows containing them!

Occasionally, there will be a few **columns** which are sparse or poorly collected, and have missing data for a great majority of the dataset. It is common practice to *drop all columns with greater than n% missing data*, where n is a threshold chosen by the practitioner.

As an exercise, let's code a function that we can use every time we want to preprocess in this way!


In [212]:
def get_proportion_null(df):
  return df.isna().sum()/len(df)

get_proportion_null(nan_df)

Unnamed: 0,0
col 1,0.5
col 2,0.5
col 3,0.0


In [213]:
def get_percentage_null(df):
  percentage_null = df.isna().sum()/len(df)
  return percentage_null * 100

get_percentage_null(nan_df)

Unnamed: 0,0
col 1,50.0
col 2,50.0
col 3,0.0


In [214]:
def drop_columns_with_high_nulls(df, threshold=0.5):
    # Calculate the fraction of NaN values in each column
    null_fraction = df.isnull().mean()

    # Identify columns where the fraction is greater than the threshold
    cols_to_drop = null_fraction[null_fraction > threshold].index

    return df.drop(columns=cols_to_drop)

drop_columns_with_high_nulls(nan_df, 0.6)

Unnamed: 0,col 1,col 2,col 3
0,,,9
1,,5.0,8
2,1.0,,7
3,2.0,2.0,6


In [215]:
drop_columns_with_high_nulls(nan_df, 0.5)

Unnamed: 0,col 1,col 2,col 3
0,,,9
1,,5.0,8
2,1.0,,7
3,2.0,2.0,6


In [216]:
drop_columns_with_high_nulls(nan_df, 0.49)

Unnamed: 0,col 3
0,9
1,8
2,7
3,6


In [217]:
drop_columns_with_high_nulls(nan_df, 0)

Unnamed: 0,col 3
0,9
1,8
2,7
3,6


## Imputation

In [218]:
nan_df = pd.DataFrame({"col 1": [np.nan, 13, 20, 50, 13], "col 2": [np.nan, 5, np.nan, 2, 5], "col 3": [9, 8000, 70000, 6, 6.5]})
nan_df

Unnamed: 0,col 1,col 2,col 3
0,,,9.0
1,13.0,5.0,8000.0
2,20.0,,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


In [219]:
nan_df.fillna(0)  # Fill all NaN values with 0

Unnamed: 0,col 1,col 2,col 3
0,0.0,0.0,9.0
1,13.0,5.0,8000.0
2,20.0,0.0,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


In [220]:
nan_df.fillna("I'm empty!!!")

Unnamed: 0,col 1,col 2,col 3
0,I'm empty!!!,I'm empty!!!,9.0
1,13.0,5.0,8000.0
2,20.0,I'm empty!!!,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


In [221]:
nan_copy = nan_df.copy()

# For each column, fill NaNs with the column's mean
for col in nan_copy.columns:
    mean_value = nan_df[col].mean()  # Calculate the mean of the column
    nan_copy[col] = nan_copy[col].fillna(mean_value)  # Fill NaNs with the mean

In [222]:
nan_df

Unnamed: 0,col 1,col 2,col 3
0,,,9.0
1,13.0,5.0,8000.0
2,20.0,,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


In [223]:
nan_copy

Unnamed: 0,col 1,col 2,col 3
0,24.0,4.0,9.0
1,13.0,5.0,8000.0
2,20.0,4.0,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


In [224]:
nan_median = nan_df.copy()

# For each column, fill NaNs with the column's median
for col in nan_copy.columns:
    median_value = nan_df[col].median()  # Calculate the median of the column
    nan_median[col] = nan_median[col].fillna(median_value)  # Fill NaNs with the median
nan_median

Unnamed: 0,col 1,col 2,col 3
0,16.5,5.0,9.0
1,13.0,5.0,8000.0
2,20.0,5.0,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


In [225]:
nan_mode = nan_df.copy()

# For each column, fill NaNs with the column's mode
for col in nan_copy.columns:
    mode_value = nan_df[col].mode().iloc[0]  # Calculate the mode of the column
    nan_mode[col] = nan_mode[col].fillna(mode_value)  # Fill NaNs with the mode
nan_mode

Unnamed: 0,col 1,col 2,col 3
0,13.0,5.0,9.0
1,13.0,5.0,8000.0
2,20.0,5.0,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


Data that is logically ordered, like **time-series** or highly correlated data, tends to have high correlations between sequential observations.

This is called **serial correlation.** Because of it, the best guess at any given missing value is often the observation before or after it.


Forward fill fills with the value in the previous row. This is not suitable for datasets where a value is blank in the first row.

In [226]:
nan_df.fillna(method='ffill')

  nan_df.fillna(method='ffill')


Unnamed: 0,col 1,col 2,col 3
0,,,9.0
1,13.0,5.0,8000.0
2,20.0,5.0,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


Back fill fills the value with the next row.

In [227]:
nan_df.fillna(method='bfill')

  nan_df.fillna(method='bfill')


Unnamed: 0,col 1,col 2,col 3
0,13.0,5.0,9.0
1,13.0,5.0,8000.0
2,20.0,2.0,70000.0
3,50.0,2.0,6.0
4,13.0,5.0,6.5


Reminder (truly can't stress this enough): **don't use ffill or bfill unless you're using time series data or data that has some other natural ordering.**

If the data isn't ordered, choosing the point before or after a DataFrame is completely arbitrary, and dependent on whatever nonsensical order the data came in.

In [228]:
df_food

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned,birth_year
0,123-45-6789,31/12/1990,3.0,1200.0,2000,food aid a,1990
1,987-65-4321,1985-12-31,2.0,950.0,2500,food aid b,1985
2,555-55-5555,07-04-1975,,1100.0,2200,food aid a,1975
3,111-22-3333,15/05/1980,4.0,1300.0,2500,food aid b,1980
4,222-33-4444,08/15/1992,1.0,,2100,none - housing status unknown,1992


In [229]:
# Impute missing values: fill missing values with the median of column
median_household_size = df_food['household_size'].median()
df_food['household_size'] = df_food['household_size'].fillna(median_household_size)

median_expenses = df_food['monthly_expenses'].median()
df_food['monthly_expenses'] = df_food['monthly_expenses'].fillna(median_expenses)
df_food

Unnamed: 0,fake_ssn,birthday,household_size,monthly_expenses,program_threshold,program_assigned,birth_year
0,123-45-6789,31/12/1990,3.0,1200.0,2000,food aid a,1990
1,987-65-4321,1985-12-31,2.0,950.0,2500,food aid b,1985
2,555-55-5555,07-04-1975,2.5,1100.0,2200,food aid a,1975
3,111-22-3333,15/05/1980,4.0,1300.0,2500,food aid b,1980
4,222-33-4444,08/15/1992,1.0,1150.0,2100,none - housing status unknown,1992
