In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy "persons" table with deliberately messy data: genuine missing values
# (np.nan / None) sit next to placeholder strings ('NA', 'Missing') that
# pandas treats as ordinary text, not as missing data.
pdf_persons = pd.DataFrame(
    data=dict(
        first=['Efi', 'Fritz', 'Dixie', 'Theodor', np.nan, None, 'NA'],
        last=['Coman', 'Johansen', 'Normus', 'Normus', np.nan, np.nan, 'Missing'],
        email=[
            'EfiComan@purim.org',
            'Fandango@bar.gov',
            'NormusDixie@gmail.com',
            None,
            np.nan,
            'Anonymous@email.com',
            'NA',
        ],
        age=['33', '55', '63', '36', None, None, 'Missing'],
    )
)
# Load the public survey results; the path is relative to the current working directory.
pdf_survey = pd.read_csv(filepath_or_buffer='./data/survey_results_public.csv')

In [None]:
# Display the persons frame as it currently stands
pdf_persons

In [None]:
# With no arguments, dropna() drops every row that has at least one missing value.
# It returns a new frame; pdf_persons itself is left unchanged.
pdf_persons.dropna()

In [None]:
# axis='index' and how='any' are the defaults, so this gives the same result as a bare dropna()
pdf_persons.dropna(axis='index', how='any')

In [None]:
# how='all': drop a row only if every one of its values is missing
pdf_persons.dropna(axis='index', how='all')

In [None]:
# Drop any column whose values are all missing; no column in this frame qualifies, so nothing is dropped
pdf_persons.dropna(axis='columns', how='all')

In [None]:
# To be more specific: drop a row only when its email is missing.
# With a single column in `subset`, how='any' and how='all' select the same rows.
pdf_persons.dropna(axis='index', how='any', subset=['email'])


In [None]:
# Turn our custom placeholder strings ('NA', 'Missing') into proper NaNs in one pass
pdf_persons = pdf_persons.replace(to_replace=['NA', 'Missing'], value=np.nan)
pdf_persons


In [None]:
# Get a boolean mask that is True wherever the DataFrame has a missing value.
# Note that Python's None is treated as missing too, just like NaN.
pdf_persons.isna()

In [None]:
# Substitute one predetermined value for every missing cell.
# Typical use: the value is a grade for a task that was never submitted,
# in which case the NaN should be replaced with 0.
pdf_persons.fillna(value='bobo_is_missing')


In [None]:
# Fill per column: the dict maps column name -> replacement value;
# columns that are not listed keep their NaNs.
pdf_persons.fillna({'age': '0'})


In [None]:
# What's the average age?
# `age` still holds strings here, so a mean is not meaningful and pandas is
# expected to raise a TypeError. The failure is the point of this cell, so
# catch it and show the message; that way "Restart Kernel -> Run All" keeps
# going instead of stopping at this demonstration.
try:
    naive_age_mean = pdf_persons.age.mean()
except TypeError as err:
    naive_age_mean = None  # nothing sensible to display
    print(f"{type(err).__name__}: {err}")
naive_age_mean

In [None]:
# We need to convert the column to a numeric type; start by checking the current dtypes
pdf_persons.dtypes

In [None]:
# age is currently a string-typed column; try casting it to int.
# The column contains NaNs, which have no integer representation, so this
# cast is expected to fail. Catch the error and show it so the notebook
# still runs top to bottom; on failure pdf_persons is left untouched.
try:
    pdf_persons['age'] = pdf_persons['age'].astype(int)
except (ValueError, TypeError) as err:
    print(f"{type(err).__name__}: {err}")

In [None]:
# A plain integer column cannot hold NaN (np.nan is itself a float),
# so cast the age column to float instead.
age_as_float = pdf_persons['age'].astype(float)
pdf_persons['age'] = age_as_float
pdf_persons

In [None]:
# Check the column dtypes again
pdf_persons.dtypes

In [None]:
# With age stored as numbers, mean() gives the correct average (NaNs are skipped by default)
pdf_persons['age'].mean()

In [None]:
# Shall we clean up something from the real world?
# What's the average number of coding years in the SOF survey?

pdf_survey['YearsCode']

# The dtype shows as object: the values are stored as strings, not numbers.

In [None]:
# Try to cast into integer?
# This cast is expected to fail on the raw column. The failure is the
# demonstration, so catch it and show the message rather than letting the
# exception stop a "Restart Kernel -> Run All".
try:
    years_code_int_attempt = pdf_survey.YearsCode.astype(int)
except (ValueError, TypeError) as err:
    years_code_int_attempt = None  # nothing to display when the cast fails
    print(f"{type(err).__name__}: {err}")
years_code_int_attempt

In [None]:
# OK, we must use float then!
# Free-text answers in the column (e.g. 'More than 50 years') cannot be parsed
# as numbers, so this cast is expected to fail as well. Catch the error and
# show it so the notebook still runs top to bottom.
try:
    years_code_float_attempt = pdf_survey.YearsCode.astype(float)
except (ValueError, TypeError) as err:
    years_code_float_attempt = None  # nothing to display when the cast fails
    print(f"{type(err).__name__}: {err}")
years_code_float_attempt

In [None]:
# Dang, who put that 'More than 50 years' in that column?!
# Cleaning is not always easy! List the distinct values to see what needs fixing.

pdf_survey['YearsCode'].unique()

In [None]:
# Map the free-text YearsCode answers to numeric strings so the column can be cast later.
to_replace = dict([
    ('More than 50 years', '50'),
    ('Less than 1 year', '0'),
])

# Preview only (nothing is assigned): the distinct values after applying the mapping
pdf_survey['YearsCode'].replace(to_replace).unique()

# Yes, looking better!

In [None]:
# Store the cleaned values back into the YearsCode column
cleaned_years_code = pdf_survey['YearsCode'].replace(to_replace)
pdf_survey['YearsCode'] = cleaned_years_code

In [None]:
# Now let's try the cast to float again
years_code_as_float = pdf_survey['YearsCode'].astype(float)
pdf_survey['YearsCode'] = years_code_as_float

In [None]:
# Finally, compute the average number of coding years
pdf_survey['YearsCode'].mean()