### Data cleaning

Links:
https://pandas.pydata.org/docs/reference/api/pandas.Series.str.extract.html
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html
https://www.kaggle.com/datasets/borapajo/food-choices?select=food_coded.csv

In [1]:
import pandas as pd

# Load the raw survey responses; the CSV is expected in the working directory.
food = pd.read_csv('food_coded.csv')

food.info()

# GPA is handled as text here (hence the .str accessor). Pull the first number
# out of each entry:
#   * the decimal point is escaped so it matches only a literal '.', not any
#     character (an unescaped '.' would accept things like "3x5");
#   * the fractional part is optional so whole-number entries such as "3" are
#     parsed instead of turning into NaN and being replaced by the mean.
# Entries containing no digits at all become NaN. expand=False makes extract
# return a Series rather than a one-column DataFrame.
food['GPA_clean'] = (
    food['GPA']
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)

# Mean of the GPAs that parsed successfully (mean() skips NaN by default).
food_gpa_mean = food['GPA_clean'].mean()

# Rather than apply, use the built-in pandas method to impute the missing GPAs.
# Bracket assignment is used because attribute-style column assignment is easy
# to get wrong (it silently creates an attribute if the column does not exist).
food['GPA_clean'] = food['GPA_clean'].fillna(food_gpa_mean)

food.columns

# One-hot encode the three categorical columns, each with its own prefix.
genders = pd.get_dummies(food['Gender'], prefix='gender')
employment = pd.get_dummies(food['employment'], prefix='employ')
fe = pd.get_dummies(food['father_education'], prefix='father')

# Alternative: let pandas encode the columns in place and return the whole
# dataframe with the encoded columns.
# pd.get_dummies(food, columns=['Gender', 'employment', 'father_education'])

# Columns carried over unchanged from the original dataframe.
# NOTE(review): 'GPA_clean' is not listed here, so the cleaned GPA column is
# not part of the exported file - confirm this is intended.
keep_cols = ['income', 'healthy_feeling', 'life_rewarding']

# Put the preserved columns and the dummy columns side by side, then discard
# any row that still has a missing value.
food_final = pd.concat(
    [food[keep_cols], genders, employment, fe],
    axis=1,
).dropna()

food_final.info()

# Write the cleaned table without the index column.
food_final.to_csv('food_cleaned.csv', index=False)