In [2]:
import pandas as pd
import numpy as np

# Show all rows
pd.set_option('display.max_rows', None)

# Load data
df = pd.read_csv("food_coded.csv")

# Show before cleaning (entire DataFrame subset)
print("Before Cleaning:")
print(df.info())
print(df.isnull().sum())

# Clean GPA and weight
df['GPA'] = pd.to_numeric(df['GPA'], errors='coerce')
df['weight'] = df['weight'].astype(str).str.extract(r'(\d+\.?\d*)')
df['weight'] = pd.to_numeric(df['weight'], errors='coerce')

# Drop columns with more than 20% missing values
df = df.dropna(thresh=len(df) * 0.8, axis=1)

# Drop duplicate column if exists
if 'comfort_food_reasons_coded.1' in df.columns:
    df = df.drop(columns=['comfort_food_reasons_coded.1'])

# Fill missing numeric with mean
for col in df.select_dtypes(include='number'):
    df[col].fillna(df[col].mean(), inplace=True)

# Fill missing categorical with mode
for col in df.select_dtypes(include='object'):
    df[col].fillna(df[col].mode()[0], inplace=True)

# Show after cleaning (entire DataFrame subset)
print("\nAfter Cleaning:")
print(df[['GPA', 'weight', 'calories_day', 'comfort_food_reasons']])


Before Cleaning:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 61 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   GPA                           123 non-null    object 
 1   Gender                        125 non-null    int64  
 2   breakfast                     125 non-null    int64  
 3   calories_chicken              125 non-null    int64  
 4   calories_day                  106 non-null    float64
 5   calories_scone                124 non-null    float64
 6   coffee                        125 non-null    int64  
 7   comfort_food                  124 non-null    object 
 8   comfort_food_reasons          123 non-null    object 
 9   comfort_food_reasons_coded    106 non-null    float64
 10  cook                          122 non-null    float64
 11  comfort_food_reasons_coded.1  125 non-null    int64  
 12  cuisine                       108 non-null    f

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
