# MealMate Data Cleaning and EDA

### Import Statements

In [1]:
import pandas as pd

### Data Preprocessing

In [25]:
df = pd.read_csv('D:/Khushi/Python Practice/Python Projects/MealMate/data/meals_data.csv')

In [26]:
df.shape

(1960, 13)

In [27]:
df.columns

Index(['name', 'meal_type', 'diet_type', 'calories', 'protein', 'fats',
       'carbs', 'fiber', 'sugar', 'iron', 'sodium', 'cholesterol',
       'recipe_link'],
      dtype='object')

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1960 entries, 0 to 1959
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         1960 non-null   object 
 1   meal_type    1960 non-null   object 
 2   diet_type    1960 non-null   object 
 3   calories     1960 non-null   float64
 4   protein      1960 non-null   float64
 5   fats         1960 non-null   float64
 6   carbs        1960 non-null   float64
 7   fiber        1960 non-null   float64
 8   sugar        1960 non-null   float64
 9   iron         1960 non-null   float64
 10  sodium       1960 non-null   float64
 11  cholesterol  1960 non-null   float64
 12  recipe_link  1960 non-null   object 
dtypes: float64(9), object(4)
memory usage: 199.2+ KB


In [29]:
df.head()

Unnamed: 0,name,meal_type,diet_type,calories,protein,fats,carbs,fiber,sugar,iron,sodium,cholesterol,recipe_link
0,Powerhouse Almond Matcha Superfood Smoothie,breakfast,"gluten free, dairy free",281.18,10.33,12.94,33.78,7.08,20.05,2.79,252.79,0.0,https://spoonacular.com/recipes/Powerhouse-Alm...
1,Butternut Squash Frittata,breakfast,gluten free,462.09,23.89,3.99,96.03,16.56,23.67,8.22,336.24,7.42,https://spoonacular.com/recipes/Butternut-Squa...
2,Finger Foods: Frittata Muffins,breakfast,"gluten free, lacto ovo vegetarian, primal, ket...",654.73,49.45,44.88,12.9,3.51,5.84,5.64,964.99,1038.7,https://spoonacular.com/recipes/Finger-Foods:-...
3,Doughnuts,breakfast,lacto ovo vegetarian,430.24,11.36,2.04,91.33,2.99,19.34,4.5,602.56,3.64,https://spoonacular.com/recipes/Doughnuts-716276
4,Peanut Butter and Jelly Smoothie,breakfast,"gluten free, dairy free, fodmap friendly",779.48,19.58,35.76,103.92,7.57,65.38,2.42,364.08,0.0,https://spoonacular.com/recipes/Peanut-Butter-...


#### Handling Null Values

In [30]:
df.isnull().sum()

name           0
meal_type      0
diet_type      0
calories       0
protein        0
fats           0
carbs          0
fiber          0
sugar          0
iron           0
sodium         0
cholesterol    0
recipe_link    0
dtype: int64

DataSet is free from null values

#### Handling Duplicate Data

In [31]:
df.duplicated().sum()

np.int64(0)

The Data is free from duplicate values

#### Handling diet type

In the dataset, the non-veg meals are marked as unknown because data is extracted from spoonacular api which does not explicitly marks meals containing meat as non-veg, so we will do it manually

In [32]:
df['diet_type'] = df['diet_type'].replace('unknown', 'non vegetarian')

In [35]:
df[df['diet_type'] == 'non vegetarian'].head()

Unnamed: 0,name,meal_type,diet_type,calories,protein,fats,carbs,fiber,sugar,iron,sodium,cholesterol,recipe_link
24,"Open-Face Egg Sandwich with Bacon, Asparagus, ...",breakfast,non vegetarian,654.2,26.72,41.71,47.26,6.41,7.65,8.66,1252.54,196.97,https://spoonacular.com/recipes/Open-Face-Egg-...
31,Asparagus Eggs Benedict,breakfast,non vegetarian,779.55,34.72,57.46,30.62,2.77,1.8,4.67,1618.62,763.46,https://spoonacular.com/recipes/Asparagus-Eggs...
44,Simple Spinach and Tomato Frittata,breakfast,non vegetarian,162.82,13.45,8.66,8.38,1.73,4.2,3.38,190.25,327.36,https://spoonacular.com/recipes/Simple-Spinach...
57,Strawberry Shortcake w. Mini Strawberry PopTarts,breakfast,non vegetarian,306.38,3.56,12.79,45.17,3.56,17.01,1.68,192.47,4.24,https://spoonacular.com/recipes/Strawberry-Sho...
66,Best Breakfast Burrito,breakfast,non vegetarian,959.68,31.48,49.61,98.02,9.97,7.61,7.11,1330.26,65.14,https://spoonacular.com/recipes/Best-Breakfast...


#### Formatting diet_type Column

In [36]:
def preprocess_diet_types(df):
    # Handle missing and normalize text
    df['diet_type'] = df['diet_type'].fillna('').str.lower()
    
    # Convert to list of tags
    df['diet_type_list'] = df['diet_type'].apply(lambda x: [tag.strip() for tag in x.split(',')])
    
    # Extract all unique diet tags
    all_tags = sorted(set(tag for tags in df['diet_type_list'] for tag in tags if tag))

    # Add binary columns
    for tag in all_tags:
        col_name = f'{tag.replace(" ", "_")}'
        df[col_name] = df['diet_type_list'].apply(lambda x: int(tag in x))

    # Drop the helper column
    df.drop(columns=['diet_type_list'], inplace=True)

    return df

new_df = preprocess_diet_types(df=df)

In [39]:
new_df.columns

Index(['name', 'meal_type', 'diet_type', 'calories', 'protein', 'fats',
       'carbs', 'fiber', 'sugar', 'iron', 'sodium', 'cholesterol',
       'recipe_link', 'dairy_free', 'fodmap_friendly', 'gluten_free',
       'ketogenic', 'lacto_ovo_vegetarian', 'non_vegetarian', 'paleolithic',
       'pescatarian', 'primal', 'vegan', 'whole_30'],
      dtype='object')

In [40]:
new_df = new_df.drop(columns=['fodmap_friendly'])

In [43]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1960 entries, 0 to 1959
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  1960 non-null   object 
 1   meal_type             1960 non-null   object 
 2   diet_type             1960 non-null   object 
 3   calories              1960 non-null   float64
 4   protein               1960 non-null   float64
 5   fats                  1960 non-null   float64
 6   carbs                 1960 non-null   float64
 7   fiber                 1960 non-null   float64
 8   sugar                 1960 non-null   float64
 9   iron                  1960 non-null   float64
 10  sodium                1960 non-null   float64
 11  cholesterol           1960 non-null   float64
 12  recipe_link           1960 non-null   object 
 13  dairy_free            1960 non-null   int64  
 14  gluten_free           1960 non-null   int64  
 15  ketogenic            

#### Exporting Cleaned dataset as csv

In [None]:
cleaned_df.to_csv("D:/Khushi/Python Practice/Python Projects/MealMate/data/cleaned_data.csv", index=False)

NameError: name 'df' is not defined

In [3]:
# cleaned_csv info
cleaned_df = pd.read_csv('D:/Khushi/Python Practice/Python Projects/MealMate/data/cleaned_data.csv')
cleaned_df.columns

Index(['name', 'meal_type', 'diet_type', 'calories', 'protein', 'fats',
       'carbs', 'fiber', 'sugar', 'iron', 'sodium', 'cholesterol',
       'recipe_link', 'dairy_free', 'fodmap_friendly', 'gluten_free',
       'ketogenic', 'vegetarian', 'non_vegetarian', 'paleolithic',
       'pescatarian', 'primal', 'vegan', 'whole_30'],
      dtype='object')

In [4]:
cleaned_df['diet_type'] = cleaned_df['diet_type'].replace('lacto ovo vegetarian', 'vegetarian')

In [5]:
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1960 entries, 0 to 1959
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             1960 non-null   object 
 1   meal_type        1960 non-null   object 
 2   diet_type        1960 non-null   object 
 3   calories         1960 non-null   float64
 4   protein          1960 non-null   float64
 5   fats             1960 non-null   float64
 6   carbs            1960 non-null   float64
 7   fiber            1960 non-null   float64
 8   sugar            1960 non-null   float64
 9   iron             1960 non-null   float64
 10  sodium           1960 non-null   float64
 11  cholesterol      1960 non-null   float64
 12  recipe_link      1960 non-null   object 
 13  dairy_free       1960 non-null   int64  
 14  fodmap_friendly  1960 non-null   int64  
 15  gluten_free      1960 non-null   int64  
 16  ketogenic        1960 non-null   int64  
 17  vegetarian    

In [6]:
cleaned_df.to_csv("D:/Khushi/Python Practice/Python Projects/MealMate/data/cleaned_data.csv", index=False)