### Cleaning Fast_food dataset

In [42]:
import numpy as np
import pandas as pd

# Location of the fast-food CSV in the project repository.
FAST_FOOD_URL = "https://raw.githubusercontent.com/houleye45/group3_week4_project/main/data/clean/Items%20selected%20for%20analysis%20-%20fast_foot.csv"

# Read the remote CSV into a DataFrame.
fast_food = pd.read_csv(FAST_FOOD_URL)

# Display the column labels exactly as they were loaded.
fast_food.columns

Index(['restaurant', 'food_name', 'calories_kcal', 'total_fat_g',
       'saturated_fat_g', 'cholesterol_mg', 'sodium_g', 'total_carb',
       'fiber_g', 'sugar_g', 'protein_g', 'vitamin_A_mg', 'vitamin_C_mg',
       'calcium_mg'],
      dtype='object')

In [43]:
# Bring the fast-food column labels in line with the naming scheme used in this
# notebook. DataFrame.rename skips mapping keys that are not present in the
# frame, so only labels that actually exist are changed (judging by the column
# listing displayed for the loaded file, that is just "total_carb").
new_columns = {
    "restaurant": "restaurant",
    "item": "food_name",
    "calories": "calories_kcal",
    "total_fat": "total_fat_g",
    "sat_fat": "saturated_fat_g",
    "cholesterol": "cholesterol_mg",
    "sodium": "sodium_g",
    "total_carb": "carbohydrates_g",
    "fiber": "fiber_g",
    "sugar": "sugar_g",
    "protein": "protein_g",
    "vit_a": "vitamin_A_mg",
    "vit_c": "vitamin_C_mg",
    "calcium": "calcium_mg",
}

fast_food = fast_food.rename(columns=new_columns)

# Display the labels after renaming.
fast_food.columns

Index(['restaurant', 'food_name', 'calories_kcal', 'total_fat_g',
       'saturated_fat_g', 'cholesterol_mg', 'sodium_g', 'carbohydrates_g',
       'fiber_g', 'sugar_g', 'protein_g', 'vitamin_A_mg', 'vitamin_C_mg',
       'calcium_mg'],
      dtype='object')

In [44]:
# Keep a single row per distinct food_name (the first occurrence is retained).
# NOTE(review): the key ignores `restaurant`, so identically named items from
# different restaurants would collapse into one row - confirm this is intended.
fast_food = fast_food.drop_duplicates(subset=['food_name'])

# Frequency of each remaining name; after de-duplication each appears once.
fast_food['food_name'].value_counts()

food_name
Grilled Bacon Smokehouse Chicken Sandwich      1
Lobster Roll                                   1
McChicken                                      1
4 piece Buttermilk Crispy Chicken Tenders      1
Premium Asian Salad w Grilled Chicken          1
Chargrilled Chicken Sandwich                   1
4 piece ChicknStrips                           1
6 piece Chicken Nuggets                        1
Chicken Salad Sandwich                         1
Smokehouse BBQ Bacon Sandwich                  1
Super Sonic Bacon Double Cheeseburger          1
Grilled Asiago Caesar Chicken Club Sandwich    1
Small Jumbo Popcorn Chicken                    1
4 Piece Super Crunch Chicken Strips            1
All Beef Chili Cheese Coney                    1
Beef n Cheddar Classic                         1
3 piece PrimeCut Chicken Tenders               1
Roast Turkey Swiss Sandwich                    1
Roast Turkey Ranch Bacon Wrap                  1
Crispy Chicken Farmhouse Salad                 1
GrillBurge

In [45]:
# Work on an independent copy: plain assignment would only give the same
# DataFrame a second name, so the dtype conversion and the `category` column
# added to df1 in the following cells would silently alter `fast_food` as well.
df1 = fast_food.copy()

In [46]:
# Cast every numeric column of df1 to float; non-numeric columns are untouched.
numeric_cols = df1.select_dtypes(include='number').columns
df1[numeric_cols] = df1[numeric_cols].astype(float)

In [47]:
# Label every row of df1 with the constant category "Fast Food" (the column is
# created if it does not exist yet, otherwise it is overwritten).
df1['category'] = "Fast Food"

In [48]:
# Display df1's column labels to confirm that `category` was added.
df1.columns

Index(['restaurant', 'food_name', 'calories_kcal', 'total_fat_g',
       'saturated_fat_g', 'cholesterol_mg', 'sodium_g', 'carbohydrates_g',
       'fiber_g', 'sugar_g', 'protein_g', 'vitamin_A_mg', 'vitamin_C_mg',
       'calcium_mg', 'category'],
      dtype='object')

### Cleaning General_food dataset

In [49]:
import pandas as pd
# Location of the general-food CSV in the project repository.
GENERAL_FOOD_URL = 'https://raw.githubusercontent.com/houleye45/group3_week4_project/main/data/clean/general_food_items_selected%20-%20Sheet1.csv'

# Read the remote CSV into a DataFrame.
general_food = pd.read_csv(GENERAL_FOOD_URL)

In [50]:
# Display the column labels of the general-food data as loaded.
general_food.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'food', 'Caloric Value', 'Fat',
       'Saturated Fats', 'Monounsaturated Fats', 'Polyunsaturated Fats',
       'Carbohydrates', 'Sugars', 'Protein', 'Dietary Fiber', 'Cholesterol',
       'Sodium', 'Water', 'Vitamin A', 'Vitamin B1', 'Vitamin B11',
       'Vitamin B12', 'Vitamin B2', 'Vitamin B3', 'Vitamin B5', 'Vitamin B6',
       'Vitamin C', 'Vitamin D', 'Vitamin E', 'Vitamin K', 'Calcium', 'Copper',
       'Iron', 'Magnesium', 'Manganese', 'Phosphorus', 'Potassium', 'Selenium',
       'Zinc', 'Nutrition Density'],
      dtype='object')

In [51]:
# Normalise every column label: spaces become underscores, then the label is
# lower-cased (e.g. "Caloric Value" -> "caloric_value").
general_food = general_food.rename(columns=lambda label: label.replace(" ", "_").lower())

In [52]:
# Remove the two "Unnamed: ..." columns, addressed by the labels they carry
# after the normalisation step.
general_food = general_food.drop(columns=['unnamed:_0.1', 'unnamed:_0'])

In [53]:
# Mapping from the normalised general-food labels to the column names shared
# with the fast-food table.
new_columns = {
    "food": "food_name",
    "caloric_value": "calories_kcal",
    "fat": "total_fat_g",
    "saturated_fats": "saturated_fat_g",
    "cholesterol": "cholesterol_mg",
    "sodium": "sodium_g",
    "carbohydrates": "carbohydrates_g",
    "dietary_fiber": "fiber_g",
    "sugars": "sugar_g",
    "protein": "protein_g",
    "vitamin_a": "vitamin_A_mg",
    "vitamin_c": "vitamin_C_mg",
    "calcium": "calcium_mg",
}

# Apply the new labels, then keep only the mapped columns, in the order in
# which they appear in the mapping; every other column is discarded.
kept_columns = list(new_columns.values())
general_food = general_food.rename(columns=new_columns)
general_food = general_food[kept_columns]

In [54]:
# Display the remaining column labels after renaming and selection.
general_food.columns

Index(['food_name', 'calories_kcal', 'total_fat_g', 'saturated_fat_g',
       'cholesterol_mg', 'sodium_g', 'carbohydrates_g', 'fiber_g', 'sugar_g',
       'protein_g', 'vitamin_A_mg', 'vitamin_C_mg', 'calcium_mg'],
      dtype='object')

In [55]:
# Count the missing (NaN) entries in each column of general_food.
general_food.isna().sum()

food_name          0
calories_kcal      0
total_fat_g        0
saturated_fat_g    0
cholesterol_mg     0
sodium_g           0
carbohydrates_g    0
fiber_g            0
sugar_g            0
protein_g          0
vitamin_A_mg       0
vitamin_C_mg       0
calcium_mg         0
dtype: int64

In [56]:
# Work on an independent copy: plain assignment would only give the same
# DataFrame a second name, so the `category` column and dtype conversion
# applied to df2 in the following cells would silently alter `general_food`.
df2 = general_food.copy()

In [57]:
# Add the "category" column: every row of df2 is labelled "General Food".
df2['category'] = "General Food"

In [58]:
# Cast the numeric columns of both tables to float so that the two frames have
# matching numeric dtypes before they are combined.
for frame in (df1, df2):
    num_cols = frame.select_dtypes(include='number').columns
    frame[num_cols] = frame[num_cols].astype(float)

### Combining both datasets

In [59]:
# Stack the fast-food rows on top of the general-food rows and renumber the
# index from 0. Columns present in only one frame are filled with NaN for the
# rows of the other frame.
frames = [df1, df2]
df_food = pd.concat(frames, ignore_index=True)

In [60]:
# Display the column labels of the combined table.
df_food.columns

Index(['restaurant', 'food_name', 'calories_kcal', 'total_fat_g',
       'saturated_fat_g', 'cholesterol_mg', 'sodium_g', 'carbohydrates_g',
       'fiber_g', 'sugar_g', 'protein_g', 'vitamin_A_mg', 'vitamin_C_mg',
       'calcium_mg', 'category'],
      dtype='object')

In [61]:
# Count the missing (NaN) entries per column in the combined table.
df_food.isna().sum()

restaurant         114
food_name            0
calories_kcal        0
total_fat_g          0
saturated_fat_g      0
cholesterol_mg       0
sodium_g             0
carbohydrates_g      0
fiber_g              0
sugar_g              0
protein_g            0
vitamin_A_mg         0
vitamin_C_mg         0
calcium_mg           0
category             0
dtype: int64

In [62]:
# Rows that come from the general-food table have no restaurant; label them
# explicitly. The fill is restricted to the `restaurant` column so that a
# missing value in any numeric column is never replaced by this text label
# (which would turn that column into mixed text/number data) and instead stays
# visible in the check below.
df_food = df_food.fillna({'restaurant': 'Not Fast Food'})

# Re-count the missing values per column to confirm the result.
df_food.isna().sum()

restaurant         0
food_name          0
calories_kcal      0
total_fat_g        0
saturated_fat_g    0
cholesterol_mg     0
sodium_g           0
carbohydrates_g    0
fiber_g            0
sugar_g            0
protein_g          0
vitamin_A_mg       0
vitamin_C_mg       0
calcium_mg         0
category           0
dtype: int64

In [63]:
def _strip_registered_mark(column):
    """Return `column` without the '®' character if it has object dtype.

    Columns of any other dtype are returned unchanged.
    """
    if column.dtype != "object":
        return column
    # regex=False: treat '®' as a literal character, not as a pattern.
    return column.str.replace('®', '', regex=False)


# Apply the clean-up column by column.
df_food = df_food.apply(_strip_registered_mark)

In [64]:
# Write the combined table to disk as CSV, with the row index as first column.
# NOTE(review): the index is written without a label, which readers such as
# pd.read_csv load as an "Unnamed: 0" column, and the file name carries no
# ".csv" extension - confirm that downstream consumers expect both.
output_path = 'selected_food_cleaned'
df_food.to_csv(output_path, index=True)