In [1987]:
import pandas as pd
import numpy as np
from scipy.stats import norm
import math

# Load the nutrition table and report how many rows it contains
df = pd.read_csv('NutritionFacts.csv')
print(len(df))

# Calories and Grams are read as text containing commas; strip the commas
# and convert the result to floating point numbers
for text_column in ('Calories', 'Grams'):
    df[text_column] = df[text_column].str.replace(',', '').astype(float)

# Protein and Fat only need a cast to floating point
for numeric_column in ('Protein', 'Fat'):
    df[numeric_column] = df[numeric_column].astype(float)

# Replace every NaN in the DataFrame with 0 (the original author notes blank
# values in the calories and sat.fat columns that would otherwise cause issues)
df = df.fillna(0)


333


### Question 1 
Consider the peanut butter row.
Order the nutritional contents by units/gram from largest to smallest.


In [1988]:
# Index label of the first row whose Food is exactly 'Peanut butter'
butter_index = df[df['Food'] == 'Peanut butter'].index[0]
# Show the row to verify the lookup. `.index[0]` yields an index *label*, so
# select with label-based `.loc` rather than positional `.iloc`; the two only
# coincide while the frame keeps its default 0..n-1 index.
df.loc[butter_index]

Food         Peanut butter
Grams                 50.0
Calories             300.0
Protein               12.0
Fat                   25.0
Sat.Fat               17.0
Fiber                  0.9
Carbs                  9.0
Category    Seeds and Nuts
Name: 246, dtype: object

In [1989]:
# Rank the peanut butter row's nutritional contents from largest to smallest.
# All values belong to one row (one Grams value), so ordering the raw amounts
# gives the same order as ordering the per-gram amounts.
nutrient_columns = ['Fiber', 'Protein', 'Fat', 'Carbs', 'Sat.Fat']
peanut_butter_nutrients = df.loc[butter_index, nutrient_columns]
peanut_butter_nutrients.sort_values(ascending=False)

Fat        25.0
Sat.Fat    17.0
Protein    12.0
Carbs       9.0
Fiber       0.9
Name: 246, dtype: object

### Question 2 
 
Which food has the maximum calories per gram (lowercase please)?


In [1990]:
# Add a calories-per-gram column, then display the row where it peaks
df['Calories_per_gram'] = df['Calories'] / df['Grams']
densest_food_label = df['Calories_per_gram'].idxmax()
df.loc[densest_food_label]

Food                                    Lard
Grams                                  110.0
Calories                               992.0
Protein                                  0.0
Fat                                    110.0
Sat.Fat                                 92.0
Fiber                                    0.0
Carbs                                    0.0
Category             Fats, Oils, Shortenings
Calories_per_gram                   9.018182
Name: 117, dtype: object

### Question 3
Find the two most imbalanced units/gram (i.e., the ones with the largest max and the smallest max).  The ratio from the largest to the smallest (to three decimal places) is:

In [1991]:
# Compare the per-gram maxima of every nutritional measure.
columns_to_normalize = ['Calories', 'Fiber', 'Protein', 'Fat', 'Carbs', 'Sat.Fat']

# Convert each measure to units per gram by dividing by 'Grams'.
# The columns are overwritten in place and later cells rely on the per-gram
# values, so a flag stored on the frame makes the step idempotent: re-running
# this cell must not divide by Grams a second time. Re-reading the CSV creates
# a fresh frame without the flag, so the normalization is applied again then.
if not df.attrs.get('per_gram_normalized', False):
    for column in columns_to_normalize:
        df[column] = df[column] / df['Grams']
    df.attrs['per_gram_normalized'] = True

# Largest per-gram value observed for each measure
max_values = df[['Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']].max()

# The two most imbalanced measures: the biggest and the smallest of those maxima
largest_max = max_values.max()
smallest_max = max_values.min()

# Ratio from the largest to the smallest, to three decimal places
imbalanced_ratio = round(largest_max / smallest_max, 3)

# Output the result
imbalanced_ratio


25.436

### Question 4
The distance from macaroni to cheddar cheese (to three decimal places) is:

In [1992]:
# Index labels of the first rows named exactly 'Macaroni' and 'Cheddar'
macaroni_index = df[df['Food'] == 'Macaroni'].index[0]
cheddar_index = df[df['Food'] == 'Cheddar'].index[0]

# Measures that make up the distance (Calories, Protein, Fat, Sat.Fat, Fiber, Carbs)
columns = ['Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

# Euclidean distance: square root of the summed squared differences,
# following the Excel formula logic from the NHL example
squared_gaps = (
    (df.at[macaroni_index, column] - df.at[cheddar_index, column]) ** 2
    for column in columns
)
distance = math.sqrt(sum(squared_gaps, 0.0))

print(f"Distance between macaroni and cheddar: {distance:.3f}")

Distance between macaroni and cheddar: 3.060


### Question 5
The point of KNN is to invoke a democratic principle when classifying an item.  Brussels Sprouts are a vegetable.  What is the largest k such that for all n <= k, the majority of the votes are for vegetable, but for k + 1, the vote does not have a majority for vegetable?

For the purposes of this question, a majority is more than 50% of the votes.

In [1993]:
# Distance formula replicated from the NHL example
def Distance_formula(row, reference_row, columns_to_use):
    """Return the Euclidean distance between two rows.

    Parameters
    ----------
    row, reference_row
        Mappings (for example pandas Series) indexed by column name.
    columns_to_use
        Iterable of column names that contribute to the distance.

    Returns
    -------
    The square root of the summed squared differences over `columns_to_use`.
    """
    total = 0
    for col in columns_to_use:
        gap = row[col] - reference_row[col]
        total += gap ** 2
    return np.sqrt(total)

# Index label of the first row named exactly 'Brussels sprouts'
N1 = df[df['Food'] == 'Brussels sprouts'].index[0]

# Extract the reference row. N1 is an index label, so use label-based `.loc`
# instead of positional `.iloc` (they only agree on a default 0..n-1 index).
reference_row = df.loc[N1]

# Measures that feed the distance formula
columns_to_use = ['Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

# Distance from every row to the reference row (the reference itself gets 0)
df['Distance_Formula_Result'] = df.apply(Distance_formula, axis=1, reference_row=reference_row, columns_to_use=columns_to_use)

# Rank rows by distance, smallest first; with pandas' default ranking method
# tied distances share the average of their ranks
df['Rank'] = df['Distance_Formula_Result'].rank()
df.head()

Unnamed: 0,Food,Grams,Calories,Protein,Fat,Sat.Fat,Fiber,Carbs,Category,Calories_per_gram,Distance_Formula_Result,Rank
0,Biscuits,38.0,3.421053,0.078947,0.105263,0.078947,0.0,0.473684,"Breads, cereals, fastfood,grains",3.421053,2.987094,260.0
1,Bran flakes,25.0,4.68,0.12,0.0,0.0,0.004,1.28,"Breads, cereals, fastfood,grains",4.68,4.3831,304.0
2,"Bread, cracked wheat",23.0,2.608696,0.086957,0.043478,0.043478,0.004348,0.521739,"Breads, cereals, fastfood,grains",2.608696,2.19094,228.0
3,Rye,23.0,2.391304,0.086957,0.043478,0.043478,0.004348,0.521739,"Breads, cereals, fastfood,grains",2.391304,1.978365,214.0
4,"White, 20 slices, or",454.0,2.698238,0.085903,0.03304,0.026432,0.019824,0.504405,"Breads, cereals, fastfood,grains",2.698238,2.275097,232.0


In [1994]:
# Every distinct value of the 'Category' column, in order of first appearance
unique_categories = df['Category'].unique()
k = 6 # attempt k = 1 to k = 10 until percentage in output DF = 50% for vegetables
x = k + 1

# Replicates Excel's countifs(): per category, count the rows ranked within
# the first k + 1 and subtract the rows ranked exactly 1 (the target row
# itself), leaving the votes of the k nearest neighbours.
# NOTE: this reads whatever 'Rank' column is currently stored in df.
ranked_within_cutoff = df['Rank'] <= x
ranked_first = df['Rank'] == 1

results = []
for category in unique_categories:
    in_category = df['Category'] == category
    votes_with_target = int((ranked_within_cutoff & in_category).sum())
    target_rows = int((ranked_first & in_category).sum())
    results.append(votes_with_target - target_rows)

# One row per category with its vote count
category_counts = pd.DataFrame({'Category': unique_categories, 'Result': results})

# Share of the votes per category, used to find the largest k where Vegetables = 50%
total_votes = category_counts['Result'].sum()
category_counts['Percentage'] = (category_counts['Result'] / total_votes) * 100

category_counts

Unnamed: 0,Category,Result,Percentage
0,"Breads, cereals, fastfood,grains",1,16.666667
1,Dairy products,0,0.0
2,"Desserts, sweets",0,0.0
3,"Drinks,Alcohol, Beverages",0,0.0
4,"Fats, Oils, Shortenings",0,0.0
5,"Fish, Seafood",0,0.0
6,Fruits,2,33.333333
7,"Jams, Jellies",0,0.0
8,"Meat, Poultry",0,0.0
9,Seeds and Nuts,0,0.0


### Question 6
The rule of thumb for the number of neighbors, k, that we should consult in a KNN process is defined by the length of the dataset in question.  What is the appropriate number for k with this dataset (truncate the value)?

In [1995]:
# Rule of thumb used here: k is half the square root of the number of rows
num_data_points = df.shape[0]
root_of_size = math.sqrt(num_data_points)
k_approximate = root_of_size / 2
k_approximate

9.12414379544733

### Question 7 
Tomatoes are the quintessential confusing vegetable because they are technically a fruit by definition.

For what value of k do we find the first fruit similar to tomatoes?

In [1996]:
# Index label of the first row named exactly 'Tomatoes'
tomatoes_row = df[df['Food'] == 'Tomatoes'].index[0]

# Reference row for tomatoes; the value above is a label, so select with `.loc`
reference_tomatoes = df.loc[tomatoes_row]

# Distances and ranks do not depend on k, so compute them once instead of on
# every pass of a search loop.
# NOTE: this overwrites the 'Rank' column written by the Brussels sprouts cell.
df['Similarity'] = df.apply(Distance_formula, axis=1, reference_row=reference_tomatoes, columns_to_use=columns_to_use)
df['Rank'] = df['Similarity'].rank()

# Ranks of the fruit rows only; rows without a rank can never match a cutoff
fruit_ranks = df.loc[df['Category'] == 'Fruits', 'Rank'].dropna()
if fruit_ranks.empty:
    # The step-by-step search would never terminate in this situation
    raise ValueError("No ranked rows in the 'Fruits' category; cannot find a nearest fruit.")

# Smallest whole-number cutoff (starting from 1) that admits at least one fruit
k = max(1, math.ceil(fruit_ranks.min()))
similar_fruits = df[(df['Rank'] <= k) & (df['Category'] == 'Fruits')]
first_fruit = similar_fruits.iloc[0]['Food']

# Rank 1 is the tomatoes row itself (distance 0), so the neighbour count is k - 1
print(f"k = {k-1}")

k = 12


### Question 8
Which measures have an average value per gram of greater than 0 for every group?

In [1997]:
# Split the rows by 'Category'
grouped = df.groupby('Category')

# Measures to examine
columns_to_check = ['Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

# Mean of each measure within each category (rows: categories, columns: measures)
category_means = grouped[columns_to_check].mean()

# Keep the measures whose mean is strictly positive in every single category
measures_with_avg_gt_0 = [
    measure
    for measure in columns_to_check
    if (category_means[measure] > 0).all()
]

print("Measures with an average value per gram greater than 0 for every group:")
print(measures_with_avg_gt_0)


Measures with an average value per gram greater than 0 for every group:
['Calories', 'Carbs']


### Question 9
For each group, choose the measure that has the highest standard deviation.

In [1998]:
# Split the rows by 'Category'
grouped = df.groupby('Category')

# Measures to compare (Calories is not part of this comparison)
columns_to_check = ['Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

# Maps each category to the measure with the largest standard deviation
highest_std_measures = {}

for name, group in grouped:
    # Standard deviation of every candidate measure inside this category
    spread_by_measure = {measure: group[measure].std() for measure in columns_to_check}

    # Walk the measures in order; only a strictly larger spread replaces the
    # current winner, so the first measure wins a tie. A spread that does not
    # compare greater than -1 (such as NaN) leaves the winner as None.
    winner, winner_spread = None, -1
    for measure, spread in spread_by_measure.items():
        if spread > winner_spread:
            winner, winner_spread = measure, spread

    highest_std_measures[name] = winner

print("Measure with the highest standard deviation for each group:")
for name, measure in highest_std_measures.items():
    print(f"Group: {name}, Measure: {measure}")


Measure with the highest standard deviation for each group:
Group: Breads, cereals, fastfood,grains, Measure: Carbs
Group: Dairy products, Measure: Carbs
Group: Desserts, sweets, Measure: Carbs
Group: Drinks,Alcohol, Beverages, Measure: Carbs
Group: Fats, Oils, Shortenings, Measure: Sat.Fat
Group: Fish, Seafood, Measure: Protein
Group: Fruits, Measure: Carbs
Group: Jams, Jellies, Measure: Fiber
Group: Meat, Poultry, Measure: Fat
Group: Seeds and Nuts, Measure: Sat.Fat
Group: Soups, Measure: Carbs
Group: Vegetables, Measure: Carbs


### Question 10
Which food has the highest vegetable Naive Bayes classifier numerator?
### Question 11
Of the fruits, which fruit has the lowest Naive Bayes classifier numerator?

In [1999]:
# Measures used as Naive Bayes features
columns_to_analyze = ['Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

# Per-category mean and standard deviation of every feature, with flattened
# column names such as 'Calories_mean' and 'Calories_std'
conditional_stats = df.groupby('Category')[columns_to_analyze].agg(['mean', 'std'])
conditional_stats.columns = [f"{col}_{stat}" for col, stat in conditional_stats.columns]

# Class prior for every category: rows in the category / rows in the dataset.
# Previously only 'Fruits' rows were scaled (by len(df)) and every other
# category received no prior at all, so the stored numerators were not
# comparable across categories.
category_priors = df['Category'].value_counts() / len(df)

# Naive Bayes numerator of every food with respect to its own category
naive_bayes_numerators = []

for _, row in df.iterrows():
    category = row['Category']
    numerator = 1.0

    # Multiply the normal densities of all features (the "naive" independence assumption)
    for col in columns_to_analyze:
        mean = conditional_stats.loc[category, f"{col}_mean"]
        std = conditional_stats.loc[category, f"{col}_std"]
        value = row[col]

        # Density of the observed value under the category's normal distribution.
        # NOTE(review): a category whose std is 0 for a feature makes scipy emit
        # a RuntimeWarning here — confirm how such rows should be treated.
        pdf_value = norm.pdf(value, loc=mean, scale=std)

        numerator *= pdf_value

    # Weight the likelihood by the prior of the row's category
    numerator *= category_priors[category]

    naive_bayes_numerators.append(numerator)

# Add the Naive Bayes numerators to the DataFrame
df['Naive_Bayes_Numerator'] = naive_bayes_numerators

# Vegetable row with the highest numerator
vegetable_max_row = df.loc[df[df['Category'] == 'Vegetables']['Naive_Bayes_Numerator'].idxmax()]

# Fruit row with the lowest numerator
lowest_fruit_row = df.loc[df[df['Category'] == 'Fruits']['Naive_Bayes_Numerator'].idxmin()]

# Print the results
print(f"Q10: Corresponding Vegetable: {vegetable_max_row['Food']}")
print(f"Q11: Corresponding Fruit: {lowest_fruit_row['Food']}")

Q10: Corresponding Vegetable: heated peas
Q11: Corresponding Fruit: OlivesRipe


  x = np.asarray((x - loc)/scale, dtype=dtyp)


### Question 12
Using a Naive Bayes Classifier Process, you are trying to determine if it is more likely that a random food is a fruit or a vegetable.  Custard has a Naive Bayes numerator (three decimal places) of ________ for fruit and a Naive Bayes numerator of ________ for vegetable,  therefore we would conclude that it is more likely that custard is a ________ than a ________.

In [2000]:
# Measures used as Naive Bayes features
columns_to_analyze = ['Calories', 'Protein', 'Fat', 'Sat.Fat', 'Fiber', 'Carbs']

# The food item under examination: rows named exactly 'Custard'
custard = df[df['Food'] == 'Custard']

# Both numerators start at 1 and accumulate one density per feature
fruit_numerator = 1.0
vegetable_numerator = 1.0

for col in columns_to_analyze:
    observed = custard[col].values[0]  # value from the first Custard row
    mean_key = f"{col}_mean"
    std_key = f"{col}_std"

    # Density of the observed value under the Fruits distribution
    fruit_numerator *= norm.pdf(
        observed,
        loc=conditional_stats.loc['Fruits', mean_key],
        scale=conditional_stats.loc['Fruits', std_key],
    )

    # Density of the observed value under the Vegetables distribution
    vegetable_numerator *= norm.pdf(
        observed,
        loc=conditional_stats.loc['Vegetables', mean_key],
        scale=conditional_stats.loc['Vegetables', std_key],
    )

# Weight each likelihood by its class prior: rows in the class / rows in the dataset
dataset_size = len(df)
fruit_numerator *= len(df[df['Category'] == 'Fruits']) / dataset_size
vegetable_numerator *= len(df[df['Category'] == 'Vegetables']) / dataset_size

# The class with the larger numerator wins; anything else falls to Vegetable
if fruit_numerator > vegetable_numerator:
    result = "Fruit"
else:
    result = "Vegetable"

# Print the results rounded to 3 decimal points
print(f"Fruit Naive Bayes Numerator for Custard: {fruit_numerator:.3f}")
print(f"Vegetable Naive Bayes Numerator for Custard: {vegetable_numerator:.3f}")
print(f"Therefore, we would conclude that it is more likely that Custard is a {result}.")

Fruit Naive Bayes Numerator for Custard: 0.305
Vegetable Naive Bayes Numerator for Custard: 2162.043
Therefore, we would conclude that it is more likely that Custard is a Vegetable.


### Question 13
Of the 70 vegetables in the dataset, how many of them are predicted to be more likely vegetables than fruits using the Naive Bayes Classifier?

In [2001]:
# Rows whose category is 'Vegetables'
vegetables = df[df['Category'] == 'Vegetables']

# Class priors (rows in the class / rows in the dataset). They are the same
# for every food, so compute them once instead of re-filtering df per row.
fruit_prior = len(df[df['Category'] == 'Fruits']) / len(df)
vegetable_prior = len(vegetables) / len(df)

# Number of vegetables the classifier rates as more likely vegetable than fruit
predicted_vegetables_count = 0

for _, vegetable in vegetables.iterrows():
    # Both numerators start at 1 and accumulate one density per feature
    fruit_numerator = 1.0
    vegetable_numerator = 1.0

    for col in columns_to_analyze:
        value = vegetable[col]

        # Density of the value under the Fruits distribution
        pdf_value_fruits = norm.pdf(value, loc=conditional_stats.loc['Fruits', f"{col}_mean"], scale=conditional_stats.loc['Fruits', f"{col}_std"])
        fruit_numerator *= pdf_value_fruits

        # Density of the value under the Vegetables distribution
        pdf_value_vegetables = norm.pdf(value, loc=conditional_stats.loc['Vegetables', f"{col}_mean"], scale=conditional_stats.loc['Vegetables', f"{col}_std"])
        vegetable_numerator *= pdf_value_vegetables

    # Weight each likelihood by its class prior
    fruit_numerator *= fruit_prior
    vegetable_numerator *= vegetable_prior

    # Count the row only when the vegetable numerator is strictly larger
    if vegetable_numerator > fruit_numerator:
        predicted_vegetables_count += 1

print(f"Number of vegetables predicted to be more likely vegetables than fruits: {predicted_vegetables_count}")


Number of vegetables predicted to be more likely vegetables than fruits: 61


### Question 14
To the tenths place, what is the percentage accuracy of the Naive Bayes classifier between Fruits and Vegetables (i.e., between the two options, it calls a fruit a fruit and a vegetable a vegetable).  Put the percent sign in your answer.

In [2002]:
# Rows whose category is either 'Fruits' or 'Vegetables'
fruits_and_vegetables = df[df['Category'].isin(['Fruits', 'Vegetables'])]

# Class priors (rows in the class / rows in the whole dataset). They are the
# same for every food, so compute them once instead of re-filtering df per row.
fruit_prior = len(df[df['Category'] == 'Fruits']) / len(df)
vegetable_prior = len(df[df['Category'] == 'Vegetables']) / len(df)

# Number of foods whose predicted category equals the actual one
correct_predictions = 0

for _, food_item in fruits_and_vegetables.iterrows():
    # Both numerators start at 1 and accumulate one density per feature
    fruit_numerator = 1.0
    vegetable_numerator = 1.0

    for col in columns_to_analyze:
        value = food_item[col]

        # Density of the value under the Fruits distribution
        pdf_value_fruits = norm.pdf(value, loc=conditional_stats.loc['Fruits', f"{col}_mean"], scale=conditional_stats.loc['Fruits', f"{col}_std"])
        fruit_numerator *= pdf_value_fruits

        # Density of the value under the Vegetables distribution
        pdf_value_vegetables = norm.pdf(value, loc=conditional_stats.loc['Vegetables', f"{col}_mean"], scale=conditional_stats.loc['Vegetables', f"{col}_std"])
        vegetable_numerator *= pdf_value_vegetables

    # Weight each likelihood by its class prior
    fruit_numerator *= fruit_prior
    vegetable_numerator *= vegetable_prior

    # Fruits wins only with a strictly larger numerator; otherwise Vegetables
    predicted_category = 'Fruits' if fruit_numerator > vegetable_numerator else 'Vegetables'

    # Count the prediction when it matches the actual category
    if predicted_category == food_item['Category']:
        correct_predictions += 1

# Share of correct predictions, as a percentage
total_items = len(fruits_and_vegetables)
accuracy = (correct_predictions / total_items) * 100

# Print accuracy to one decimal place
print(f"Percentage accuracy of the Naive Bayes classifier: {accuracy:.1f}%")


Percentage accuracy of the Naive Bayes classifier: 76.6%
