In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the cereal dataset. read_csv already returns a DataFrame, so it does
# not need to be wrapped in pd.DataFrame again.
cereals = pd.read_csv("cereal.csv")
cereals



Your challenge is to plot the calories of each cereal against their sugar content.

You will notice there are missing values. Choose the most appropriate method for filling or removing those values so as not to skew your results.

We will use this cleaned data for visualising in the next Chapter!

In [None]:
# Does the dataset contain any missing values at all?
# The result is printed explicitly: a bare expression that is not the last
# line of a cell is evaluated and then silently discarded.
print(f"Any missing values: {cereals.isnull().values.any()}")

# Which columns contain the missing values?
# One boolean per column, computed in a single pass over the DataFrame.
missing_by_column = cereals.isnull().any()
for column, has_missing in missing_by_column.items():
    # Every column is listed (not only the incomplete ones), so the label is
    # a neutral "Column:" rather than "Missing column:".
    print(f"Column: {column}")
    print(f"Is missing values: {has_missing} \n")

In [None]:
# With only a handful of incomplete rows it would be reasonable to drop them;
# otherwise another way of filling them is needed. List the affected rows
# first so the decision can be made.
columns_of_interest = ['name', 'calories', 'sugars']

# Rows whose calories value is missing.
calories_is_missing = cereals['calories'].isnull()
cereals_missing_calories = cereals.loc[calories_is_missing, columns_of_interest]

# Rows whose sugars value is missing.
sugars_is_missing = cereals['sugars'].isnull()
cereals_missing_sugars = cereals.loc[sugars_is_missing, columns_of_interest]

print(cereals_missing_calories)
print(cereals_missing_sugars)
    

Looking at the above, I think the most sensible way to fill the values is by calculating the mean.
So for Cheerios, to fill in the calories I would calculate the mean calories for all cereals with 1.0 sugar content. Then the same for Cornflakes, Honey Nut, etc. For each one I will calculate the mean calories for cereals that have its corresponding sugar content.

I will follow the same method for the cereals missing sugars. So for the cereals with 120 calories, I will calculate the mean sugar content for cereals with 120 calories, and for 110 calories I will calculate accordingly.

In [None]:
#Lets fill Cheerios with a random value, as I can't do a mean.

# Fill each missing calories value with the mean calories of the cereals that
# share the same sugar content.
#
# Each mean is computed exactly once with a boolean mask. Looping over every
# row is unnecessary: it recalculates the same mean for every matching row and
# leaves the variable undefined when no row has that sugar content.

# Sugar level 1.0 gets a fixed fallback of 110 calories because a mean could
# not be calculated for it.
# NOTE(review): presumably the only cereal at this sugar level is the one
# whose calories are missing - confirm against the data.
average_calories_1 = 110
average_calories_2 = cereals.loc[cereals['sugars'] == 2.0, 'calories'].mean()
average_calories_10 = cereals.loc[cereals['sugars'] == 10.0, 'calories'].mean()
average_calories_3 = cereals.loc[cereals['sugars'] == 3.0, 'calories'].mean()

print(average_calories_1)
print(average_calories_2)
print(average_calories_10)
print(average_calories_3)

# Only rows with missing calories are overwritten. The mask is taken once,
# which is safe because the four sugar levels select disjoint sets of rows.
missing_calories = cereals['calories'].isnull()
cereals.loc[(cereals['sugars'] == 1.0) & missing_calories, 'calories'] = average_calories_1
cereals.loc[(cereals['sugars'] == 2.0) & missing_calories, 'calories'] = average_calories_2
cereals.loc[(cereals['sugars'] == 10.0) & missing_calories, 'calories'] = average_calories_10
cereals.loc[(cereals['sugars'] == 3.0) & missing_calories, 'calories'] = average_calories_3


In [None]:
# Fill each missing sugars value with the mean sugar content of the cereals
# that have the same calorie count.
#
# Each mean is computed exactly once with a boolean mask. Looping over every
# row is unnecessary: it recalculates the same mean for every matching row and
# leaves the variable undefined when no row has that calorie count.
# Series.mean() skips missing values by default, so the rows being filled do
# not distort the averages.
average_sugars_110 = cereals.loc[cereals['calories'] == 110, 'sugars'].mean()
average_sugars_120 = cereals.loc[cereals['calories'] == 120, 'sugars'].mean()

print(average_sugars_110)
print(average_sugars_120)

# Only rows with missing sugars are overwritten. The mask is taken once,
# which is safe because the two calorie counts select disjoint sets of rows.
missing_sugars = cereals['sugars'].isnull()
cereals.loc[(cereals['calories'] == 110) & missing_sugars, 'sugars'] = average_sugars_110
cereals.loc[(cereals['calories'] == 120) & missing_sugars, 'sugars'] = average_sugars_120


In [None]:
# Show the first 35 rows of the DataFrame.
cereals.head(35)

In [None]:
# Show the rows at positions 35 through 76 (the slice end, 77, is exclusive),
# with every column.
cereals.iloc[35:77, :]

# Now that I have filled in the missing values, the challenge is to plot the calories of each cereal against their sugar content.

In [None]:
# Plot every cereal as one point: calories along x, sugar content along y.
calorie_values = cereals['calories']
sugar_values = cereals['sugars']

plt.scatter(x=calorie_values, y=sugar_values)
plt.title("Calories vs Sugars")
plt.xlabel('Calories')
plt.ylabel('Sugars')
plt.show()

In [None]:
# Correlation coefficient between the sugars and calories columns
# (Series.corr uses the Pearson method unless told otherwise).
sugar_column = cereals['sugars']
calorie_column = cereals['calories']
sugar_vs_calorie_corr = sugar_column.corr(calorie_column)

print(f"The correlation between sugar and calories is: {sugar_vs_calorie_corr}")

The question is, does the sugar content influence the calorie count? 

Looking at the above visualisation it is clear there is a positive correlation. This indicates that as the sugar content increases, so does the calorie count, and vice versa.

# Which are the top 4 cereals for vitamin content? Visualise your conclusion.

In [None]:
# Restrict the data to the columns needed for the vitamin ranking.
group_by = cereals.loc[:, ['name', 'mfr', 'vitamins']]

# Order by vitamins, highest first, and keep the first four rows.
vitamins_sorted = group_by.sort_values(by='vitamins', ascending=False)
vitamin_content_rank = vitamins_sorted.head(4)

# Relabel the rows 1..4 so the index reads as a rank.
vitamin_content_rank.index = pd.RangeIndex(start=1, stop=len(vitamin_content_rank) + 1)
vitamin_content_rank

To visualise the top 4 cereals with vitamin content in the DataFrame, I have taken the following steps:
1. I selected the name, manufacturer and vitamins columns from the cereals DataFrame.
2. I created a variable that stores the filtered DataFrame and sorts the values by vitamins in descending order, then filtered this further to only see the first 4.
3. In order to have my table ordered from 1 to 4, I re-indexed my filtered DataFrame using this code: 'range(1, len(vitamin_content_rank) + 1)'. This generates a range of numbers from 1 to the length of my results. Because a range stops just before its end value, adding 1 ensures that the range includes the upper bound.

In [None]:
# Bar chart of the four cereals ranked highest for vitamin content.
x = vitamin_content_rank['name']
y = vitamin_content_rank['vitamins']

plt.bar(x, y)
plt.xlabel("Cereal Name")
plt.ylabel("Vitamin content")
# Title added for consistency with the other charts in this notebook.
plt.title("Vitamins in Cereal")
# Tilt the cereal names slightly (presumably to keep long labels apart).
plt.xticks(rotation=-10)
# Render the figure explicitly, as the other chart cells do, so the chart is
# finished here and the cell does not end on the return value of xticks().
plt.show()

As you can see, I've used a bar graph to show the top 4 cereals with vitamin content. Although I'm not sure how helpful this is given that they all have the same value.

# Which are the top 4 cereals for protein? Visualise your conclusion.

In [None]:
# Restrict the data to the columns needed for the protein ranking.
group_by = cereals.loc[:, ['name', 'mfr', 'protein']]

# Order by protein, highest first, and keep the first four rows.
# NOTE: the result reuses the name `vitamin_content_rank` even though it now
# holds protein data; the protein chart cell reads it under this name.
protein_sorted = group_by.sort_values(by='protein', ascending=False)
vitamin_content_rank = protein_sorted.head(4)

# Relabel the rows 1..4 so the index reads as a rank.
vitamin_content_rank.index = pd.RangeIndex(start=1, stop=len(vitamin_content_rank) + 1)
vitamin_content_rank

Here I have used the same code as I did when visualising the vitamin content, except this is sorted by protein.
The numbers here are varied so I believe the easiest chart to visualise this would be a bar chart. See below:

In [None]:
# Bar chart of the protein ranking, with one fill colour per bar.
# `vitamin_content_rank` is expected to hold the protein ranking here, since
# the 'protein' column is read from it.
colors = ['orange', 'green', 'pink', 'blue']
x = vitamin_content_rank['name']
y = vitamin_content_rank['protein']

plt.bar(x, y, color=colors)
plt.title("Protein in Cereal")
plt.xlabel("Cereal Name")
plt.ylabel("Protein content")
plt.show()

# Does Sodium content positively or negatively affect overall ranking? Visualise your conclusion.

In [None]:
# Keep only the columns relevant to the sodium/rating comparison.
# (The bare `cereals.columns` expression that used to open this cell was
# dropped: it was not the last line of the cell, so its value was discarded.)
group_by_sodium = cereals[['name', 'mfr', 'sodium', 'rating']]

# Order the cereals from the highest sodium content to the lowest.
sodium_vs_rating = group_by_sodium.sort_values(by='sodium', ascending=False)

# Relabel the rows 1..n so the index reads as a sodium rank.
sodium_vs_rating.index = range(1, len(sodium_vs_rating) + 1)
sodium_vs_rating


As you can see above, I have filtered the DataFrame and sorted by sodium content. The top five cereals with the highest sodium content have lower ratings than the bottom five with the lowest sodium content.

Below I'm going to plot the sodium content and rating on the same chart to show if sodium positively or negatively affects rating.

In [None]:
# Line chart of sodium and rating per cereal on two y-axes that share one
# x-axis, because the two columns are plotted on separate scales.
x = sodium_vs_rating['name']
y1 = sodium_vs_rating['sodium']
y2 = sodium_vs_rating['rating']

fig, ax1 = plt.subplots()

# Second y-axis (for the rating) sharing the cereal x-axis with ax1.
ax2 = ax1.twinx()
ax1.plot(x, y1, 'g-')
ax2.plot(x, y2, 'b-')

ax1.set_xlabel('Cereal')
# Each axis label takes the colour of the line it describes: sodium is drawn
# in green ('g-') and rating in blue ('b-').
ax1.set_ylabel('Sodium content', color='g')
ax2.set_ylabel('Rating', color='b')

# Render the figure explicitly, as the other chart cells do, so later
# plt.* calls start from a fresh figure.
plt.show()


In [None]:
# Draw sodium and rating as two bar series over the same cereal positions.
# x, y1 and y2 are the values left behind by the preceding cell.
bar_series = (
    (y1, {"label": "Sodium"}),
    (y2, {"width": 0.4, "label": "Rating"}),
)
for heights, bar_options in bar_series:
    plt.bar(x, heights, **bar_options)

plt.title("Sodium content vs Rating")
plt.xlabel("Cereals")
plt.ylabel("Amount")
# Turn the cereal names vertical.
plt.xticks(rotation=-90)

plt.legend(loc="upper right")
plt.show()

Out of the two graphs I've plotted, the second one is better visually as I can see them against each other. Based on my finding it seems that the rating fluctuates so it is difficult to tell whether lower sodium content means increased ratings. Though if you look closely you can see that the ratings do increase a little towards the lower end of the sodium content.

The names of the cereals all being muddled together makes it look terrible. So below I'll make a graph that only looks at the top 15 and another graph that looks at the bottom 15.

In [None]:
# First 15 rows of the sodium-sorted table.
top_15 = sodium_vs_rating.head(15)
x = top_15['name']
y1 = top_15['sodium']
y2 = top_15['rating']

# Sodium and rating as two bar series over the same cereal positions.
plt.bar(x, y1, label="Sodium")
plt.bar(x, y2, label="Rating", width=0.4)

plt.title("Sodium content vs Rating")
plt.xlabel("Cereals")
plt.ylabel("Amount")
# Turn the cereal names vertical.
plt.xticks(rotation=-90)

plt.legend(loc="upper right")
plt.show()

Now one that looks at the bottom 15.

In [None]:
# Last 15 rows of the sodium-sorted table. tail(15) replaces the hard-coded
# slice 62:77, which only selected the last 15 rows when the table had
# exactly 77 rows.
bottom_15 = sodium_vs_rating.tail(15)
x = bottom_15['name']
y1 = bottom_15['sodium']
y2 = bottom_15['rating']

# Sodium and rating as two bar series over the same cereal positions.
plt.bar(x, y1, label="Sodium")
plt.bar(x, y2, width=0.4, label="Rating")

plt.xlabel("Cereals")
plt.ylabel("Amount")
# Turn the cereal names vertical.
plt.xticks(rotation=-90)
# Fixed y ticks at 0, 50, ..., 250.
plt.yticks(np.arange(0, 300, step=50))

plt.title("Sodium content vs Rating")
plt.legend(loc="upper right")
plt.show()

# Conclusion

Overall, I would say that Sodium content negatively affects Rating