In [1]:
library(knitr)
# Default knitr chunk options: centered figures with out.width set to 75%.
opts_chunk$set(fig.align = "center", out.width = "75%")


library(tidyverse)
library(lubridate)
library(extrafont)
library(scales)
library(ggridges)
library(corrplot)
library(Hmisc)
library(reshape2)



# Use the light ggplot2 theme with the "Impact" font family for every plot.
theme_set(
  theme_light() +
    theme(text = element_text(family = "Impact"))
)

# Read the raw recipe table and the raw interaction (review) table.
recipes <- read_csv(
  "../input/food-com-recipes-and-user-interactions/RAW_recipes.csv"
)
reviews <- read_csv(
  "../input/food-com-recipes-and-user-interactions/RAW_interactions.csv"
)

# One row per matching (recipe, review) pair; the inner join drops recipes
# that have no review and reviews that have no recipe.
food <- recipes %>%
  inner_join(reviews, by = c("id" = "recipe_id"))

# Inspect the structure and the first ten rows of the joined table.
str(food)
head(food, n = 10)

In [2]:

# Derive month (as a label) and year columns from the submission date and the
# review date, then drop the two original date columns.
food2 <- food %>%
  mutate(
    submission_month = month(submitted, label = TRUE),
    submission_year = year(submitted),
    review_month = month(date, label = TRUE),
    review_year = year(date)
  ) %>%
  select(-submitted, -date)

# Preview the derived columns for the first ten rows.
food2 %>%
  select(
    name,
    submission_month,
    submission_year,
    review_month,
    review_year
  ) %>%
  head(n = 10)

In [3]:
# Split the bracketed, comma-separated `nutrition` string into seven named
# columns. The separator also swallows any whitespace after each comma and
# `convert = TRUE` type-converts the new columns, so the values are stored as
# numbers rather than as (possibly space-padded) strings that every later
# step would have to coerce.
food3 <- food2 %>%
  mutate(nutrition = str_replace_all(nutrition, "\\[|\\]", "")) %>%
  separate(
    nutrition,
    into = c(
      "calories", "total_fat", "sugar", "sodium",
      "protein", "sat_fat", "carbs"
    ),
    sep = ",\\s*",
    convert = TRUE
  )

# Preview the distinct nutrition profiles of the first recipes.
food3 %>%
  select(name, calories, total_fat, sugar, sodium, protein, sat_fat, carbs) %>%
  distinct() %>%
  head(10)

In [4]:
# Density of `minutes`, using one row per distinct (id, name, minutes).
food3 %>%
  distinct(id, name, minutes) %>%
  ggplot(mapping = aes(x = minutes)) +
  geom_density()

In [5]:
# Histogram of the number of ingredients, one observation per recipe.
# `food3` holds one row per review, so the median is taken from the same
# de-duplicated recipes that are plotted; computing it on `food3` directly
# would weight every recipe by its number of reviews.
recipe_ingredients <- food3 %>%
  distinct(id, name, n_ingredients)
median_ingredients <- median(recipe_ingredients$n_ingredients)

# Ingredient counts are whole numbers, so use one bin per value instead of
# the default of 30 bins.
recipe_ingredients %>%
  ggplot(aes(n_ingredients)) +
  geom_histogram(binwidth = 1, fill = "#c9c7f1") +
  geom_vline(xintercept = median_ingredients) +
  ggtitle("Distribution of Recipe Num of Ingredients") +
  xlab("Num of Ingredients") +
  ylab("Number of Recipes")

paste0("Median Number of Ingredients: ", median_ingredients)

In [6]:
# Bar chart of ratings with the mean rating marked by a vertical line.
# Every row of `food3` is one review, so the bar heights count reviews and
# the y axis is labelled accordingly.
mean_rating <- mean(food3$rating)

food3 %>%
  ggplot(aes(rating)) +
  geom_bar(fill = "#c9c7f1") +
  geom_vline(xintercept = mean_rating) +
  ggtitle("Distribution of Recipe Ratings") +
  xlab("Rating") +
  ylab("Number of Reviews") +
  scale_x_continuous(breaks = 0:5)

paste0("Mean Rating: ", round(mean_rating, 2))

In [7]:
# Submission years: one row per distinct (submission_year, id).
sub_years <- distinct(food3, submission_year, id)
# Review years: one row per row of `food3`.
review_years <- select(food3, review_year)

# Overlay both bar charts; the submission layer is added last, so it is
# drawn on top of the review layer.
ggplot() +
  geom_bar(aes(review_year), data = review_years, fill = "#ef5f58") +
  geom_bar(aes(submission_year), data = sub_years, fill = "#d3f0b0") +
  labs(
    x = "Year",
    y = "Count",
    title = "Distribution of Recipe Submission and Review Years",
    subtitle = "Green bars represent submission, red bars represent reviews"
  )
 

In [8]:
# Submission months: one row per distinct (submission_month, id).
sub_months <- food3 %>%
  select(submission_month, id) %>%
  distinct()
# Review months: one row per row of `food3`.
review_months <- food3 %>%
  select(review_month)

# Overlay both bar charts; the submission layer is drawn on top of the
# review layer. The title names months, which is what this plot shows.
ggplot() +
  geom_bar(data = review_months, aes(review_month), fill = "#ef5f58") +
  geom_bar(data = sub_months, aes(submission_month), fill = "#d3f0b0") +
  xlab("Month") +
  ylab("Count") +
  ggtitle(
    "Distribution of Recipe Submission and Review Months",
    subtitle = "Green bars represent submission, red bars represent reviews"
  )


In [9]:

# Density of calories on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_calories <- food3 %>%
  distinct(id, calories) %>%
  mutate(calories = as.numeric(calories))
median_calories <- median(recipe_calories$calories)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_calories %>%
  ggplot(aes(calories)) +
  geom_density(fill = "#d3f0b0") +
  geom_vline(xintercept = median_calories) +
  xlab("Calories") +
  ylab(NULL) +
  ggtitle("Distribution of Calories") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Calories: ", median_calories)


In [10]:
# Density of total fat on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_total_fat <- food3 %>%
  distinct(id, total_fat) %>%
  mutate(total_fat = as.numeric(total_fat))
median_total_fat <- median(recipe_total_fat$total_fat)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_total_fat %>%
  ggplot(aes(total_fat)) +
  geom_density(fill = "#d3f0b0") +
  geom_vline(xintercept = median_total_fat) +
  xlab("Total Fat") +
  ylab(NULL) +
  ggtitle("Distribution of Total Fat") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Total Fat: ", median_total_fat)

In [11]:
# Density of sugar on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_sugar <- food3 %>%
  distinct(id, sugar) %>%
  mutate(sugar = as.numeric(sugar))
median_sugar <- median(recipe_sugar$sugar)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_sugar %>%
  ggplot(aes(sugar)) +
  geom_density(fill = "#d3f0b0") +
  geom_vline(xintercept = median_sugar) +
  xlab("Sugar") +
  ylab(NULL) +
  ggtitle("Distribution of Sugar") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Sugar: ", median_sugar)

In [12]:

# Density of sodium on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_sodium <- food3 %>%
  distinct(id, sodium) %>%
  mutate(sodium = as.numeric(sodium))
median_sodium <- median(recipe_sodium$sodium)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_sodium %>%
  ggplot(aes(sodium)) +
  geom_density(fill = "#d3f0b0") +
  geom_vline(xintercept = median_sodium) +
  xlab("Sodium") +
  ylab(NULL) +
  ggtitle("Distribution of Sodium") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Sodium: ", median_sodium)

In [13]:
# Shared fill colour for this plot and for plots further down the script.
green <- "#d3f0b0"

# Density of protein on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_protein <- food3 %>%
  distinct(id, protein) %>%
  mutate(protein = as.numeric(protein))
median_protein <- median(recipe_protein$protein)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_protein %>%
  ggplot(aes(protein)) +
  geom_density(fill = green) +
  geom_vline(xintercept = median_protein) +
  xlab("Protein") +
  ylab(NULL) +
  ggtitle("Distribution of Protein") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Protein: ", median_protein)


In [14]:
# Density of saturated fat on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_sat_fat <- food3 %>%
  distinct(id, sat_fat) %>%
  mutate(sat_fat = as.numeric(sat_fat))
median_sat_fat <- median(recipe_sat_fat$sat_fat)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_sat_fat %>%
  ggplot(aes(sat_fat)) +
  geom_density(fill = green) +
  geom_vline(xintercept = median_sat_fat) +
  xlab("Saturated Fat") +
  ylab(NULL) +
  ggtitle("Distribution of Saturated Fat") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Saturated Fat: ", median_sat_fat)


In [15]:
# Density of carbs on a log10 x axis with the median marked.
# The column is selected by name instead of by position, rows are
# de-duplicated per recipe id, and the median is taken from the same
# de-duplicated data that is plotted (`food3` repeats a recipe once per
# review, which would weight the median by review count).
recipe_carbs <- food3 %>%
  distinct(id, carbs) %>%
  mutate(carbs = as.numeric(carbs))
median_carbs <- median(recipe_carbs$carbs)

# NOTE: a value of 0 becomes -Inf on the log10 axis and is left out of the
# density by ggplot2.
recipe_carbs %>%
  ggplot(aes(carbs)) +
  geom_density(fill = green) +
  geom_vline(xintercept = median_carbs) +
  xlab("Carbs") +
  ylab(NULL) +
  ggtitle("Distribution of Carbs") +
  theme(axis.text.y = element_blank()) +
  scale_x_log10()

paste0("Median Carbs: ", median_carbs)


In [16]:

# Correlation matrix of the numeric recipe and review attributes.
# Columns are picked by name rather than by position so that a change in
# column order cannot silently swap variables.
# NOTE(review): the original positional range 6:13 is assumed to be the seven
# nutrition columns followed by `n_steps` - confirm against the input file.
corr_values <- food3 %>%
  select(
    minutes, calories:carbs, n_steps, n_ingredients,
    rating, submission_year, review_year
  )

# Make every column numeric before building the matrix; with character
# columns present `as.matrix()` would produce a character matrix.
corr_values[] <- lapply(corr_values, as.numeric)
corr_values <- na.omit(corr_values)

corrmat1 <- rcorr(as.matrix(corr_values))

# Upper triangle only, variables ordered by hierarchical clustering.
corrplot(
  corrmat1$r,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

corrmat1$r

In [17]:
# Keep the identifier, free-text, rating and user columns for the text
# analysis below.
text_food <- select(
  food3,
  name, id, tags, steps, description, ingredients, review, rating, user_id
)

In [18]:
# Long format: the `ingredients` string of every row is split on ", " into
# one row per piece, keeping the rating of the originating row.
ingredients <- separate_rows(
  select(food3, ingredients, rating),
  ingredients,
  sep = "\\, "
)

# Strip square brackets and single quotes from each piece, then compute the
# number of rows and the mean rating per ingredient.
ingredients2 <- ingredients %>%
  mutate(ingredients = str_replace_all(ingredients, "\\[|\\]|\\'", "")) %>%
  group_by(ingredients) %>%
  summarise(
    count = n(),
    avg_rating = mean(rating)
  )

head(ingredients2, n = 10)

In [19]:
# Horizontal bar chart of the ten ingredients with the highest `count`,
# ordered by that count.
ingredients2 %>%
  arrange(desc(count)) %>%
  head(n = 10) %>%
  ggplot(aes(fct_reorder(ingredients, count), count)) +
  geom_col(fill = green) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Count",
    title = "The 10 Most Common Ingredients"
  )

In [20]:
# Among ingredients with a count of at least 500, plot the ten with the
# highest average rating and label each bar with the rounded value.
ingredients2 %>%
  filter(count >= 500) %>%
  arrange(desc(avg_rating)) %>%
  head(n = 10) %>%
  ggplot(aes(fct_reorder(ingredients, avg_rating), avg_rating)) +
  geom_col(fill = green) +
  geom_label(aes(label = round(avg_rating, 2))) +
  coord_flip() +
  labs(
    x = NULL,
    y = "Average Rating",
    title = "The 10 Best Common Ingredients"
  )

In [21]:
# Among ingredients with a count of at least 500, plot the ten with the
# lowest average rating and label each bar with the rounded value.
#
# `fct_rev()` reverses the order of the factor *levels*, so the axis order is
# flipped while every ingredient stays paired with its own rating. Base
# `rev()` reverses the factor's *values* instead, which attaches each
# ingredient name to another row's average rating.
ingredients2 %>%
  filter(count >= 500) %>%
  arrange(desc(avg_rating)) %>%
  tail(10) %>%
  ggplot(aes(fct_rev(fct_reorder(ingredients, avg_rating)), avg_rating)) +
  geom_bar(stat = "identity", fill = green) +
  geom_label(aes(label = round(avg_rating, 2))) +
  coord_flip() +
  xlab(NULL) +
  ylab("Average Rating") +
  ggtitle("The 10 Worst Common Ingredients")
  

In [22]:
# Review text and rating for rows where neither is missing.
# `rating_count` holds the number of characters in the review text, and
# `rating` is turned into a character so it acts as a discrete axis below.
reviews <- text_food %>%
  select(review, rating) %>%
  na.omit() %>%
  mutate(
    rating_count = str_length(review),
    rating = as.character(rating)
  )

# Show one example review, first as text and then as a full row.
reviews$review[[73]]

reviews[73, ]

# Box plot of review length per rating on a log10 y axis.
reviews %>%
  ggplot(aes(rating, rating_count)) +
  geom_boxplot(fill = green) +
  scale_y_log10() +
  labs(
    x = "Rating",
    y = "Character Count",
    title = "Review Character Counts by Rating"
  )

paste0(
  "Correlation Value: ",
  round(with(reviews, cor(rating_count, as.numeric(rating))), 2)
)

In [23]:
# Count exclamation marks and upper-case words in every review.
# The regex matches words of at least two upper-case letters; the \b word
# boundaries let separate words be counted even when punctuation follows.
reviews2 <- mutate(
  reviews,
  num_exclam = str_count(review, "!"),
  num_upper = str_count(review, "\\b[A-Z][A-Z]+\\b")
)

# Show the same example review with its new counts.
reviews2$review[[73]]

reviews2[73, ]

In [24]:
# Box plot of the number of exclamation marks per review, by rating.
# A log10 axis maps a count of 0 to -Inf, and ggplot2 removes those reviews
# before the box-plot statistics are computed. The log1p transformation
# (log(1 + x)) keeps the compressed scale while retaining zero counts.
reviews2 %>%
  ggplot(aes(rating, num_exclam)) +
  geom_boxplot(fill = green) +
  scale_y_continuous(
    trans = "log1p",
    breaks = c(0, 1, 2, 5, 10, 20, 50, 100)
  ) +
  xlab("Rating") +
  ylab("Exclamation Points") +
  ggtitle("Number of Exclamation Marks in Reviews by Rating")

paste0(
  "Correlation Value: ",
  round(cor(reviews2$num_exclam, as.numeric(reviews2$rating)), 2)
)

In [25]:
# Box plot of the number of upper-case words per review, by rating.
# A log10 axis maps a count of 0 to -Inf, and ggplot2 removes those reviews
# before the box-plot statistics are computed. The log1p transformation
# (log(1 + x)) keeps the compressed scale while retaining zero counts.
reviews2 %>%
  ggplot(aes(rating, num_upper)) +
  geom_boxplot(fill = green) +
  scale_y_continuous(
    trans = "log1p",
    breaks = c(0, 1, 2, 5, 10, 20, 50, 100)
  ) +
  xlab("Rating") +
  ylab("Upper-Case Words") +
  ggtitle("Number of Upper-Case Words in Reviews by Rating")

paste0(
  "Correlation Value: ",
  round(cor(reviews2$num_upper, as.numeric(reviews2$rating)), 2)
)

In [26]:

library(wordcloud)

# Split every entry of `ingredients2` into single words on spaces, remove
# double quotes, count the rows per word and drop a few filler tokens and
# the empty string.
ingred <- ingredients2 %>%
  separate_rows(ingredients, sep = "\\ ") %>%
  mutate(ingredients = str_replace_all(ingredients, '"', "")) %>%
  group_by(ingredients) %>%
  tally() %>%
  filter(!ingredients %in% c("s", "the", "and", "or", "a", ""))

# Word cloud of up to 200 words, placed in decreasing order of frequency.
wordcloud(
  words = ingred$ingredients,
  freq = ingred$n,
  max.words = 200,
  random.order = FALSE
)

In [27]:
# Split the `name` column of `text_food` into single words on spaces and
# count the rows per word.
names <- text_food %>%
  separate_rows(name, sep = " ") %>%
  count(name)

# Drop a few filler tokens and the empty string.
names2 <- filter(names, !name %in% c("s", "the", "and", "or", "a", ""))

# Word cloud of up to 200 words, placed in decreasing order of frequency.
wordcloud(
  words = names2$name,
  freq = names2$n,
  max.words = 200,
  random.order = FALSE
)
