In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize, pos_tag
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.tokenize import RegexpTokenizer, word_tokenize
from PIL import Image
import plotly.express as px



In [2]:
# kaggle link for dataset: https://www.kaggle.com/datasets/chiyucheng/fda-food-enforcement-20082022?select=food_enforcement.csv
# reading the dataset in pandas dataframe

df = pd.read_csv('food_enforcement.csv')

## EDA to the first dataset

In [3]:
# checking columns and datatypes
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22840 entries, 0 to 22839
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   status                      22840 non-null  object
 1   city                        22840 non-null  object
 2   state                       22534 non-null  object
 3   country                     22840 non-null  object
 4   classification              22840 non-null  object
 5   event_id                    22840 non-null  int64 
 6   recalling_firm              22840 non-null  object
 7   address_1                   22838 non-null  object
 8   address_2                   1362 non-null   object
 9   postal_code                 22520 non-null  object
 10  voluntary_mandated          22834 non-null  object
 11  initial_firm_notification   22833 non-null  object
 12  distribution_pattern        22840 non-null  object
 13  recall_number               22839 non-null  ob

Unnamed: 0,status,city,state,country,classification,event_id,recalling_firm,address_1,address_2,postal_code,...,distribution_pattern,recall_number,product_description,product_quantity,reason_for_recall,recall_initiation_date,center_classification_date,report_date,code_info,termination_date
0,Ongoing,Jeffersonville,IN,United States,Class II,90568,"Enjoy Life Natural Brands, LLC",301 Salem Rd,,47130,...,Nationwide US and Canada,F-1568-2022,Enjoy Life Chewy Bars Caramel Blondie (Choco...,"89,736 total cases",Due to the potential presence of a foreign mat...,2022-06-30,2022-08-19,2022-08-31,Best By 3/3/2023 3/4/2023 23-MR-03,
1,Ongoing,Jeffersonville,IN,United States,Class II,90568,"Enjoy Life Natural Brands, LLC",301 Salem Rd,,47130,...,Nationwide US and Canada,F-1572-2022,"Enjoy Life Brownie Bites Rich Chocolate, 4.7...","89,736 total cases",Due to the potential presence of a foreign mat...,2022-06-30,2022-08-19,2022-08-31,Best By 1/10/2023,
2,Ongoing,Jeffersonville,IN,United States,Class II,90568,"Enjoy Life Natural Brands, LLC",301 Salem Rd,,47130,...,Nationwide US and Canada,F-1567-2022,"Enjoy Life Chewy Bars Sunseed Crunch, 5.75 o...","89,736 total cases",Due to the potential presence of a foreign mat...,2022-06-30,2022-08-19,2022-08-31,Best By 3/3/2023,
3,Ongoing,Jeffersonville,IN,United States,Class II,90568,"Enjoy Life Natural Brands, LLC",301 Salem Rd,,47130,...,Nationwide US and Canada,F-1569-2022,Enjoy Life Soft Baked Fruit & Oat Breakfast Ov...,"89,736 total cases",Due to the potential presence of a foreign mat...,2022-06-30,2022-08-19,2022-08-31,Best By 2/3/2023 2/4/2023 2/12/2023 2/13/2023 ...,
4,Terminated,Lewis Center,OH,United States,Class II,90671,Pegroks Kitchen,7500 Green Meadows Dr N,,43035,...,"Ohio, New York",F-1582-2022,"Homemade Bofrot/Puff Puff mix, Pegroks Kitchen...",1000 units,Undeclared Allergen - Wheat,2022-07-26,2022-08-26,2022-08-31,Lot Number: 723592191436 Best By: 12/2022,2022-08-29


In [4]:
# String to datetime

df["recall_initiation_date"] =  pd.to_datetime(df["recall_initiation_date"], format="%Y-%m-%d")

# making a new dataframe with the columns I want to use at the beginning

food = df[['classification', 'recalling_firm', 'reason_for_recall', 'recall_initiation_date']]

# making two more columns with month and year

food['year'] = food['recall_initiation_date'].dt.year
food['month'] = food['recall_initiation_date'].dt.month


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  food['year'] = food['recall_initiation_date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  food['month'] = food['recall_initiation_date'].dt.month


In [5]:
import plotly.graph_objs as go


# checking the number of food recalls per year

years = food.groupby('year', as_index = False).count()

# a plot to visualise the difference between years

fig_years = px.bar(years, 
           x='year', 
           y='recalling_firm',
           color='classification', 
           title='Total number of food recalls per year',
           template='simple_white',
           color_continuous_scale=["#F4ECD6","#A7CAB1","#88B7B5", "#847996"],
           width=1000,
           height=500)

fig_years.add_traces(go.Scatter(x= years.year, y=years.recalling_firm, mode = 'lines', showlegend = False, marker_color='#FDDB31'))
fig_years.update_layout( font_family='Courier New', title_font_size=24)


In [6]:
# checking the number of food recalls per month

months = food.groupby('month', as_index = False).count()

# a plot to visualise the difference between months

fig_months = px.bar(months, 
           x='month', 
           y='recalling_firm',
           color='classification', 
           title='Total number of food recalls per month',
           template='simple_white',
           color_continuous_scale=["#F4ECD6","#A7CAB1","#88B7B5", "#847996"],
           width=1000,
           height=500)

fig_months.add_traces(go.Scatter(x= months.month, y=months.recalling_firm, mode = 'lines', showlegend = False, marker_color='#FDDB31'))
fig_months.update_layout( font_family='Courier New', title_font_size=24)


In [7]:
# top 10 firms in recall number

most_recalling_firm = food.groupby('recalling_firm',  as_index=False).count().sort_values('classification', ascending = False).head(10)


In [8]:
# plots them in a bar chart

fig = px.bar(most_recalling_firm, 
           x='classification', 
           y='recalling_firm',
           color='classification', 
           title='Firms with the largest number of food recalls',
           template='simple_white',
           color_continuous_scale=["#F4ECD6", "#A7CAB1","#88B7B5", "#847996"],
           width=1000,
           height=500)

fig.update_layout(font_family='Courier New', title_font_size=24)

In [9]:
# Distribution of danger class (1: most dangerous)

classes = food.groupby('classification',  as_index=False).count()


In [10]:
# pie chart 


fig = px.pie(data_frame=classes,
             names= {'Class I': 'Most Dangerous', 
                     'Class II':'Potentially Dangerous', 
                     'Class III':'Least Dangerous'},
             color_discrete_map={9916:'#88B7B5',
                                 11681:'#847996',
                                 1243:'#F4ECD6',
                                 },
             labels=classes['recalling_firm'],
             values=classes['recalling_firm'],
             hole=.4,
             color='reason_for_recall',
             title='Recall Classification')

fig.update_layout(font_family='Courier New', title_font_size=20)
fig.update_layout(height=400, width=500, title_text='Recall Classification', font_size=14)


## NLP to the announcement texts

### I am keeping most of the code as comment in order to decrease the file size

In [11]:
# convert the column with reason_for_recall text in lowercase

food["reason_for_recall"] = food["reason_for_recall"].str.lower()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
# clear the text from characters we don't need

import string
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text
food["reason_for_recall"] = food["reason_for_recall"].apply(remove_punctuations)

# also words we don't need
words = ['food','products', 'product', 'potential', 'due', 'presence', 'good', 'recall','hospital','contamination', 'contains','patient','single','fda', 'inspection']

for word in words:
    food["reason_for_recall"] = food["reason_for_recall"].str.replace(word,"")




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [13]:
# Stopwords from nltk

nltk.download('stopwords')
  
stop = stopwords.words('english')

#Removes them

food["reason_for_recall"] = food["reason_for_recall"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/konstantina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
#Make lists out of tokens
nltk.download('punkt')

food["tokens"] = food["reason_for_recall"].apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/konstantina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [15]:
# Lemmatize the column
nltk.download('wordnet')
nltk.download('omw-1.4')

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

food['text_lemmatized'] = food['reason_for_recall'].apply(lemmatize_text)



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/konstantina/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/konstantina/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [16]:
# Tag the words
nltk.download('averaged_perceptron_tagger')


food['tagged'] = food['text_lemmatized'].apply(nltk.pos_tag)



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/konstantina/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [17]:
# keep only nouns, adjectives, adverbs

food['n_adj_adv'] = food['tagged'].apply(lambda x: [word for word, tag in x if tag in ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'RB', 'RBR', 'RBS' ]])





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
# makes a column with a final cleared text


def make_string(lst):
    string = " ".join(lst)
    return string

food['final'] = food["n_adj_adv"].apply(make_string)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [19]:
# making the wordpress 

#reasons = " ".join(reason for reason in food.final)


In [20]:
# taking an image in order to generate the wordcloud in a shape. I am choosing an apple..!

#apple = np.array(Image.open(r'/Users/konstantina/Downloads/14228450.jpg'))



In [21]:
# Create and generate wordcloud

#wordcloud = WordCloud( background_color=None, mask=apple, mode='RGBA', max_words=200, width=800, height=400).generate(reasons)


#Display the image

# plt.figure( figsize=(20,10) )

# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.show()



## Parsing one more dataset in order to study further the reasons of the recalls

In [22]:
df2 = pd.read_csv('combined.csv')

In [23]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21357 entries, 0 to 21356
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   status                        21357 non-null  object
 1   city                          21357 non-null  object
 2   state                         21076 non-null  object
 3   country                       21357 non-null  object
 4   classification                21357 non-null  int64 
 5   event_id                      21357 non-null  int64 
 6   recalling_firm                21357 non-null  object
 7   address_1                     21355 non-null  object
 8   address_2                     1295 non-null   object
 9   postal_code                   21063 non-null  object
 10  voluntary_mandated            21352 non-null  object
 11  initial_firm_notification     21352 non-null  object
 12  distribution_pattern          21357 non-null  object
 13  recall_number   

In [24]:
# keeping in a new dataframe just the columns I need

combined = df2[['product_description','reason_for_recall_simplified']]


In [25]:
reason_for_recall = combined.groupby('reason_for_recall_simplified', as_index=False).count().sort_values('product_description', ascending = False).head(10)

In [26]:
# plots them in a bar chart

fig_reasons = px.bar(reason_for_recall, 
           x='product_description', 
           y='reason_for_recall_simplified',
           color='product_description', 
           title='Top 10 reasons for food recalls',
           template='simple_white',
           color_continuous_scale=["#F4ECD6", "#A7CAB1","#88B7B5", "#847996"],
           width=1000,
           height=500)

# fig_reasons.update_layout(font_family='Courier New', title_font_size=24)