# Fast Food EDA

## Purpose

The purpose of this project is to conduct exploratory data analysis on american fast food chains, and compare those that are doing well, with those that are struggling. 

## Introduction

In [74]:
# import libraries
import pandas as pd 
import numpy as np
import plotly_express as px 

In [41]:
# read dataframe
df = pd.read_csv('datasets/Top 50 Fast-Food Chains in USA.csv')

In [42]:
# look at dataframe
df.head()

Unnamed: 0,name,us_sales_millions,average_unit_sales_thousands,franchised_stores,company_stores,2021_total_units,change_in_units_from_2020
0,Arby's,4462,1309,2293,1116,3409,40
1,Baskin-Robbins,686,296,2317,0,2317,102
2,Bojangles,1485,1924,496,277,773,15
3,Burger King,10033,1470,7054,51,7105,24
4,Carl's Jr.,1560,1400,1011,47,1058,-21


In [43]:
# look for missing values
df.isna().sum()

name                            0
us_sales_millions               0
average_unit_sales_thousands    0
franchised_stores               0
company_stores                  0
2021_total_units                0
change_in_units_from_2020       0
dtype: int64

In [44]:
# look for duplicats
df.duplicated().sum()

0

In [45]:
# look at column info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   name                          50 non-null     object
 1   us_sales_millions             50 non-null     int64 
 2   average_unit_sales_thousands  50 non-null     int64 
 3   franchised_stores             50 non-null     int64 
 4   company_stores                50 non-null     int64 
 5   2021_total_units              50 non-null     int64 
 6   change_in_units_from_2020     50 non-null     int64 
dtypes: int64(6), object(1)
memory usage: 2.9+ KB


In [46]:
# look at column names
df.columns

Index(['name', 'us_sales_millions', 'average_unit_sales_thousands',
       'franchised_stores', 'company_stores', '2021_total_units',
       'change_in_units_from_2020'],
      dtype='object')

The dataset looks clean, with 50 fast food chain names and some sales information. We changed the column names of the spreadsheet, and read it to a pandas dataframe. 

## Feature Engineering

In [None]:
df.head()

Unnamed: 0,name,us_sales_millions,average_unit_sales_thousands,franchised_stores,company_stores,2021_total_units,change_in_units_from_2020
0,Arby's,4462,1309,2293,1116,3409,40
1,Baskin-Robbins,686,296,2317,0,2317,102
2,Bojangles,1485,1924,496,277,773,15
3,Burger King,10033,1470,7054,51,7105,24
4,Carl's Jr.,1560,1400,1011,47,1058,-21


In [None]:
# creating new features 
df['share_of_sales'] = df.us_sales_millions / df.us_sales_millions.sum(axis=0) * 100
df['company_store_ratio'] = df.company_stores / df.franchised_stores
df['units_percent_change'] = df.change_in_units_from_2020 / df['2021_total_units'] * 100

In [75]:
# replace infinity with 0
df.loc[np.isinf(df['company_store_ratio']), 'company_store_ratio'] = 0


In [122]:
df.head()

Unnamed: 0,name,us_sales_millions,average_unit_sales_thousands,franchised_stores,company_stores,2021_total_units,change_in_units_from_2020,share_of_sales,company_store_ratio,units_percent_change,total_stores
0,Arby's,4462,1309,2293,1116,3409,40,1.79736,0.486699,1.173365,3409
1,Baskin-Robbins,686,296,2317,0,2317,102,0.276331,0.0,4.402244,2317
2,Bojangles,1485,1924,496,277,773,15,0.59818,0.558468,1.940492,773
3,Burger King,10033,1470,7054,51,7105,24,4.041442,0.00723,0.33779,7105
4,Carl's Jr.,1560,1400,1011,47,1058,-21,0.628391,0.046489,-1.984877,1058


## EDA

In [77]:
# summary statistics of columns
df.describe()

Unnamed: 0,us_sales_millions,average_unit_sales_thousands,franchised_stores,company_stores,2021_total_units,change_in_units_from_2020,share_of_sales,company_store_ratio,units_percent_change
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,4965.06,1857.66,2663.96,503.5,3167.4,29.7,2.0,1.348229,2.335806
std,7531.439444,1163.266418,3801.49754,1332.16928,4124.173105,173.769875,3.033776,4.140731,4.347926
min,615.0,296.0,0.0,0.0,243.0,-1043.0,0.247731,0.0,-4.932142
25%,935.25,1084.0,478.0,33.75,788.25,-4.25,0.376733,0.007223,-0.096925
50%,2289.5,1510.5,1115.5,176.0,1634.0,24.0,0.922245,0.065273,0.954043
75%,5400.0,2129.75,3103.0,396.0,3516.25,91.0,2.1752,0.482101,4.664918
max,45960.0,6100.0,21147.0,8953.0,21147.0,246.0,18.513371,23.652174,15.63786


In [78]:
# skew of the columns
df.skew()


Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.



us_sales_millions               3.849276
average_unit_sales_thousands    1.607486
franchised_stores               2.968557
company_stores                  5.580399
2021_total_units                2.674267
change_in_units_from_2020      -4.815774
share_of_sales                  3.849276
company_store_ratio             4.213765
units_percent_change            1.134608
dtype: float64

In [79]:
# looking at correlations
px.imshow(df.corr(), text_auto=True, aspect='auto', template='ggplot2', title='Correlations')

In [123]:
df.columns

Index(['name', 'us_sales_millions', 'average_unit_sales_thousands',
       'franchised_stores', 'company_stores', '2021_total_units',
       'change_in_units_from_2020', 'share_of_sales', 'company_store_ratio',
       'units_percent_change', 'total_stores'],
      dtype='object')

In [124]:
columns = ['us_sales_millions', 'average_unit_sales_thousands',
       'franchised_stores', 'company_stores', '2021_total_units',
       'change_in_units_from_2020', 'share_of_sales', 'company_store_ratio',
       'units_percent_change']
for column in columns:
    px.histogram(df[column], title='Distribution of ' + str.upper(column), labels={'value': str.upper(column)}).show()

In [125]:
for column in columns:
    px.box(df[column], title='Distribution of ' + str.upper(column), labels={'value': str.upper(column)}).show()

In [88]:
df.sort_values(by='us_sales_millions', ascending=False).head()

Unnamed: 0,name,us_sales_millions,average_unit_sales_thousands,franchised_stores,company_stores,2021_total_units,change_in_units_from_2020,share_of_sales,company_store_ratio,units_percent_change
28,McDonald's,45960,3420,12775,663,13438,244,18.513371,0.051898,1.815746
40,Starbucks,24300,1200,6497,8953,15450,113,9.788401,1.378021,0.731392
6,Chick-fil-A,16700,6100,2650,82,2732,155,6.727008,0.030943,5.673499
42,Taco Bell,12600,1823,6540,462,7002,203,5.075467,0.070642,2.899172
45,Wendy's,11111,1895,5535,403,5938,57,4.475676,0.072809,0.959919


In [149]:
# Sales 
for number in [10, 25, 50]:
    px.bar(df.sort_values(by='us_sales_millions', ascending=False).head(number), x='name', y='us_sales_millions', title='Top ' +str(number)+ ' US Sales', labels={'us_sales_millions': 'Sales (millions)'}, height=700, color='name').show()

In [151]:
for number in [10, 25, 50]:
    px.bar(df.sort_values(by='average_unit_sales_thousands', ascending=False).head(number), x='name', y='average_unit_sales_thousands', title='Top ' +str(number)+ ' Unit Sales', labels={'average_unit_sales_thousands': 'Unit Sales (thousands)'}, height=700, color='name').show()

In [119]:
px.bar(df, x='name', y=['franchised_stores', 'company_stores'], title='Total Number of Stores',labels={'value': 'Number of Stores'}, height=700)

In [135]:
px.bar(df.sort_values(by='total_stores', ascending=False).head(10), x='name', y='total_stores', title='Top 10 Total Number of Stores', labels={'total_stores': 'Number of Stores'}, height=700, color='name')

In [154]:
for number in [10, 25, 50]:
    px.bar(df.sort_values(by='2021_total_units', ascending=False).head(number), x='name', y='2021_total_units', title='Top ' +str(number)+ ' 2021 Total Units', color='name', height=700).show()