In [1]:
# Analyzing data on Thanksgiving dinner in the U.S. File contains 1058 rows and 65 columns.
# The file has responses to an online survey about what Americans eat for Thanksgiving dinner.
# Most of the columns are questions

In [2]:
import pandas as pd
import numpy as np
# Read the survey CSV into a dataframe, decoding the file as Latin-1.
# NOTE(review): this is an absolute, machine-specific path -- consider a configurable data directory.
df_thanksgiving = pd.read_csv(
    '/home/ivor/Documents/DataQuest/Intermediate Python and Pandas/'
    'DataAnalysisWithPandas/thanksgiving.csv',
    encoding="Latin-1",
)

In [3]:
df_thanksgiving.head(2)

Unnamed: 0,RespondentID,Do you celebrate Thanksgiving?,What is typically the main dish at your Thanksgiving dinner?,What is typically the main dish at your Thanksgiving dinner? - Other (please specify),How is the main dish typically cooked?,How is the main dish typically cooked? - Other (please specify),What kind of stuffing/dressing do you typically have?,What kind of stuffing/dressing do you typically have? - Other (please specify),What type of cranberry saucedo you typically have?,What type of cranberry saucedo you typically have? - Other (please specify),...,Have you ever tried to meet up with hometown friends on Thanksgiving night?,"Have you ever attended a ""Friendsgiving?""",Will you shop any Black Friday sales on Thanksgiving Day?,Do you work in retail?,Will you employer make you work on Black Friday?,How would you describe where you live?,Age,What is your gender?,How much total combined money did all members of your HOUSEHOLD earn last year?,US Region
0,4337954960,Yes,Turkey,,Baked,,Bread-based,,,,...,Yes,No,No,No,,Suburban,18 - 29,Male,"$75,000 to $99,999",Middle Atlantic
1,4337951949,Yes,Turkey,,Baked,,Bread-based,,Other (please specify),Homemade cranberry gelatin ring,...,No,No,Yes,No,,Rural,18 - 29,Female,"$50,000 to $74,999",East South Central


In [4]:
df_thanksgiving.columns

Index(['RespondentID', 'Do you celebrate Thanksgiving?',
       'What is typically the main dish at your Thanksgiving dinner?',
       'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       'How is the main dish typically cooked?',
       'How is the main dish typically cooked? - Other (please specify)',
       'What kind of stuffing/dressing do you typically have?',
       'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       'What type of cranberry saucedo you typically have?',
       'What type of cranberry saucedo you typically have? - Other (please specify)',
       'Do you typically have gravy?',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       'Which of these side dishes aretypically served

In [5]:
# Because I want to understand what people ate for Thanksgiving, I'll remove any responses from people who
# don't celebrate it. The column "Do you celebrate Thanksgiving?" contains this information. 
# I only want to keep data for people who answered Yes to this question.

In [6]:
# this method is used to calculate the total no. of people who celebrate and don't
df_thanksgiving["Do you celebrate Thanksgiving?"].value_counts()

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64

In [7]:
# So there are 980 people in this dataset who celebrate thanksgiving and so lets focus on them only.

In [8]:
# Keep only the respondents who answered 'Yes' to "Do you celebrate Thanksgiving?",
# i.e. remove everyone who doesn't celebrate Thanksgiving from the dataframe.
# .copy() makes the filtered result an independent dataframe, so the columns that are
# assigned to it further down do not trigger pandas' SettingWithCopyWarning.
df_thanksgiving = df_thanksgiving[df_thanksgiving['Do you celebrate Thanksgiving?'] == 'Yes'].copy()

In [57]:
df_thanksgiving.head(3)

Unnamed: 0,RespondentID,Do you celebrate Thanksgiving?,What is typically the main dish at your Thanksgiving dinner?,What is typically the main dish at your Thanksgiving dinner? - Other (please specify),How is the main dish typically cooked?,How is the main dish typically cooked? - Other (please specify),What kind of stuffing/dressing do you typically have?,What kind of stuffing/dressing do you typically have? - Other (please specify),What type of cranberry saucedo you typically have?,What type of cranberry saucedo you typically have? - Other (please specify),...,Will you shop any Black Friday sales on Thanksgiving Day?,Do you work in retail?,Will you employer make you work on Black Friday?,How would you describe where you live?,Age,What is your gender?,How much total combined money did all members of your HOUSEHOLD earn last year?,US Region,int_age,int_income
0,4337954960,Yes,Turkey,,Baked,,Bread-based,,,,...,No,No,,Suburban,18 - 29,Male,"$75,000 to $99,999",Middle Atlantic,18,75000
1,4337951949,Yes,Turkey,,Baked,,Bread-based,,Other (please specify),Homemade cranberry gelatin ring,...,Yes,No,,Rural,18 - 29,Female,"$50,000 to $74,999",East South Central,18,50000
2,4337935621,Yes,Turkey,,Roasted,,Rice-based,,Homemade,,...,Yes,No,,Suburban,18 - 29,Male,"$0 to $9,999",Mountain,18,0


In [10]:
# so the new dataframe should have only 980 yes'.
# this method has verified it.
df_thanksgiving["Do you celebrate Thanksgiving?"].value_counts()

Yes    980
Name: Do you celebrate Thanksgiving?, dtype: int64

In [11]:
# I am going to explore what main dishes people tend to eat during Thanksgiving dinner.

In [12]:
df_thanksgiving["What is typically the main dish at your Thanksgiving dinner?"].value_counts()

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

In [13]:
# So out of the 980 people, there are 859 people who have Turkey as their main dish.
# So 121 people actually don't have Turkey as their main dish... Interesting......

In [14]:
# There are 20 people who buy Tofurkey which is a replacement for Turkey
# lets see how many of them actually use gravy 

In [15]:
# Rows whose answer to the main-dish question is exactly 'Tofurkey'.
Tofurkey = df_thanksgiving.loc[
    df_thanksgiving['What is typically the main dish at your Thanksgiving dinner?'] == 'Tofurkey'
]

In [16]:
Tofurkey.loc[:, 'Do you typically have gravy?']

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object

In [17]:
# Findings - 12 of the 20 Tofurkey respondents (60%) typically have gravy.

In [18]:
# lets explore the dessert dishes. Specifically, we'll look at how many people eat Apple, Pecan, or 
# Pumpkin pie during Thanksgiving dinner. This data is encoded in the following three columns:
    # Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple
    # Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin
    # Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan

In [19]:
# In order to find the total no. of people who ate at least one of the above, I am generating a boolean 
# Series for each column to eliminate the nulls.
# Then I join all three Series to get a single boolean Series, Where the Series contains False, the person ate
# at least one of the types of pie.

In [20]:
# One boolean Series per pie column: True on the rows where that column is null.
apple_isnull = df_thanksgiving[
    "Which type of pie is typically served at your Thanksgiving dinner? "
    "Please select all that apply. - Apple"
].isna()

pumpkin_isnull = df_thanksgiving[
    "Which type of pie is typically served at your Thanksgiving dinner? "
    "Please select all that apply. - Pumpkin"
].isna()

pecan_isnull = df_thanksgiving[
    "Which type of pie is typically served at your Thanksgiving dinner? "
    "Please select all that apply. - Pecan"
].isna()

In [21]:
# Combine the three null-masks with AND: the result is True only where all three pie
# columns are null.
# NOTE(review): despite the name, True here marks rows with NONE of the three pies;
# False marks rows with at least one of them.
ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull

In [22]:
ate_pies.value_counts()

False    876
True     104
dtype: int64

In [23]:
# So out of the 980 people, there were 876 who ate at least one of the types of pie.

In [24]:
# I am going to analyze the Age column in more depth. In order to analyze the Age column, I will have to 
# first convert it to numeric values. 
# This will make it simple to figure out things like the average age of survey respondents. 

In [25]:
df_thanksgiving['Age'].value_counts()

45 - 59    269
60+        258
30 - 44    235
18 - 29    185
Name: Age, dtype: int64

In [26]:
# You see that the Age column does not have the exact age, therefore I won't be able to extract an exact int value.
# I will instead extract the first age value in the group, for example: 18 - 29 will count as 18.

In [27]:
# This is a function to convert a string to an integer value.
# it will take a specific group age and extract only the first age value and convert it to an integer.
def convert_str_toint(row):
    """Return the first (lowest) age of the row's 'Age' group as an int.

    row: a mapping-like object (e.g. a dataframe row) with an 'Age' entry
         such as '18 - 29' or '60+'.
    Returns None when the 'Age' value is null; otherwise the leading number
    of the group, e.g. '18 - 29' -> 18 and '60+' -> 60.
    """
    label = row['Age']
    if pd.isnull(label):
        return None
    # '60+' has no spaces, so it is cut at the '+'; every other group is cut on whitespace.
    if label == '60+':
        lower_bound = label.split('+')[0]
    else:
        lower_bound = label.split()[0]
    return int(lower_bound)

In [28]:
# Run convert_str_toint on every row of the dataframe and keep the results as int_age.
int_age = df_thanksgiving.apply(convert_str_toint, axis='columns')

In [29]:
int_age[:20]

0     18
1     18
2     18
3     30
4     30
5     18
6     18
7     18
8     30
9     30
11    30
12    18
13    18
14    60
15    30
16    30
17    18
18    30
19    30
20    30
dtype: float64

In [30]:
# adding a new column called int_age to the dataframe
df_thanksgiving['int_age'] = int_age

In [58]:
df_thanksgiving.head(3)

Unnamed: 0,RespondentID,Do you celebrate Thanksgiving?,What is typically the main dish at your Thanksgiving dinner?,What is typically the main dish at your Thanksgiving dinner? - Other (please specify),How is the main dish typically cooked?,How is the main dish typically cooked? - Other (please specify),What kind of stuffing/dressing do you typically have?,What kind of stuffing/dressing do you typically have? - Other (please specify),What type of cranberry saucedo you typically have?,What type of cranberry saucedo you typically have? - Other (please specify),...,Will you shop any Black Friday sales on Thanksgiving Day?,Do you work in retail?,Will you employer make you work on Black Friday?,How would you describe where you live?,Age,What is your gender?,How much total combined money did all members of your HOUSEHOLD earn last year?,US Region,int_age,int_income
0,4337954960,Yes,Turkey,,Baked,,Bread-based,,,,...,No,No,,Suburban,18 - 29,Male,"$75,000 to $99,999",Middle Atlantic,18,75000
1,4337951949,Yes,Turkey,,Baked,,Bread-based,,Other (please specify),Homemade cranberry gelatin ring,...,Yes,No,,Rural,18 - 29,Female,"$50,000 to $74,999",East South Central,18,50000
2,4337935621,Yes,Turkey,,Roasted,,Rice-based,,Homemade,,...,Yes,No,,Suburban,18 - 29,Male,"$0 to $9,999",Mountain,18,0


In [32]:
df_thanksgiving.int_age.describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

In [33]:
# The describe method shows the statistical results of the Age.
# However we cannot say this is completely accurate. Since we didn't have the exact age of the participants
# we extracted just the first age of each group.
# So this is just a rough approximation

In [34]:
# I am going to analyze the "How much total combined money did all members of your HOUSEHOLD earn last year?"
# column in more depth. In order to analyze that column, I will have to first convert it to numeric values. 
# This will allow me to perform statistical calculations.

In [35]:
df_thanksgiving['How much total combined money did all members of your HOUSEHOLD earn last year?'].value_counts()

$25,000 to $49,999      166
$50,000 to $74,999      127
$75,000 to $99,999      127
Prefer not to answer    118
$100,000 to $124,999    109
$200,000 and up          76
$10,000 to $24,999       60
$0 to $9,999             52
$125,000 to $149,999     48
$150,000 to $174,999     38
$175,000 to $199,999     26
Name: How much total combined money did all members of your HOUSEHOLD earn last year?, dtype: int64

In [36]:
# As you see that this Col is very similar to Age col. Does not have the exact income, therefore I am going to
# extract the first value in the group like how I did it to the Age.

In [37]:
# This is the function that will take the group and extract only the first value and convert it to an integer.
def extract_income_toint(row):
    """Return the first dollar amount of the row's household-income group as an int.

    row: a mapping-like object (e.g. a dataframe row) holding the
         'How much total combined money did all members of your HOUSEHOLD earn last year?'
         answer, such as '$25,000 to $49,999' or '$200,000 and up'.
    Returns None when the answer is null or 'Prefer not to answer'; otherwise
    the leading amount with '$' and ',' stripped, e.g. '$25,000 to $49,999' -> 25000.
    """
    bracket = row['How much total combined money did all members of your HOUSEHOLD earn last year?']
    if pd.isnull(bracket) or bracket == 'Prefer not to answer':
        return None
    # The first space-separated token is the lower bound, e.g. '$25,000'.
    lower_bound = bracket.split(" ")[0]
    digits = lower_bound.replace("$", "").replace(",", "")
    return int(digits)

In [38]:
# Run extract_income_toint on every row of the dataframe and keep the results as int_income.
int_income = df_thanksgiving.apply(extract_income_toint, axis='columns')

In [39]:
int_income[:5]

0     75000
1     50000
2         0
3    200000
4    100000
dtype: float64

In [40]:
# adding a new column called int_income to the dataframe
df_thanksgiving['int_income'] = int_income

In [41]:
df_thanksgiving.head(3)

Unnamed: 0,RespondentID,Do you celebrate Thanksgiving?,What is typically the main dish at your Thanksgiving dinner?,What is typically the main dish at your Thanksgiving dinner? - Other (please specify),How is the main dish typically cooked?,How is the main dish typically cooked? - Other (please specify),What kind of stuffing/dressing do you typically have?,What kind of stuffing/dressing do you typically have? - Other (please specify),What type of cranberry saucedo you typically have?,What type of cranberry saucedo you typically have? - Other (please specify),...,Will you shop any Black Friday sales on Thanksgiving Day?,Do you work in retail?,Will you employer make you work on Black Friday?,How would you describe where you live?,Age,What is your gender?,How much total combined money did all members of your HOUSEHOLD earn last year?,US Region,int_age,int_income
0,4337954960,Yes,Turkey,,Baked,,Bread-based,,,,...,No,No,,Suburban,18 - 29,Male,"$75,000 to $99,999",Middle Atlantic,18,75000
1,4337951949,Yes,Turkey,,Baked,,Bread-based,,Other (please specify),Homemade cranberry gelatin ring,...,Yes,No,,Rural,18 - 29,Female,"$50,000 to $74,999",East South Central,18,50000
2,4337935621,Yes,Turkey,,Roasted,,Rice-based,,Homemade,,...,Yes,No,,Suburban,18 - 29,Male,"$0 to $9,999",Mountain,18,0


In [42]:
df_thanksgiving.int_income.describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

In [43]:
# The describe method shows the statistical results of the Column
# However we cannot say this is completely accurate. Since we didn't have the exact income of the participants
# we extracted just the first value of each group.
# So this is just a rough approximation.

In [44]:
# Now I would like to correlate travel distance and income. let's  see how the distance someone travels for 
# Thanksgiving dinner relates to their income.
# We can assume that people who earn less money could be younger and therefore travel to their parents house 
# for Thanksgiving. People earning more are likely to have Thanksgiving at their house.

In [45]:
df_thanksgiving['How far will you travel for Thanksgiving?'].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         396
Thanksgiving is local--it will take place in the town I live in                     276
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    197
Thanksgiving is out of town and far away--I have to drive several hours or fly       82
Name: How far will you travel for Thanksgiving?, dtype: int64

In [46]:
# Out of 951 people, 396 people have Thanksgiving at their house.
# So 555 people travel for Thanksgiving; let's analyze to see if income has any effect on this.

In [47]:
# Boolean mask: True where int_income is below 150000 (rows with a missing int_income compare as False).
less_than_150000 = df_thanksgiving['int_income'].lt(150000)

In [48]:
df_thanksgiving.loc[less_than_150000, 'How far will you travel for Thanksgiving?'].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

In [49]:
# Boolean mask: True where int_income is 150000 or more (rows with a missing int_income compare as False).
over_than_150000 = df_thanksgiving['int_income'].ge(150000)

In [50]:
df_thanksgiving.loc[over_than_150000, 'How far will you travel for Thanksgiving?'].value_counts()

Thanksgiving is happening at my home--I won't travel at all                         66
Thanksgiving is local--it will take place in the town I live in                     34
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    25
Thanksgiving is out of town and far away--I have to drive several hours or fly      15
Name: How far will you travel for Thanksgiving?, dtype: int64

# Findings

In [51]:
# There are a total of 689 people who earn less than $150,000, of whom 408 travel away from their
# home for Thanksgiving.
# There are a total of 140 people who earn $150,000 or more, of whom 66 have Thanksgiving at their
# house.
# So around 60% of the people who earn less than $150,000 travel away from home; this might be because
# younger students who don't have a high income might go home for Thanksgiving.

In [52]:
# I am now going to link friendship and age. There are 2 columns which directly pertain to friendship
# Have you ever tried to meet up with hometown friends on Thanksgiving night?, and 
# Have you ever attended a "Friendsgiving?". We can assume that mostly the younger people will fit more
# into this category, and we already know that 60% don't have Thanksgiving at their house.

In [53]:
# Here I am generating a pivot table showing the average age (int_age) of respondents for
# each combination of answers to the two friendship questions.
# aggfunc is given as the string 'mean' because passing np.mean is deprecated in recent
# pandas versions and emits a FutureWarning; the string form computes the same averages.
df_thanksgiving.pivot_table(
    index='Have you ever tried to meet up with hometown friends on Thanksgiving night?',
    columns='Have you ever attended a "Friendsgiving?"',
    values='int_age',
    aggfunc='mean',
)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


In [54]:
# Here I am generating a pivot table showing the average income (int_income) of respondents
# for each combination of answers to the two friendship questions.
# aggfunc is given as the string 'mean' because passing np.mean is deprecated in recent
# pandas versions and emits a FutureWarning; the string form computes the same averages.
df_thanksgiving.pivot_table(
    index='Have you ever tried to meet up with hometown friends on Thanksgiving night?',
    columns='Have you ever attended a "Friendsgiving?"',
    values='int_income',
    aggfunc='mean',
)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


# Findings

In [55]:
# Results show that respondents who have attended a "Friendsgiving" and have tried to meet up with hometown
# friends on Thanksgiving night have an average age of about 34 and an average income of about $66,000.
# This shows that people who are younger are most likely to attend a Friendsgiving, and try to meet up 
# with friends on Thanksgiving.

# Next Steps

In [56]:
# Here are some potential next steps:

# Figure out the most common dessert people eat.
# Figure out the most common complete meal people eat.
# Identify how many people work on Thanksgiving.
# Find regional patterns in the dinner menus.
# Find age, gender, and income based patterns in dinner menus.