In [None]:
# Title: US_thanksgiving.ipynb
# Desc: Python Intermediate Data Analysis with Pandas project from Dataquest, 
#       helping to practice Series and Dataframe objects.
# Date: 3/8/2017
# Note: thanksgiving.csv contains a survey information on American Thanksgiving
#       celebration. 
#       The dataset came from FiveThirtyEight.

In [4]:
import pandas
data = pandas.read_csv("thanksgiving.csv", encoding="Latin-1")

In [5]:
# displays all column names
# data.columns

In [6]:
# Check how many people said "Yes" when asked if they celebrate Thanksgiving
# Keep only rows on which people said "Yes"
data["Do you celebrate Thanksgiving?"].value_counts()

data = data[data["Do you celebrate Thanksgiving?"] == "Yes"]

In [7]:
# Explore main dishes
data["What is typically the main dish at your Thanksgiving dinner?"].value_counts()

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

In [8]:
tofurkey = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
tofurkey["Do you typically have gravy?"]

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object

In [9]:
# How many people eat at least one pie?
# More people ate at least a pie than turkey
apple_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"].isnull()
pumpkin_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"].isnull()
pecan_isnull = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"].isnull()
ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull
ate_pies.value_counts()

False    876
True     104
dtype: int64

In [10]:
# convert string of age group into an integer
# we are taking the first number of each group (i.e. 18 for 18 - 29)
def age_str_to_int (string):
    if pandas.isnull(string):
        return None
    number = string.split(" ")[0]
    number = number.replace("+", "")
    number = int(number)
    return number

data["int_age"] = data["Age"].apply(age_str_to_int)
data["int_age"].describe()

count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

Findings:

We are taking the first number of each group, so the numbers are probably skewed to the left. When I look at the percentiles, the age distribution looks pretty evenly spread out.

In [13]:
# convert string of income group into an integer
# we are taking the first number of each group (i.e. 18 for 18 - 29)
def inc_str_to_int (string):
    if pandas.isnull(string):
        return None
    number = string.split(" ")[0]
    if number == "Prefer":
        return None
    number = number.replace("$", "")
    number = number.replace(",", "")
    number = int(number)
    return number

data["int_income"] = data["How much total combined money did all members of your HOUSEHOLD earn last year?"].apply(inc_str_to_int)
data["int_income"].describe()

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64

Findings:

Since we are using the first number of each categories again, the numbers are probably skewed to the left. The maximum number is 200,000, so that could skew the results as well. Mean seems high. Standard deviation is big.

In [20]:
# Are income and traveling correlated?
under_150000 = data[data["int_income"] < 150000]
under_150000["How far will you travel for Thanksgiving?"].value_counts()

over_150000 = data[data["int_income"] > 150000]
#over_150000["How far will you travel for Thanksgiving?"].value_counts()
under_150000["How far will you travel for Thanksgiving?"].value_counts()


Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

Findings:

About 41 percent (281/689) of people who earn less than $150,000 will celebrate Thanksgiving at their houses. 
About 48 percent (49/102) of people who earn more than $150,000 will celebrate Thanksgiving at their houses. 
There is some difference but not that much. $150,000 mark might be too high to include the young people with less income that we wanted to test for.

In [21]:
# Are celebrating Thanksgiving with friends and age correlated?
data.pivot_table(index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"', values="int_age")

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


Findings:

Attending a "Friendsgiving" lowers the mean age. 

There is not a big difference between the group that has only met up with hometown friends on Thanksgiving night and the group that has done neither.

There is about 3 years difference, though, between the group that has only attended a "Friendsgiving" and the group that has done both.