In [1]:
import pandas as pd
import janitor

# ---------------------------------------------------------------------
# Fetch the raw survey CSV from GitHub and normalise its column names
# to snake_case in a single step (clean_names comes from pyjanitor).
# ---------------------------------------------------------------------
url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-05-14/coffee_survey.csv"
coffee = pd.read_csv(url).clean_names()

# ---------------------------------------------------------------------
# Survey columns to retain. Every column keeps its own name, so the
# mapping is generated as {name: name} from the ordered tuple below.
# ---------------------------------------------------------------------
keep_cols = {
    name: name
    for name in (
        "submission_id",
        "age",
        "cups",
        "where_drink",
        "brew",
        "brew_other",
        "purchase",
        "purchase_other",
        "favorite",
        "favorite_specify",
        "additions",
        "additions_other",
        "dairy",
        "sweetener",
        "style",
        "strength",
        "roast_level",
        "caffeine",
        "expertise",
        "prefer_abc",
        "prefer_ad",
        "prefer_overall",
        "wfh",
        "total_spend",
        "why_drink",
        "why_drink_other",
        "taste",
        "know_source",
        "most_paid",
        "most_willing",
        "value_cafe",
        "spent_equipment",
        "value_equipment",
        "gender",
        "gender_specify",
        "education_level",
        "ethnicity_race",
        "ethnicity_race_specify",
        "employment_status",
        "number_children",
        "political_affiliation",
    )
}

# ---------------------------------------------------------------------
# Tasting columns: every cleaned column whose name starts with
# "coffee_" (coffee_a_*, coffee_b_*, ...), in their original order.
# ---------------------------------------------------------------------
coffee_cols = [name for name in coffee.columns if name.startswith("coffee_")]

# ---------------------------------------------------------------------
# Assemble the final frame: retained survey columns first, then the
# tasting columns. The rename applies the keep_cols mapping.
# ---------------------------------------------------------------------
selected_cols = [*keep_cols, *coffee_cols]

coffee_survey = (
    coffee
    .loc[:, selected_cols]
    .rename(columns=keep_cols)
)

# ---------------------------------------------------------------------
# Write the result to disk (no index column) and confirm on stdout
# ---------------------------------------------------------------------
coffee_survey.to_csv("coffee_survey_cleaned.csv", index=False)

print("Saved: coffee_survey_cleaned.csv")


Saved: coffee_survey_cleaned.csv


In [2]:
# Show the cleaned survey frame as this cell's output.
coffee_survey

Unnamed: 0,submission_id,age,cups,where_drink,brew,brew_other,purchase,purchase_other,favorite,favorite_specify,...,coffee_b_personal_preference,coffee_b_notes,coffee_c_bitterness,coffee_c_acidity,coffee_c_personal_preference,coffee_c_notes,coffee_d_bitterness,coffee_d_acidity,coffee_d_personal_preference,coffee_d_notes
0,gMR29l,18-24 years old,,,,,,,Regular drip coffee,,...,,,,,,,,,,
1,BkPN0e,25-34 years old,,,Pod/capsule machine (e.g. Keurig/Nespresso),,,,Iced coffee,,...,,,,,,,,,,
2,W5G8jj,25-34 years old,,,Bean-to-cup machine,,,,Regular drip coffee,,...,,,,,,,,,,
3,4xWgGr,35-44 years old,,,Coffee brewing machine (e.g. Mr. Coffee),,,,Iced coffee,,...,,,,,,,,,,
4,QD27Q8,25-34 years old,,,Pour over,,,,Latte,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4037,PA44VP,>65 years old,2,At home,Coffee brewing machine (e.g. Mr. Coffee),,,,Regular drip coffee,,...,3.0,,2.0,2.0,4.0,,5.0,5.0,1.0,Sour
4038,vNgpPD,>65 years old,2,At home,Coffee brewing machine (e.g. Mr. Coffee),,,,Regular drip coffee,,...,2.0,,4.0,3.0,2.0,,4.0,3.0,2.0,
4039,g5ggRM,18-24 years old,1,"At a cafe, At home, On the go, At the office","Espresso, Pod/capsule machine (e.g. Keurig/Nes...",Aeropress,"National chain (e.g. Starbucks, Dunkin), Drive...",,Latte,,...,4.0,"Chocolate?, Orange",3.0,1.0,1.0,Licorice,1.0,4.0,5.0,"Apple, Cinnamon"
4040,rlgbDN,25-34 years old,2,At home,Pour over,,,,Iced coffee,,...,3.0,Dark cocoa,2.0,3.0,3.0,Wine,2.0,5.0,2.0,"Kombucha, cider"
