## Data description: 
Q1: From a scale 1 to 5, how complex is it to make this food? (Where 1 is the most simple, and 5 is the most complex)

Q2: How many ingredients would you expect this food item to contain?

Q3: In what setting would you expect this food to be served? Please check all that apply

Q4: How much would you expect to pay for one serving of this food item?

Q5: What movie do you think of when thinking of this food item?

Q6: What drink would you pair with this food item?

Q7: When you think about this food item, who does it remind you of?

Q8: How much hot sauce would you add to this food item?

Label

In [2]:
# Changing to one hot encoding
# Define all possible categories
Q3_categories = ["Week day lunch", "Week day dinner", "Weekend lunch", "Weekend dinner", "At a party", "Late night snack"]
Q6_categories = [
    "soda", "other", "tea", "alcohol", "water", "soup", "juice", "milk", 
    "unknown", "smoothie", "asian alcohol", "asian pop", "milkshake"
]
Q7_categories = ["Parents", "Siblings", "Friends", "Teachers", "Strangers"]
Q8_categories = ["None", "A little (mild)", "A moderate amount (medium)", "A lot (hot)", "I will have some of this food item with my hot sauce"]

# Create binary features for each category
import pandas as pd
import numpy as np

def one_hot_encode(response_str, categories):
    result = np.zeros(len(categories))
    if isinstance(response_str, str):  # Check if it's a string (not NaN)
        selections = response_str.split(',')
        for i, category in enumerate(categories):
            if category in selections:
                result[i] = 1
    return result

In [3]:
# Load data
data = pd.read_csv("data/cleaned_data.csv")
data.describe()

data.isna().sum()

id         0
Q1         0
Q2         1
Q3         0
Q4         6
Q5       183
Q6         0
Q7         2
Q8       746
Label      0
dtype: int64

In [4]:
data = pd.read_csv("data/cleaned_data.csv")

q3 = data["Q3"]
q6 = data["Q6"]
q7 = data["Q7"]
q8 = data["Q8"]

# One hot encode the columns
q3_encoded = np.array([one_hot_encode(response, Q3_categories) for response in q3])
q6_encoded = np.array([one_hot_encode(response, Q6_categories) for response in q6])
q7_encoded = np.array([one_hot_encode(response, Q7_categories) for response in q7])
q8_encoded = np.array([one_hot_encode(response, Q8_categories) for response in q8])

# Create new columns
for i, category in enumerate(Q3_categories):
    data[category] = q3_encoded[:, i]
for i, category in enumerate(Q6_categories):
    data[category] = q6_encoded[:, i]
for i, category in enumerate(Q7_categories):
    data[category] = q7_encoded[:, i]
for i, category in enumerate(Q8_categories):
    data[category] = q8_encoded[:, i]

q8_dummies = pd.get_dummies(data["Q8"], prefix="Q8")
data = pd.concat([data, q8_dummies], axis=1)

In [10]:
# Check the data
data.head()

Unnamed: 0,id,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Label,...,Strangers,None,A little (mild),A moderate amount (medium),A lot (hot),I will have some of this food item with my hot sauce,Q8_A little (mild),Q8_A lot (hot),Q8_A moderate amount (medium),Q8_I will have some of this food item with my hot sauce
0,716549,3,6.0,"Week day lunch,At a party,Late night snack",5,Cloudy With A Chance Of Meatballs,soda,Friends,A little (mild),Pizza,...,0.0,0.0,1.0,0.0,0.0,0.0,True,False,False,False
1,715742,4,2.0,"Week day lunch,At a party,Late night snack",5,,soda,"Friends,Teachers,Strangers",,Pizza,...,1.0,0.0,0.0,0.0,0.0,0.0,False,False,False,False
2,727333,3,5.0,"Week day lunch,Week day dinner,Weekend lunch,W...",10,Action Movie,soda,Friends,A moderate amount (medium),Pizza,...,0.0,0.0,0.0,1.0,0.0,0.0,False,False,True,False
3,606874,4,6.0,"Week day lunch,Week day dinner,Weekend lunch,W...",3,Mamma Mia,other,"Siblings,Friends,Teachers",I will have some of this food item with my hot...,Pizza,...,0.0,0.0,0.0,0.0,0.0,1.0,False,False,False,True
4,505318,2,3.0,"Week day lunch,Week day dinner,Weekend lunch,W...",5,Cloudy With A Chance Of Meatballs,other,"Siblings,Friends",A little (mild),Pizza,...,0.0,0.0,1.0,0.0,0.0,0.0,True,False,False,False


In [12]:
from sklearn.preprocessing import LabelEncoder

df = pd.DataFrame(data)

# Q3

df["Q3"] = df["Q3"].fillna("")
df["Q3_list"] = df["Q3"].apply(lambda x: str(x).split(","))
df["Q3_list_clean"] = df["Q3_list"].apply(
    lambda x: "|".join([i.strip() for i in x if i.strip()])
)

Q3_dummies = df["Q3_list_clean"].str.get_dummies(sep="|")
df = pd.concat([df, Q3_dummies], axis=1)
df.drop(columns=["Q3", "Q3_list", "Q3_list_clean"], inplace=True)


# Q4
if "Q4" in df.columns:
    df["Q4"] = df["Q4"].fillna("$0")  # Replace NaN with "$0"
    df["Q4"] = df["Q4"].str.replace("$", "", regex=False)
    df["Q4"] = pd.to_numeric(df["Q4"], errors="coerce")  # Convert to float

# Q7
if "Q7" in df.columns:
    df["Q7"] = df["Q7"].fillna("")  # Replace NaN with empty string
    df["Q7_list"] = df["Q7"].apply(lambda x: x.split(","))

    # Join each list with '|' so we can split them easily via get_dummies
    df["Q7_joined"] = df["Q7_list"].apply(
        lambda cats: "|".join([c.strip() for c in cats if c.strip()])
    )

    Q7_dummies = df["Q7_joined"].str.get_dummies(sep="|")
    df = pd.concat([df, Q7_dummies], axis=1)
    df.drop(columns=["Q7", "Q7_list", "Q7_joined"], inplace=True)

#Q5, 6, 8
for col in ["Q5", "Q6", "Q8"]:
    if col in df.columns:
        df[col] = df[col].fillna("Unknown")
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))  # Ensure string type before encoding


if "Label" in df.columns:
    df["Label"] = df["Label"].fillna("Unknown")
    label_encoder = LabelEncoder()
    df["Label"] = label_encoder.fit_transform(df["Label"].astype(str))
else:
    print("WARNING: No 'Label' column found. Skipping label encoding.")


print("===== HEAD OF FINAL DATAFRAME =====")
print(df.head())
print("===== DATAFRAME INFO =====")
print(df.info())

if "Label" in df.columns:
    X = df.drop(columns=["Label"])
    y = df["Label"]
    print("Feature matrix shape:", X.shape)
    print("Target vector shape:", y.shape)
else:
    X = df
    y = None
    print("No Label column found. X shape:", X.shape)


df.to_csv("data/multi_hot_encoded_df.csv", index=False)

===== HEAD OF FINAL DATAFRAME =====
       id  Q1   Q2  Q4   Q5  Q6  Q8  Label  Week day lunch  Week day dinner  \
0  716549   3  6.0   5   49  27   0      0             1.0              0.0   
1  715742   4  2.0   5  332  27   4      0             1.0              0.0   
2  727333   3  5.0  10    7  27   2      0             1.0              1.0   
3  606874   4  6.0   3  168  25   3      0             1.0              1.0   
4  505318   2  3.0   5   49  25   0      0             1.0              1.0   

   ...  Late night snack  Week day dinner  Week day lunch  Weekend dinner  \
0  ...                 1                0               1               0   
1  ...                 1                0               1               0   
2  ...                 0                1               1               1   
3  ...                 1                1               1               1   
4  ...                 0                1               1               1   

   Weekend lunch  Friends 