In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Build a Bag-of-Words feature matrix from a sample of the coffee sales data.
#
# Pipeline: load CSV -> keep the leading columns -> drop incomplete rows ->
# cast to str -> draw a seeded sample -> join each row into one text document ->
# count token occurrences -> print a summary -> save the matrix as CSV.

# Tunable settings, kept in one place so the save path and the message that
# reports it cannot drift apart.
INPUT_PATH = "coffee_sales.csv"
OUTPUT_PATH = "coffee_sales_bow_transformed.csv"
N_COLUMNS = 5      # number of leading columns used to build each document
N_SAMPLES = 100    # maximum number of rows to sample
SEED = 42          # fixed seed so the sample is reproducible

# Step 1: Load dataset
df = pd.read_csv(INPUT_PATH)

# Step 2: Select the first N_COLUMNS columns, drop rows with missing values,
# and cast everything to str so the values can be joined as text.
complete_rows = df.iloc[:, :N_COLUMNS].dropna().astype(str)
if complete_rows.empty:
    raise ValueError(
        f"No complete rows in the first {N_COLUMNS} columns of '{INPUT_PATH}'"
    )

# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
df_sample = complete_rows.sample(
    n=min(N_SAMPLES, len(complete_rows)), random_state=SEED
)

# Step 3: Combine the selected columns into one text field per row
combined_text = df_sample.apply(lambda row: " ".join(row.values), axis=1)

# Step 4: Create Bag-of-Words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(combined_text)

# Step 5: Convert to DataFrame for readability (one column per vocabulary term)
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Step 6: Show results
print("\n--- Vocabulary (Features) ---")
print(feature_names)

print("\nShape of Feature Matrix:", bow_df.shape)

print("\n--- Bag-of-Words Numerical Feature Matrix (first 10 docs) ---")
print(bow_df.head(10))

# Step 7: Save to file; the message reports the path that was actually written
bow_df.to_csv(OUTPUT_PATH, index=False)
print(f"\n BoW transformed data saved as '{OUTPUT_PATH}'")



--- Vocabulary (Features) ---
['02' '06' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '20' '21'
 '22' '23' '25' '27' '28' '30' '32' '33' '35' '37' '38' '72' '76' '82'
 '86' '92' '96' 'afternoon' 'americano' 'cappuccino' 'card' 'chocolate'
 'cocoa' 'cortado' 'espresso' 'hot' 'latte' 'milk' 'morning' 'night'
 'with']

Shape of Feature Matrix: (100, 45)

--- Bag-of-Words Numerical Feature Matrix (first 10 docs) ---
   02  06  10  11  12  13  14  15  16  17  ...  chocolate  cocoa  cortado  \
0   0   0   0   0   0   0   0   0   0   0  ...          0      0        0   
1   0   0   0   0   0   0   0   0   0   0  ...          0      0        0   
2   0   0   0   0   0   0   0   0   0   1  ...          0      0        0   
3   0   0   0   0   0   0   1   0   0   0  ...          0      1        0   
4   0   0   0   0   1   0   0   1   0   0  ...          0      0        0   
5   0   0   0   0   0   0   0   0   0   0  ...          0      0        0   
6   1   0   0   0   0   0   0   0   0  