# Import Libraries

In [76]:
!pip install pandas



In [77]:
!pip install scikit-learn



In [None]:
!pip install 

In [78]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

# Combine dataset from scrape dataset

In [79]:
# Daftar path file CSV beserta label kategorinya
file_paths = {
    r"raw-dataset\Scrapcookpad(cleanfood).csv": "cleanfood",
    r"raw-dataset\Scrapcookpad(diet).csv": "diet",
    r"raw-dataset\Scrapcookpad(rendahkalori).csv": "rendahkalori",
    r"raw-dataset\Scrapcookpad(sehat).csv": "sehat"
}

In [80]:
# Membaca, memberi label, dan menggabungkan semua file CSV
dataframes = []
for path, label in file_paths.items():
    df = pd.read_csv(path)
    df['kategori'] = label  # Menambahkan kolom kategori
    dataframes.append(df)

In [81]:
# Gabungkan semua dataframe
combined_df = pd.concat(dataframes, ignore_index=True)

In [82]:
# Hapus duplikat berdasarkan judul dan url (jika ada)
combined_df.drop_duplicates(subset=['Judul', 'URL'], inplace=True)

In [83]:
output_path = r"recipes.csv"
combined_df.to_csv(output_path, index=False)

In [84]:
print(f"Jumlah baris: {combined_df.shape[0]}")
print(f"Jumlah kolom: {combined_df.shape[1]}")

Jumlah baris: 11934
Jumlah kolom: 5


# Preprocessing

In [85]:
df = pd.read_csv('recipes.csv')
df.head()

Unnamed: 0,Judul,URL,Bahan,Langkah,kategori
0,Steak Ayam Lemon Madu (Eat Clean Series),https://cookpad.com/id/resep/1419994-steak-aya...,1/2 kg daging dada ayam tanpa tulang; 1/2 buah...,1\n \n\n\n\nKucuri daging ayam dengan sedik...,cleanfood
1,Bala bala Oatmeal Clean eating,https://cookpad.com/id/resep/14034644-bala-bal...,4 sdm oatmeal instan; 1 buah wortel iris korek...,1\n \n\n\n\nCampur semua bahan jadi satu ta...,cleanfood
2,Chicken steak (menu diet/clean eating),https://cookpad.com/id/resep/9595903-chicken-s...,200 gr ayam kampung fillet; Brokoli; Wortel; S...,1\n \n\n\n\nRebus ayam sebentar sampai sete...,cleanfood
3,Clean Eating Aini'Cooking #2,https://cookpad.com/id/resep/23944711-clean-ea...,1 buah wortel kecil potong dadu; 1 buah jagung...,1\n \n\n\n\nSiapkan semua siapkan semua bah...,cleanfood
4,Clean Eating Aini'Cooking #4,https://cookpad.com/id/resep/23961936-clean-ea...,Secukupnya ayam yang telah dimarinasi semalama...,1\n \n\n\n\nRebus wortel dan brokoli lalu t...,cleanfood


In [86]:
# Langkah jadi list, hapus newline dan strip
def clean_steps(raw_text):
    # Pisahkan berdasarkan angka di awal langkah (1\n, 2\n, dst.)
    steps = re.split(r'\n?\s*\d+\s*\n+', raw_text)
    # Bersihkan spasi kosong dan newline
    cleaned = [re.sub(r'\s+', ' ', step).strip() for step in steps if step.strip()]
    return cleaned

df['list_langkah'] = df['Langkah'].apply(clean_steps)

In [87]:
# Bahan jadi list
df['list_bahan'] = df['Bahan'].str.split(';').apply(lambda x: [b.strip() for b in x if b.strip()])

def clean_bahan_list(bahan_list):
    hasil = []
    for bahan in bahan_list:
        # Hilangkan kuantitas dan satuan (angka, kg, sdm, buah, siung, dst.)
        bahan = re.sub(r'\b(\d+/\d+|\d+)(\s*(kg|gr|g|sdm|sdt|ml|buah|butir|iris|sepotong|secukupnya|siung|lembar|potong|cup|sendok|liter)?)?\b', '', bahan, flags=re.IGNORECASE)
        
        # Hilangkan kata penghubung/deskriptif umum
        bahan = re.sub(r'\b(diperlukan|selera|jadi|buang|skip|yang|telah|dengan|iris|cemplung|utuh|lalu|bila|sesuai|potong|kecil|marinate|dimarinasi|disini|kecilskip|sesuaikan|irispotongutuh|semalaman|secukupnya|tanpa|untuk|halus|kukus|dipotong|dicincang|optional|sedikit)\b', '', bahan, flags=re.IGNORECASE)

        # Hapus simbol dan whitespace berlebih
        bahan = re.sub(r'[^\w\s]', '', bahan)  # hilangkan tanda baca
        bahan = bahan.strip()

        # Ambil 1–2 kata pertama yang paling mungkin relevan
        kata = bahan.split()
        if kata:
            hasil.append(' '.join(kata[:2]))
    return hasil

df['label_bahan'] = df['list_bahan'].apply(clean_bahan_list)

In [None]:
import spacy

nlp = spacy.load("xx_ent_wiki_sm")  # model multilingual, atau pakai "id_core_news_sm" jika tersedia

def extract_bahan_utama(text):
    doc = nlp(text)
    return [chunk.text.lower() for chunk in doc.noun_chunks if len(chunk.text.strip()) > 1]

all_bahan = []
for row in df['label_bahan']:
    for item in row:
        if isinstance(item, dict):
            all_bahan.extend(extract_bahan_utama(item['bahan']))
        elif isinstance(item, str):
            all_bahan.extend(extract_bahan_utama(item))

# Hitung frekuensi noun phrase
counter = Counter(all_bahan)
top_noun_phrases = counter.most_common(100)

print(top_noun_phrases)


In [None]:
# from collections import Counter

# all_bahan = []
# for row in df['label_bahan']:
#     for item in row:
#         print(f"str: ", item)
#         all_bahan.append(item)  # fallback

# # print(all_bahan)  

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(all_bahan)

# word_freq = X.sum(axis=0).A1
# words = vectorizer.get_feature_names_out()
# freq_dict = dict(zip(words, word_freq))

# # Top 100 kata paling umum
# top_words = sorted(freq_dict.items(), key=lambda x: x[1], reverse=True)[:200]
# top_keywords = set([word for word, _ in top_words])

# print(top_keywords)

str:  daging dada
str:  perasan jeruk
str:  Bumbu
str:  perasan jeruk
str:  minyak zaitun
str:  Madu
str:  garam
str:  lada putih
str:  bawang bombay
str:  bawang putih
str:  Sayuran Pelengkap
str:  tomat
str:  daun lettuce
str:  kentang
str:  oatmeal instan
str:  wortel korek
str:  kol
str:  batang daun
str:  Bumbu
str:  bawang putih
str:  Merica
str:  Garam
str:  ayam kampung
str:  Brokoli
str:  Wortel
str:  Slada
str:  Kol merah
str:  Mayonaise
str:  Garam
str:  Lada
str:  Margarin
str:  wortel dadu
str:  jagung serut
str:  tomat bagian
str:  timun
str:  sawi putih
str:  telor rebus
str:  Kewpie wijen
str:  ayam
str:  Brokoli
str:  Wortel
str:  Bawang merah
str:  bawang putih
str:  cabe rawit
str:  Blue band
str:  Bumbu marinasi
str:  saus tiram
str:  lada bubuk
str:  kecap manis
str:  saus sambal
str:  bawang bombai
str:  totole
str:  aku tambahin
str:  Dada Ayam
str:  Paprika
str:  bawang putih
str:  bawang merah
str:  bawang bombay
str:  Olive oil
str:  saos Tiram
str:  Garam him

In [89]:
def extract_keywords(bahan):
    words = bahan.lower().split()
    keywords = [w for w in words if w in top_keywords]
    return ' '.join(keywords[:2]) if keywords else 'lainnya'

# Ubah setiap list bahan menjadi list kata kunci
def clean_bahan_auto(bahan_list):
    return [extract_keywords(bahan) for bahan in bahan_list]

# Terapkan
df['clean_label_bahan'] = df['list_bahan'].apply(clean_bahan_auto)


In [90]:
# Peta metode masak beserta sinonimnya
metode_map = {
    'panggang': r'panggang|dipanggang|oven|grill|bakar',
    'rebus': r'rebus|mendidih|didihkan|dididihkan',
    'kukus': r'kukus|steam|dikukus',
    'goreng': r'goreng|digoreng|deep fry|fried|orak arik',
    'tumis': r'tumis|saute|ditumis|menumis',
    'ungkep': r'ungkep|ungkepan',
    'masak': r'masak|memasak|dimasak|panaskan|dimasukkan ke dalam wajan',
    'simpan dingin': r'simpan di kulkas|dinginkan|overnight|semalaman|kulkas',
    'blender': r'blender|haluskan|dihaluskan|lumatkan|blend',
    'campur': r'campur|aduk|tuang|mix|diaduk',
}

def ekstrak_metode_memasak(langkah_list):
    hasil = set()

    for langkah in langkah_list:
        langkah_lower = langkah.lower()

        # Matching keyword
        for metode, pattern in metode_map.items():
            if re.search(pattern, langkah_lower):
                hasil.add(metode)

        # Rule-based tambahan (jika perlu tambahkan di sini)
        if 'air mendidih' in langkah_lower and 'masukkan' in langkah_lower:
            hasil.add('rebus')
        if 'tutup dan simpan' in langkah_lower and 'kulkas' in langkah_lower:
            hasil.add('simpan dingin')
    
    return list(hasil)

df['metode_memasak'] = df['list_langkah'].apply(ekstrak_metode_memasak)

In [91]:
# Hapus kolom yang tidak dibutuhkan
df.drop(columns=['URL', 'Bahan', 'Langkah'], inplace=True)

# Hapus baris yang mengandung data kosong (NaN)
df.dropna(inplace=True)

# Buat mask: True untuk baris dengan metode_memasak kosong (list kosong)
kosong_mask = df['metode_memasak'].apply(lambda x: isinstance(x, list) and len(x) == 0)

# Tampilkan baris yang akan dihapus (opsional, untuk cek)
baris_kosong = df[kosong_mask]
print("Jumlah baris dengan metode_memasak kosong:", len(baris_kosong))

# Hapus baris tersebut dari df
df = df[~kosong_mask].reset_index(drop=True)

# Cek hasil
df.head()

Jumlah baris dengan metode_memasak kosong: 237


Unnamed: 0,Judul,kategori,list_langkah,list_bahan,label_bahan,clean_label_bahan,metode_memasak
0,Steak Ayam Lemon Madu (Eat Clean Series),cleanfood,"[Kucuri daging ayam dengan sedikit air lemon, ...","[1/2 kg daging dada ayam tanpa tulang, 1/2 bua...","[daging dada, perasan jeruk, Bumbu, perasan je...","[daging dada, buah perasan, bumbu, buah perasa...","[campur, panggang, masak]"
1,Bala bala Oatmeal Clean eating,cleanfood,[Campur semua bahan jadi satu tambahkan air se...,"[4 sdm oatmeal instan, 1 buah wortel iris kore...","[oatmeal instan, wortel korek, kol, batang dau...","[oatmeal instan, buah wortel, kol, batang daun...",[campur]
2,Chicken steak (menu diet/clean eating),cleanfood,"[Rebus ayam sebentar sampai setengah matang, t...","[200 gr ayam kampung fillet, Brokoli, Wortel, ...","[ayam kampung, Brokoli, Wortel, Slada, Kol mer...","[ayam fillet, brokoli, wortel, lainnya, kol me...","[rebus, kukus, masak]"
3,Clean Eating Aini'Cooking #2,cleanfood,"[Siapkan semua siapkan semua bahan, cuci bersi...","[1 buah wortel kecil potong dadu, 1 buah jagun...","[wortel dadu, jagung serut, tomat bagian, timu...","[buah wortel, buah jagung, buah tomat, buah ti...",[rebus]
4,Clean Eating Aini'Cooking #4,cleanfood,"[Rebus wortel dan brokoli lalu tiriskan. |, Pa...",[Secukupnya ayam yang telah dimarinasi semalam...,"[ayam, Brokoli, Wortel, Bawang merah, bawang p...","[ayam, brokoli, wortel, bawang merah, bawang p...","[rebus, tumis, masak]"


In [92]:
df.to_csv('recipes_cleaned.csv', index=False)

# EDA

In [93]:
# Jumlah resep per kategori
print(df['clean_label_bahan'].value_counts())

# Frekuensi metode memasak
from collections import Counter
recipe = df['clean_label_bahan'].explode()
print(Counter(recipe))

clean_label_bahan
[buah, buah pir, buah perasan, buah, water kefir]                                                                                                                                 2
[labu, air]                                                                                                                                                                       2
[buah jeruk, sachet, air, es batu]                                                                                                                                                2
[kacang]                                                                                                                                                                          2
[buah nanas, air]                                                                                                                                                                 2
                                                                                  