In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def extract_uid_from_href(href):
    """Return the value of the ``uid`` query parameter found in ``href``.

    Args:
        href: Link target to search, e.g. ``"...?uid=abc123"``. May be
            ``None`` or empty, which is what an ``<a>`` tag without an
            ``href`` attribute yields.

    Returns:
        The run of word characters that follows ``uid=``, or ``None`` when
        ``href`` is missing or contains no ``uid=`` parameter.
    """
    # re.search raises TypeError on None, so bail out before matching.
    if not href:
        return None
    match = re.search(r'uid=(\w+)', href)
    return match.group(1) if match else None

def crawl_reviews(recipe_url, recipe_number, timeout=10):
    """Fetch one recipe page and collect the reviews found on it.

    Args:
        recipe_url: Absolute URL of the recipe page.
        recipe_number: Identifier stored verbatim in the 'Recipe Number'
            field of every returned row.
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        A list of dicts with the keys 'Recipe Number', 'Dish Name',
        'Review ID', 'Posting Time', 'Rating' and 'Review Content'. Only
        reviews with non-empty text are included, so the list may be empty.
        ``None`` is returned when the page could not be fetched (network
        error, timeout, or a status code other than 200).
    """
    try:
        response = requests.get(recipe_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure the same way as a non-200 answer so that one
        # unreachable page does not abort the whole crawl.
        print(f"Request failed for {recipe_url}: {exc}")
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # The summary block or its <h3> may be absent; fall back to a placeholder
    # instead of raising AttributeError.
    summary = soup.find('div', class_='view2_summary st3')
    title = summary.find('h3') if summary else None
    dish_name = title.text.strip() if title else "No Dish Name"

    reviews_info = []
    for review in soup.find_all('div', class_='media'):
        # Entries without review text are never recorded, so check that first.
        review_cont = review.find('p', class_='reply_list_cont')
        review_content = review_cont.text.strip() if review_cont else ''
        if not review_content:
            continue

        review_id = None
        media_left = review.find('div', class_='media-left')
        id_link = media_left.find('a') if media_left else None
        href = id_link.get('href') if id_link else None
        if href:  # anchors without an href yield None
            review_id = extract_uid_from_href(href)

        posting_time = None
        media_heading = review.find('h4', class_='media-heading')
        if media_heading:
            # get_text() already strips markup, so the timestamp can be
            # searched for directly in the heading text.
            time_match = re.search(
                r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                media_heading.get_text(),
            )
            if time_match:
                posting_time = time_match.group()

        # The rating is the number of "on" star icons; None when there are none.
        rating_icons = review.find_all('img', src='https://recipe1.ezmember.co.kr/img/mobile/icon_star2_on.png')
        rating = len(rating_icons) if rating_icons else None

        reviews_info.append({
            'Recipe Number': recipe_number,
            'Dish Name': dish_name,
            'Review ID': review_id,
            'Posting Time': posting_time,
            'Rating': rating,
            'Review Content': review_content
        })
    return reviews_info

def crawl_category_reviews(category_url, category_number, max_recipes=60, timeout=10):
    """Walk the paginated listing of one theme and gather its recipes' reviews.

    Listing pages are requested as ``{category_url}&page=N`` starting at
    N=1. Crawling stops when ``max_recipes`` recipes with at least one review
    have been collected, when a listing page cannot be fetched, or when a
    page contributes no recipe link that has not been seen before.

    Args:
        category_url: Listing URL of the theme; ``&page=N`` is appended to it.
        category_number: Used only to build the fallback identifier when
            ``category_url`` carries no ``theme=`` parameter.
        max_recipes: Maximum number of recipes (with reviews) to collect.
        timeout: Seconds to wait for each listing-page request.

    Returns:
        A flat list of the review dicts returned by ``crawl_reviews`` for
        every crawled recipe; empty when nothing was found.
    """
    # Label rows with the theme id that is actually being crawled rather than
    # a hard-coded prefix.
    theme_match = re.search(r'theme=(\d+)', category_url)
    if theme_match:
        recipe_number = theme_match.group(1)
    else:
        recipe_number = f"1010120{category_number:02}"

    reviews_info = []
    recipe_count = 0
    seen_urls = set()

    page = 1
    while recipe_count < max_recipes:
        page_url = f"{category_url}&page={page}"
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request failed for {page_url}: {exc}")
            break
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        new_urls = []
        for link in soup.find_all('a', class_='common_sp_link'):
            href = link.get('href')
            if not href:
                continue
            recipe_url = "https://www.10000recipe.com" + href
            if recipe_url not in seen_urls:
                seen_urls.add(recipe_url)
                new_urls.append(recipe_url)

        # Stop on an empty page, and also when a page only repeats links that
        # were already visited; otherwise the loop could run forever if the
        # server keeps answering out-of-range pages with earlier content.
        if not new_urls:
            break

        for recipe_url in new_urls:
            if recipe_count >= max_recipes:
                break
            recipe_reviews = crawl_reviews(recipe_url, recipe_number)
            # Only recipes that produced at least one review count toward the limit.
            if recipe_reviews:
                reviews_info.extend(recipe_reviews)
                recipe_count += 1

        page += 1

    return reviews_info

def main():
    """Crawl themes 101012001-101012010 and save each one's reviews to Excel.

    For every category number from 1 to 10 the theme listing is crawled with
    ``crawl_category_reviews``. Categories that yield reviews are written to
    ``category_<n>_reviews.xlsx``; for the others a notice is printed.
    """
    for category_number in range(1, 11):
        category_url = f"https://www.10000recipe.com/theme/view.html?theme=1010120{category_number:02}"
        category_reviews = crawl_category_reviews(category_url, category_number)

        # Nothing collected: report it and move on to the next category.
        if not category_reviews:
            print(f"No reviews found for category {category_number}.")
            continue

        excel_file_name = f"category_{category_number}_reviews.xlsx"
        pd.DataFrame(category_reviews).to_excel(excel_file_name, index=False)
        print(f"Reviews for category {category_number} have been successfully crawled and saved to '{excel_file_name}'.")


# Start the crawl when this code runs as the main module.
if __name__ == "__main__":
    main()

Reviews for category 1 have been successfully crawled and saved to 'category_1_reviews.xlsx'.
Reviews for category 2 have been successfully crawled and saved to 'category_2_reviews.xlsx'.
Reviews for category 3 have been successfully crawled and saved to 'category_3_reviews.xlsx'.
Reviews for category 4 have been successfully crawled and saved to 'category_4_reviews.xlsx'.
Reviews for category 5 have been successfully crawled and saved to 'category_5_reviews.xlsx'.
Reviews for category 6 have been successfully crawled and saved to 'category_6_reviews.xlsx'.
Reviews for category 7 have been successfully crawled and saved to 'category_7_reviews.xlsx'.
Reviews for category 8 have been successfully crawled and saved to 'category_8_reviews.xlsx'.
Reviews for category 9 have been successfully crawled and saved to 'category_9_reviews.xlsx'.
Reviews for category 10 have been successfully crawled and saved to 'category_10_reviews.xlsx'.


In [2]:
import pandas as pd

# Per-category workbooks (categories 1, 2, 4 and 8) to merge into one file.
excel_files = [f"category_{number}_reviews.xlsx" for number in (1, 2, 4, 8)]

# Load every workbook, then stack them under a fresh 0..n-1 index.
dfs = [pd.read_excel(path) for path in excel_files]
combined_df = pd.concat(dfs, ignore_index=True)

combined_excel_file_name = "combined_reviews.xlsx"
combined_df.to_excel(combined_excel_file_name, index=False)

print(f"Combined reviews from specified categories have been successfully saved to '{combined_excel_file_name}'.")

Combined reviews from specified categories have been successfully saved to 'combined_reviews.xlsx'.


In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

def extract_uid_from_href(href):
    """Return the value of the ``uid`` query parameter found in ``href``.

    Args:
        href: Link target to search, e.g. ``"...?uid=abc123"``. May be
            ``None`` or empty, which is what an ``<a>`` tag without an
            ``href`` attribute yields.

    Returns:
        The run of word characters that follows ``uid=``, or ``None`` when
        ``href`` is missing or contains no ``uid=`` parameter.
    """
    # re.search raises TypeError on None, so bail out before matching.
    if not href:
        return None
    match = re.search(r'uid=(\w+)', href)
    return match.group(1) if match else None

def crawl_reviews(recipe_url, recipe_number, timeout=10):
    """Fetch one recipe page and collect the reviews found on it.

    Args:
        recipe_url: Absolute URL of the recipe page.
        recipe_number: Identifier stored verbatim in the 'Recipe Number'
            field of every returned row.
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        A list of dicts with the keys 'Recipe Number', 'Dish Name',
        'Review ID', 'Posting Time', 'Rating' and 'Review Content'. Only
        reviews with non-empty text are included, so the list may be empty.
        ``None`` is returned when the page could not be fetched (network
        error, timeout, or a status code other than 200).
    """
    try:
        response = requests.get(recipe_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure the same way as a non-200 answer so that one
        # unreachable page does not abort the whole crawl.
        print(f"Request failed for {recipe_url}: {exc}")
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # The summary block or its <h3> may be absent; fall back to a placeholder
    # instead of raising AttributeError.
    summary = soup.find('div', class_='view2_summary st3')
    title = summary.find('h3') if summary else None
    dish_name = title.text.strip() if title else "No Dish Name"

    reviews_info = []
    for review in soup.find_all('div', class_='media'):
        # Entries without review text are never recorded, so check that first.
        review_cont = review.find('p', class_='reply_list_cont')
        review_content = review_cont.text.strip() if review_cont else ''
        if not review_content:
            continue

        review_id = None
        media_left = review.find('div', class_='media-left')
        id_link = media_left.find('a') if media_left else None
        href = id_link.get('href') if id_link else None
        if href:  # anchors without an href yield None
            review_id = extract_uid_from_href(href)

        posting_time = None
        media_heading = review.find('h4', class_='media-heading')
        if media_heading:
            # get_text() already strips markup, so the timestamp can be
            # searched for directly in the heading text.
            time_match = re.search(
                r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}',
                media_heading.get_text(),
            )
            if time_match:
                posting_time = time_match.group()

        # The rating is the number of "on" star icons; None when there are none.
        rating_icons = review.find_all('img', src='https://recipe1.ezmember.co.kr/img/mobile/icon_star2_on.png')
        rating = len(rating_icons) if rating_icons else None

        reviews_info.append({
            'Recipe Number': recipe_number,
            'Dish Name': dish_name,
            'Review ID': review_id,
            'Posting Time': posting_time,
            'Rating': rating,
            'Review Content': review_content
        })
    return reviews_info

def crawl_category_reviews(category_url, category_number, max_recipes=60, timeout=10):
    """Walk the paginated listing of one theme and gather its recipes' reviews.

    Listing pages are requested as ``{category_url}&page=N`` starting at
    N=1. Crawling stops when ``max_recipes`` recipes with at least one review
    have been collected, when a listing page cannot be fetched, or when a
    page contributes no recipe link that has not been seen before.

    Args:
        category_url: Listing URL of the theme; ``&page=N`` is appended to it.
        category_number: Used only to build the fallback identifier when
            ``category_url`` carries no ``theme=`` parameter.
        max_recipes: Maximum number of recipes (with reviews) to collect.
        timeout: Seconds to wait for each listing-page request.

    Returns:
        A flat list of the review dicts returned by ``crawl_reviews`` for
        every crawled recipe; empty when nothing was found.
    """
    # Label rows with the theme id that is actually being crawled. The
    # previous hard-coded "1010120" prefix did not match the 1010140xx themes
    # this cell requests, so rows were tagged with the wrong theme.
    theme_match = re.search(r'theme=(\d+)', category_url)
    if theme_match:
        recipe_number = theme_match.group(1)
    else:
        recipe_number = f"1010140{category_number:02}"

    reviews_info = []
    recipe_count = 0
    seen_urls = set()

    page = 1
    while recipe_count < max_recipes:
        page_url = f"{category_url}&page={page}"
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request failed for {page_url}: {exc}")
            break
        if response.status_code != 200:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        new_urls = []
        for link in soup.find_all('a', class_='common_sp_link'):
            href = link.get('href')
            if not href:
                continue
            recipe_url = "https://www.10000recipe.com" + href
            if recipe_url not in seen_urls:
                seen_urls.add(recipe_url)
                new_urls.append(recipe_url)

        # Stop on an empty page, and also when a page only repeats links that
        # were already visited; otherwise the loop could run forever if the
        # server keeps answering out-of-range pages with earlier content.
        if not new_urls:
            break

        for recipe_url in new_urls:
            if recipe_count >= max_recipes:
                break
            recipe_reviews = crawl_reviews(recipe_url, recipe_number)
            # Only recipes that produced at least one review count toward the limit.
            if recipe_reviews:
                reviews_info.extend(recipe_reviews)
                recipe_count += 1

        page += 1

    return reviews_info

def main():
    """Crawl themes 101014001-101014011 and save each one's reviews to Excel.

    For every category number from 1 to 11 the theme listing is crawled with
    ``crawl_category_reviews``. Output files and messages are numbered with
    an offset of 10 (11 to 21). Categories that yield reviews are written to
    ``category_<n+10>_reviews.xlsx``; for the others a notice is printed.
    """
    for category_number in range(1, 12):
        category_url = f"https://www.10000recipe.com/theme/view.html?theme=1010140{category_number:02}"
        category_reviews = crawl_category_reviews(category_url, category_number)

        # Nothing collected: report it and move on to the next category.
        if not category_reviews:
            print(f"No reviews found for category {category_number+10}.")
            continue

        excel_file_name = f"category_{category_number+10}_reviews.xlsx"
        pd.DataFrame(category_reviews).to_excel(excel_file_name, index=False)
        print(f"Reviews for category {category_number+10} have been successfully crawled and saved to '{excel_file_name}'.")


# Start the crawl when this code runs as the main module.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 

In [None]:
from pathlib import Path

import pandas as pd

# Workbooks the second crawl may have produced (categories 11 to 21).
excel_files = ["category_{}_reviews.xlsx".format(i) for i in range(11, 22)]

# The crawl writes a workbook only for categories that had reviews, so some
# of these files can legitimately be absent. Skip them instead of failing on
# the first missing one.
missing_files = [name for name in excel_files if not Path(name).exists()]
if missing_files:
    print(f"Skipping missing files: {', '.join(missing_files)}")

dfs = []

for excel_file in excel_files:
    if excel_file in missing_files:
        continue
    df = pd.read_excel(excel_file)
    dfs.append(df)

# pd.concat rejects an empty list with an unhelpful message; fail clearly.
if not dfs:
    raise FileNotFoundError("None of the category 11 to 21 review files exist; run the crawl first.")

combined_df = pd.concat(dfs, ignore_index=True)

combined_excel_file_name = "combined_reviews2.xlsx"
combined_df.to_excel(combined_excel_file_name, index=False)

print(f"Combined reviews from categories 11 to 21 have been successfully saved to '{combined_excel_file_name}'.")

In [None]:
import pandas as pd

# Read the two combined workbooks.
df1, df2 = (
    pd.read_excel(workbook)
    for workbook in ("combined_reviews.xlsx", "combined_reviews2.xlsx")
)

# Stack them under a fresh 0..n-1 index and save the result as one workbook.
merged_df = pd.concat([df1, df2], ignore_index=True)
merged_df.to_excel("merged_reviews.xlsx", index=False)

print("Excel files merged successfully.")
