In [49]:
from bs4 import BeautifulSoup
import pandas as pd
import re
from PIL import Image
import uuid 
import os
import gc
import time
import numpy as np

In [50]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Import Main Dataset

In [51]:
# Location of the main dataset CSV; the merged result is written back to this same path later on.
file_path = './dataset/dataset.csv'

# Read the existing dataset into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer=file_path)

# Scrape Seasonal Data

In [52]:
def _parse_seasonal_card(card, year, season):
    """Turn one anime card of the seasonal page into a flat record dict.

    Args:
        card: Parsed element for a single anime (one ``div.js-anime-type-all`` node).
        year: Season year; only used to build the ``premiered`` label.
        season: Season name; only used to build the ``premiered`` label.

    Returns:
        dict: One dataset row. Fields the seasonal page does not supply here
        (``favorite``, ``rank``, ``type``, ``producer``, ``rating``,
        ``count_user_score``) are filled with placeholder values.

    Raises:
        AttributeError: If a required element (title, link, score, members,
            synopsis, image) is missing from the card.
        ValueError: If the members or score text is not numeric.
    """
    link = card.find('a', class_='link-title').get('href')
    # The id is the 5th "/"-separated piece of the title link.
    mal_id = link.split('/')[4]

    title = card.find('span', class_='js-title').text
    score = card.find('span', class_='js-score').text
    member = card.find('span', class_='js-members').text
    synopsis = card.find('p', class_='preline').text
    # NOTE(review): this reads the href of the anchor inside div.image, not an
    # <img> source -- confirm this really is the image URL.
    image = card.find('div', class_='image').a['href']
    genres = ','.join(g.a.text.strip() for g in card.find_all('span', class_='genre'))

    # The first span.item of the info line is used as the aired date; what is
    # left of the info text after removing it is split into "episode,duration".
    aired = el.text.strip() if (el := card.select_one('div.prodsrc div.info span.item')) else ''
    info = card.select_one('div.prodsrc div.info')
    info_text = info.text if info is not None else ''
    remainder = info_text.replace(aired, "").replace("\n", "").replace(" ", "")
    # Padding with empty strings guarantees two values even when a part is absent.
    episode, duration = (remainder.split(',') + ["", ""])[:2]

    # Collect the captioned property blocks as caption -> comma-separated items.
    # Joining unconditionally yields "" for a caption with no items (previously
    # an IndexError) and a plain string for multi-valued properties.
    properties = {}
    for prop in card.find_all('div', class_='property'):
        caption = prop.find('span', class_='caption')
        if caption is None:
            continue
        items = [item.get_text(strip=True) for item in prop.find_all('span', class_='item')]
        properties[caption.text.strip()] = ','.join(items)

    # Captions are looked up in both plural and singular form.
    studio = properties.get('Studios') or properties.get('Studio') or ""
    source = properties.get('Sources') or properties.get('Source') or ""
    themes = properties.get('Themes') or properties.get('Theme') or ""
    demographic = properties.get('Demographics') or properties.get('Demographic') or ""

    return {
        'id': str(uuid.uuid4()),
        'mal_id': int(mal_id),
        'title': title,
        'image_url': image,
        'synopsis': synopsis,
        'aired': aired,
        'premiered': f"{season} {year}",
        'member': int(member),
        'favorite': 0,
        'source': source,
        'rank': '',
        'link': link,
        'episode': episode,
        'type': '',
        'genre': genres,
        'producer': '',
        'studio': studio,
        'theme': themes,
        'demographic': demographic,
        'duration': duration,
        'rating': '',
        'score': float(score),
        'count_user_score': float(0),
    }


def get_update_seasonal(year, season):
    """Scrape the MyAnimeList seasonal page for ``year``/``season`` into a DataFrame.

    A headless Chrome session loads
    ``https://myanimelist.net/anime/season/{year}/{season}``, waits up to 10
    seconds for the element with id ``content``, and the resulting HTML is
    parsed into one row per ``div.js-anime-type-all`` card.

    Args:
        year: Season year, interpolated into the URL (e.g. ``'2026'``).
        season: Season name, interpolated into the URL (e.g. ``'spring'``).

    Returns:
        pandas.DataFrame: One row per anime card; empty when no cards are found.

    Raises:
        Exception: Any error raised while loading the page or waiting for the
            content element is propagated after the browser has been closed.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run Chrome in headless mode
    options.add_argument("--disable-blink-features=AutomationControlled")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(f"https://myanimelist.net/anime/season/{year}/{season}")

        # Wait until the element with id "content" is present before reading the HTML.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "content"))
        )
        page = driver.page_source
    finally:
        # Always release the browser, on success as well as on failure.
        driver.quit()

    soup = BeautifulSoup(page, 'html.parser')
    cards = soup.find_all('div', class_='js-anime-type-all')

    data = [_parse_seasonal_card(card, year, season) for card in cards]

    return pd.DataFrame(data)

In [53]:
# Scrape the spring 2026 seasonal listing; the result holds one row per anime found.
df_updated = get_update_seasonal(year='2026', season='spring')

# Merge the Scraped Updates into the Main Dataset

In [54]:
# Columns whose stored value must survive a refresh. The scraper only fills
# them with placeholders ('' / 0) or, for 'id', with a newly generated uuid,
# so for anime already in the dataset the existing value is kept.
preserve_columns = ['id', 'favorite', 'rank', 'type', 'producer', 'rating', 'count_user_score']

existing_mal_ids = dataset['mal_id'].unique()

if df_updated.empty:
    # Nothing was scraped (the frame has no 'mal_id' column in that case):
    # leave the dataset and the CSV file untouched.
    dataset_filtered = dataset
else:
    for col in preserve_columns:
        # Skip columns missing on either side instead of raising KeyError.
        if col in df_updated.columns and col in dataset.columns:
            old_values = dataset.set_index('mal_id')[col].to_dict()

            # Known mal_id -> keep the stored value; new mal_id -> keep the scraped one.
            df_updated[col] = [
                old_values.get(mal_id, scraped)
                for mal_id, scraped in zip(df_updated['mal_id'], df_updated[col])
            ]

    # Replace the rows being refreshed with their updated versions and append new anime.
    dataset_filtered = dataset[~dataset['mal_id'].isin(df_updated['mal_id'])]
    dataset = pd.concat([dataset_filtered, df_updated], ignore_index=True)

    # Write the merged dataset back over the source CSV.
    dataset.to_csv(file_path, index=False)