<a href="https://colab.research.google.com/github/joyuno/lguplus_project/blob/main/%ED%94%84%EB%A1%9C%EB%AA%A8%EC%85%98_%EC%97%85%EB%AC%B4_%EC%9E%90%EB%8F%99%ED%99%94(%EC%9B%B9%ED%81%AC%EB%A1%A4%EB%A7%81).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException,ElementClickInterceptedException
import time
from google.colab import files
import requests
import re
from bs4 import BeautifulSoup
from io import BytesIO
import pandas as pd
import numpy as np
# Selenium 설정
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)


In [41]:
class TelecomScraperProcessor:
    def __init__(self, driver, site_type: str, search_terms: list = None):
        """
        :param driver: Selenium WebDriver 인스턴스 (단, site_type이 'smatel'인 경우 driver는 사용하지 않습니다)
        :param site_type: 'eyes', 'moyoplan', 'freet', 'smatel', 'eyagi', 'tplus' 중 하나
        :param search_terms: moyoplan, smatel, eyagi, tplus일 경우 검색어 또는 URL 리스트 (eyes, freet는 None)
        """
        self.site_type = site_type.lower()
        # smatel 사이트의 경우 requests를 사용하므로 driver가 필요x
        if self.site_type == "smatel":
            self.driver = None
        else:
            self.driver = driver
        self.search_terms = search_terms if search_terms is not None else []
        self.data_rows = []

    # ---------------- 공통 유틸리티 메서드 ----------------
    @staticmethod
    def extract_digit(s: str) -> str:
        match = re.search(r'(\d+)', s) if s else None
        return match.group(1) if match else ''

    @staticmethod
    def filter_valid_telecom(name):
        valid_names = ['SKT', 'LGU+', 'KT']
        name = name.strip().replace(" ", "")
        for valid in valid_names:
            if valid in name:
                return valid
        return None

    @staticmethod
    def convert_discount(self,value):
        if isinstance(value, str) and value.strip() == '평생':
            return value
        m = re.search(r'(\d+)', str(value))
        if self.site_type == "mobing":
            return int(m.group(1))-1 if m else value
        else:
            return int(m.group(1)) if m else value

    def selenium_scroll_option(self):
        SCROLL_PAUSE_SEC = 2
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(SCROLL_PAUSE_SEC)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def clean_data(self, df):
        df = df.dropna(subset=['데이터무제한속도', '할인개월수'])
        if '통신사이름' in df.columns:
            df['통신사이름'] = df['통신사이름'].apply(lambda x: self.filter_valid_telecom(x) if x else x)
        else:
            df['통신사이름'] = 'LGU+'
        df['데이터크기'] = df['데이터크기'].str.extract(r'(\d+)').astype(int)
        df['데이터무제한속도'] = df['데이터무제한속도'].str.extract(r'(\d+)').astype(int)
        df['할인개월수'] = df['할인개월수'].apply(lambda x: self.convert_discount(self,x))
        return df

    def categorize_data_size(self, row):
        size = row['데이터크기']
        unlimited_speed = row['데이터무제한속도']
        call_supply = row['통화제공량']
        if size == 7 and call_supply == '기본제공':
            return "7"
        elif size == 10 and call_supply == '기본제공':
            return "10"
        elif size == 15:
            return "15"
        elif size in [11, 71] and call_supply == '기본제공':
            return "11 or 71"
        elif (size >= 100 or (size == 5 and unlimited_speed == 5)) and call_supply == '기본제공':
            return "100+"
        else:
            return "Other"

    def set_unlimited_speed(self, row):
        if row['데이터크기_범주'] == "7":
            return 1
        elif row['데이터크기_범주'] == "10":
            return 1
        elif row['데이터크기_범주'] in ["15", "11 or 71"]:
            return 3
        elif row['데이터크기_범주'] == "100+":
            return 5
        else:
            return row['데이터무제한속도']

    def process_dataframe(self, df):
        df['데이터크기_범주'] = df.apply(self.categorize_data_size, axis=1)
        df['데이터무제한속도'] = df.apply(self.set_unlimited_speed, axis=1)
        df['특별조건'] = df.apply(
            lambda row: f"15-{self.extract_digit(str(row['통화제공량']))}"
            if row['데이터크기_범주'] == "15" and any(c.isdigit() for c in str(row['통화제공량']))
            else '기타',
            axis=1
        )
        df['데이터크기_범주'] = df.apply(self.categorize_data_size, axis=1)
        df['프로모션가격'] = pd.to_numeric(df['프로모션가격'], errors='coerce')
        num_col = ['데이터크기', '데이터무제한속도', '원가', '프로모션가격']
        df[num_col] = df[num_col].astype(int)
        df.loc[df['데이터크기_범주'] == '15', '데이터크기_범주'] = df.loc[df['데이터크기_범주'] == '15', '특별조건']
        idx = df.groupby(['통신사이름', '데이터크기_범주'])['프로모션가격'].idxmin()
        df_grouped = df.loc[idx].reset_index(drop=True)
        print(df_grouped)
        df_grouped.drop(columns=['데이터크기', '데이터무제한속도', '특별조건', '통화제공량'], inplace=True)
        con2 = (df_grouped['데이터크기_범주'] != 'Other') & (df_grouped['데이터크기_범주'] != '기타')
        result = df_grouped[con2]
        desired_order = ['7', '10', '15-100', '15-300', '11 or 71', '100+']
        desired_index_order = ['SKT', 'KT', 'LGU+']
        result['데이터크기_범주'] = pd.Categorical(result['데이터크기_범주'],
                                                    categories=desired_order,
                                                    ordered=True)
        pivot_df = result.pivot_table(
            index='통신사이름',
            columns='데이터크기_범주',
            values=['프로모션가격', '할인개월수', '원가'],
            aggfunc='first'
        )
        pivot_df = pivot_df.swaplevel(0, 1, axis=1)
        new_cols = pd.MultiIndex.from_product(
            [desired_order, ['프로모션가격', '할인개월수', '원가']],
            names=['데이터크기_범주', '범주']
        )
        pivot_df = pivot_df.reindex(columns=new_cols)
        pivot_df = pivot_df.reindex(desired_index_order)
        pivot_df = pivot_df.dropna(how='all')
        if pivot_df.empty:
            return pivot_df
        return pivot_df

    def click_buttons(self, css_selector):
        try:
            while True:
                try:
                    button = WebDriverWait(self.driver, 10).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, css_selector))
                    )
                except TimeoutException:
                    print("더 이상 클릭 가능한 버튼이 없습니다.")
                    break
                self.driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(0.5)
                try:
                    button.click()
                except ElementClickInterceptedException as e:
                    print("기본 클릭 실패, JavaScript 클릭 시도:", e)
                    self.driver.execute_script("arguments[0].click();", button)
                print("Clicked button successfully!")
                time.sleep(1)
        except Exception as e:
            print("Error during button click:", e)
    def click_buttons_eyes(self):
        try:
            cnt_label = 0
            total_cnt_label = 1
            while cnt_label != total_cnt_label:
                button = self.driver.find_element(By.CSS_SELECTOR, "button.btn-type3")
                print(f"Found button: {button.text}")
                button.click()
                print("Clicked button successfully!")
                time.sleep(1)
                cnt_label = button.find_element(By.ID, "cnt").text.strip()
                total_cnt_label = button.find_element(By.ID, "totalCnt").text.strip()
                print(f"Current counts: cnt={cnt_label}, totalCnt={total_cnt_label}")
        except Exception as e:
            print("Error during button click:", e)

    # ---------------- eyes 사이트 데이터 추출 ----------------
    def extract_data_eyes(self, soup):
        for best in soup.select("li.best"):
            buttons = best.select("div ul li button[class^='tag on']")
            telecom_names = ", ".join([button.get_text(strip=True) for button in buttons])
            no_button = best.select_one("div ul li button[type='button'][id='no']")
            discount_months = no_button.get_text(strip=True) if no_button else None
            data_size_elem = best.select_one("div.data strong")
            data_size = data_size_elem.get_text(strip=True) if data_size_elem else None
            unlimited_speed_elem = best.select_one("div.data span")
            unlimited_speed = unlimited_speed_elem.get_text(strip=True) if unlimited_speed_elem else None
            provide = best.select_one("ul.provide li.call span")
            call_provision = provide.get_text(strip=True) if provide else None
            origin_price_elem = best.select_one("div.side-box div.origin-price span")
            plan_origin_price = "".join(filter(str.isdigit, origin_price_elem.get_text(strip=True))) if origin_price_elem else None
            price_elem = best.select_one("div.side-box div.price strong")
            plan_price = "".join(filter(str.isdigit, price_elem.get_text(strip=True))) if price_elem else None
            self.data_rows.append({
                "통신사이름": telecom_names,
                "할인개월수": discount_months,
                "데이터크기": data_size,
                "데이터무제한속도": unlimited_speed,
                "통화제공량": call_provision,
                "원가": plan_origin_price,
                "프로모션가격": plan_price
            })
        df = pd.DataFrame(self.data_rows)
        return df

    # ---------------- moyoplan 사이트 데이터 추출 ----------------
    def extract_data_moyoplan(self, soup):
        for best in soup.select("div.css-1vhtfj8.e1b8jytj0"):
            elem = best.select_one("div.css-em89nk.e1b8jytj20")
            data_size_speed = elem.get_text(strip=True) if elem else None
            data_size_match = re.search(r'(\d+)\s*GB', data_size_speed, re.IGNORECASE) if data_size_speed else None
            unlimited_speed_match = re.search(r'(\d+)\s*Mbps', data_size_speed, re.IGNORECASE) if data_size_speed else None
            data_size = data_size_match.group(1) if data_size_match else None
            unlimited_speed = unlimited_speed_match.group(1) if unlimited_speed_match else None
            elem2 = best.select_one("div.css-infalx.e1uyrbxj4")
            discount_months_origin_price = elem2.get_text(strip=True) if elem2 else None
            if discount_months_origin_price:
                match = re.search(r'(\d+)\s*개월.*?([\d,]+)\s*원', discount_months_origin_price)
            else:
                match = None
            if match:
                discount_months = match.group(1)
                plan_origin_price = match.group(2).replace(',', '')
            else:
                discount_months, plan_origin_price = None, None
            elem3 = best.select_one("div.css-1pdnyll.e1b8jytj12")
            call_provision = elem3.get_text(strip=True).replace('통화무제한', '기본제공') if elem3 else None
            elem4 = best.select_one("span.css-1djsysu.e1uyrbxj3")
            plan_price = elem4.get_text(strip=True).replace(',', '') if elem4 else None
            plan_price = self.extract_digit(plan_price)
            self.data_rows.append({
                "할인개월수": discount_months,
                "데이터크기": data_size,
                "데이터무제한속도": unlimited_speed,
                "통화제공량": call_provision,
                "원가": plan_origin_price,
                "프로모션가격": plan_price
            })
        df = pd.DataFrame(self.data_rows)
        return df

    # ---------------- freet 사이트 데이터 추출 ----------------
    def extract_data_freet(self, soup):
        for best in soup.select("li.plan-item.v2"):
            telecom_elem = best.select_one("span[class^='flag-type']")
            telecom_names = telecom_elem.get_text(strip=True) if telecom_elem else None
            elem2 = best.select_one("p.spec")
            discount_months_origin_price = elem2.get_text(strip=True) if elem2 else None
            if discount_months_origin_price:
                match = re.search(r'(\d+)\s*개월.*?([\d,]+)\s*원', discount_months_origin_price)
            else:
                match = None
            if match:
                discount_months = match.group(1)
                plan_origin_price = match.group(2).replace(',', '')
            else:
                discount_months, plan_origin_price = None, None
            title_elem = best.select_one("div.plan-title h3.title")
            data_size = title_elem.get_text(strip=True) if title_elem else None
            if data_size:
                data_size_match = re.search(r'(\d+)\s*GB', data_size, re.IGNORECASE)
                data_size = data_size_match.group(1) if data_size_match else None
            else:
                data_size = None
            desc_elem = best.select_one("div.plan-title p.desc")
            unlimited_speed = desc_elem.get_text(strip=True) if desc_elem else None
            if unlimited_speed:
                unlimited_speed_match = re.search(r'(\d+)\s*Mbps', unlimited_speed, re.IGNORECASE)
                unlimited_speed = unlimited_speed_match.group(1) if unlimited_speed_match else None
            else:
                unlimited_speed = None
            provide = best.select_one("ul.plan-icon-list li")
            call_provision = provide.get_text(strip=True) if provide else None
            price_elem = best.select_one("p.price strong")
            plan_price = "".join(filter(str.isdigit, price_elem.get_text(strip=True))) if price_elem else None
            self.data_rows.append({
                "통신사이름": telecom_names,
                "할인개월수": discount_months,
                "데이터크기": data_size,
                "데이터무제한속도": unlimited_speed,
                "통화제공량": call_provision,
                "원가": plan_origin_price,
                "프로모션가격": plan_price
            })
        df = pd.DataFrame(self.data_rows)
        return df
    # ---------------- mobing 사이트 데이터 추출 ----------------
    def extract_data_mobing(self, soup):
        data_rows = []
        for best in soup.select("div.callplan-list__listbox"):
            # 통신사 이름 추출
            telecom_elem = best.select_one("div[class^='chip-']")
            telecom_names = telecom_elem.get_text(strip=True) if telecom_elem else None

            # 할인개월수 및 원가 추출
            elem2 = best.select_one("div.descript")
            discount_months_origin_price = elem2.get_text(strip=True) if elem2 else None
            print(discount_months_origin_price)
            if discount_months_origin_price:
                match = re.search(r'(\d+)\s*개월.*?([\d,]+)\s*원', discount_months_origin_price)
            else:
                match = None
            if match:
                discount_months = match.group(1)
                print(discount_months)
                plan_origin_price = match.group(2).replace(',', '')
                print(plan_origin_price)
            else:
                discount_months, plan_origin_price = None, None

            # 데이터 크기 및 무제한 속도 추출
            elem = best.select_one("div.data")
            if elem:
                data_size_speed = elem.get_text(strip=True)
                data_size_match = re.search(r'(\d+)\s*GB', data_size_speed, re.IGNORECASE)
                data_size = data_size_match.group(1) if data_size_match else None
                unlimited_speed_match = re.search(r'(\d+)\s*Mbps', data_size_speed, re.IGNORECASE)
                unlimited_speed = unlimited_speed_match.group(1) if unlimited_speed_match else None
                print(data_size, unlimited_speed)
            else:
                data_size, unlimited_speed = None, None

            # 통화제공량 추출
            provide = best.select_one("div.voice")
            call_provision = provide.get_text(strip=True) if provide else None
            if call_provision and '기본제공' in call_provision:
                call_provision = '기본제공'
            print(call_provision)

            # 프로모션가격 추출
            price = best.select_one("div.sum strong")
            plan_price = "".join(filter(str.isdigit, price.get_text(strip=True))) if price else None

            data_rows.append({
                "통신사이름": telecom_names,
                "할인개월수": discount_months,
                "데이터크기": data_size,
                "데이터무제한속도": unlimited_speed,
                "통화제공량": call_provision,
                "원가": plan_origin_price,
                "프로모션가격": plan_price
            })
        df_mobing = pd.DataFrame(data_rows)
        return df_mobing
    # ---------------- smatel 사이트 데이터 추출 ----------------
    def extract_data_smatel(self, soup):
        self.data_rows = []
        for i in range(1, 5):
            smatel_url = f"https://smartel.kr/phoneplan?bundledServices=%5B%5D&page={i}&mno=&sort=낮은+가격순&data=%5B%227~15%22%2C%2215~100%22%2C%22100~999%22%5D&network_type=LTE"
            response = requests.get(smatel_url, allow_redirects=False)
            if response.status_code in (301, 302, 303, 307, 308):
                print(f"Redirection detected: {response.status_code} -> {response.headers.get('Location')}")
            else:
                print(f"Page {i} loaded successfully with status {response.status_code}")
            time.sleep(2)
            page_soup = BeautifulSoup(response.text, 'html.parser')
            print(page_soup.get_text()[:300])
            for best in page_soup.select("a[class^='mb-4 hidden items-center justify-between rounded']"):
                telecom_elem = best.select_one("span[class^='undefined font-medium inline-block select-none rounded']")
                telecom_names = telecom_elem.get_text(strip=True) if telecom_elem else None
                elem = best.select_one("p.text-txt-02")
                data_size_speed = elem.get_text(strip=True) if elem else None
                data_size_match = re.search(r'(\d+)\s*GB', data_size_speed, re.IGNORECASE) if data_size_speed else None
                data_size = data_size_match.group(1) if data_size_match else None
                elem_1 = best.select_one("p.text-txt-06")
                print(elem_1)
                unlimited_speed = elem_1.get_text(strip=True) if elem_1 else None
                unlimited_speed_match = re.search(r'(\d+)\s*Mbps', unlimited_speed, re.IGNORECASE) if unlimited_speed else None
                unlimited_speed = unlimited_speed_match.group(1) if unlimited_speed_match else None
                print(data_size, unlimited_speed)
                elem2 = best.select_one("p.text-txt-03")
                print(elem2)
                discount_months_origin_price = elem2.get_text(strip=True) if elem2 else None
                if discount_months_origin_price:
                    match = re.search(r'(\d+)\s*개월.*?([\d,]+)\s*원', discount_months_origin_price, re.DOTALL)
                else:
                    match = None
                if match:
                    discount_months = match.group(1)
                    plan_origin_price = match.group(2).replace(',', '')
                else:
                    discount_months, plan_origin_price = None, None
                print(discount_months, plan_origin_price)
                elem3 = best.select_one("p[class^='mr-2 text-21 font-normal']")
                call_provision = elem3.get_text(strip=True).replace('기본 제공', '기본제공') if elem3 else None
                print(call_provision)
                elem4 = best.select_one("p.text-28")
                plan_price = elem4.get_text(strip=True).replace(',', '') if elem4 else None
                plan_price = self.extract_digit(plan_price)
                self.data_rows.append({
                    "통신사이름": telecom_names,
                    "할인개월수": discount_months,
                    "데이터크기": data_size,
                    "데이터무제한속도": unlimited_speed,
                    "통화제공량": call_provision,
                    "원가":  plan_origin_price,
                    "프로모션가격": plan_price
                })
        df_smatel = pd.DataFrame(self.data_rows)
        return df_smatel

    # ---------------- eyagi 사이트 데이터 추출 ----------------
    def extract_data_eyagi_site(self):
        self.data_rows = []
        telecom_eyagi = self.search_terms if self.search_terms else ['skt', 'kt', 'lgt']
        for telecom in telecom_eyagi:
            eyagi_url = f"https://www.eyagi.co.kr/shop/plan/list.php?tag={telecom}"
            self.driver.get(eyagi_url)
            self.selenium_scroll_option()
            self.click_buttons("div.i-btn-more")
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            print(soup.get_text()[:300])
            for best in soup.select("div.plan-list"):
                telecom_elem = best.select_one("span[class^='badge mno']")
                telecom_names = telecom_elem.get_text(strip=True) if telecom_elem else None
                elem = best.select_one("div.data p.free")
                data_size_speed = elem.get_text(strip=True) if elem else None
                data_size_match = re.search(r'(\d+)\s*GB', data_size_speed, re.IGNORECASE) if data_size_speed else None
                data_size = data_size_match.group(1) if data_size_match else None
                elem_1 = best.select_one("p.qos")
                unlimited_speed = elem_1.get_text(strip=True) if elem_1 else None
                unlimited_speed_match = re.search(r'(\d+)\s*Mbps', unlimited_speed, re.IGNORECASE) if unlimited_speed else None
                unlimited_speed = unlimited_speed_match.group(1) if unlimited_speed_match else None
                elem2 = best.select_one("p.orgin-price")
                discount_months_origin_price = elem2.get_text(strip=True) if elem2 else '평생'
                if discount_months_origin_price:
                    match = re.search(r'(\d+)\s*개월.*?([\d,]+)\s*원', discount_months_origin_price)
                else:
                    match = None
                if match and hasattr(match, "group"):
                    discount_months = match.group(1)
                    plan_origin_price = match.group(2).replace(',', '')
                else:
                    discount_months, plan_origin_price = '평생', '0'
                elem3 = best.select_one("div.sms p.free")
                call_provision = elem3.get_text(strip=True).replace('기본 제공', '기본제공') if elem3 else None
                elem4 = best.select_one("p.current-price")
                plan_price = elem4.get_text(strip=True).replace(',', '') if elem4 else None
                plan_price = self.extract_digit(plan_price)
                self.data_rows.append({
                    "통신사이름": telecom_names,
                    "할인개월수": discount_months,
                    "데이터크기": data_size,
                    "데이터무제한속도": unlimited_speed,
                    "통화제공량": call_provision,
                    "원가": plan_origin_price,
                    "프로모션가격": plan_price
                })
        df_eyagi = pd.DataFrame(self.data_rows)
        self.driver.quit()
        return df_eyagi

    # ---------------- tplus 사이트 데이터 추출 ----------------
    def extract_data_tplus(self, soup):
        self.data_rows = []
        tplus_url = 'https://www.tplusmobile.com/main/rate/join'
        self.driver.get(tplus_url)
        self.selenium_scroll_option()
        self.click_buttons("div.i-btn-more")
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.quit()
        for best in soup.select("div.plan-list"):
            telecom_elem = best.select_one("div.plan-service span")
            telecom_names = telecom_elem.get_text(strip=True) if telecom_elem else None
            elem = best.select_one("p.plan-data")
            data_size_speed = elem.get_text(strip=True) if elem else None
            data_size_match = re.search(r'(\d+)\s*GB', data_size_speed, re.IGNORECASE) if data_size_speed else None
            data_size = data_size_match.group(1) if data_size_match else None
            unlimited_speed_match = re.search(r'(\d+)\s*Mbps', data_size_speed, re.IGNORECASE) if data_size_speed else None
            unlimited_speed = unlimited_speed_match.group(1) if unlimited_speed_match else None
            elem2 = best.select_one("p.plan-af-price")
            discount_months_origin_price = elem2.get_text(strip=True) if elem2 else '평생'
            if discount_months_origin_price:
                match = re.search(r'(\d+)\s*개월.*?([\d,]+)\s*원', discount_months_origin_price)
            else:
                match = None
            if match and hasattr(match, "group"):
                discount_months = match.group(1)
                plan_origin_price = match.group(2).replace(',', '')
            else:
                discount_months, plan_origin_price = '평생', '0'
            elem3 = best.select_one("p.plan-call span")
            call_provision = elem3.get_text(strip=True).replace('기본 제공', '기본제공') if elem3 else None
            elem4 = best.select_one("span.t-color")
            plan_price = elem4.get_text(strip=True).replace(',', '') if elem4 else None
            plan_price = self.extract_digit(plan_price)
            self.data_rows.append({
                "통신사이름": telecom_names,
                "할인개월수": discount_months,
                "데이터크기": data_size,
                "데이터무제한속도": unlimited_speed,
                "통화제공량": call_provision,
                "원가": plan_origin_price,
                "프로모션가격": plan_price
            })
        df_tplus = pd.DataFrame(self.data_rows)
        return df_tplus

    # ---------------- 실행 메서드 ----------------
    def run_eyes(self):
        self.driver.get("https://eyes.co.kr/payplan/info2")
        self.click_target_link()
        self.click_buttons_eyes()
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.quit()
        df = self.extract_data_eyes(soup)
        df = self.clean_data(df)
        pivot_df = self.process_dataframe(df)
        return pivot_df

    def run_moyoplan(self):
        for term in self.search_terms:
            self.data_rows = []
            moyo_url_demo = f'https://www.moyoplan.com/plans?speedWhenExhausted=5000-9999%2C1000-2999%2C3000-4999&page=1&searchText={term}'
            self.driver.get(moyo_url_demo)
            self.selenium_scroll_option()
            pages = self.driver.find_elements(By.CSS_SELECTOR, "button.tw-w-40")
            num_pages = len(pages)
            for i in range(1, num_pages + 1):
                moyo_url = f'https://www.moyoplan.com/plans?speedWhenExhausted=5000-9999%2C1000-2999%2C3000-4999&page={i}&searchText={term}'
                self.driver.get(moyo_url)
                time.sleep(3)
                soup = BeautifulSoup(self.driver.page_source, 'html.parser')
                self.extract_data_moyoplan(soup)
            globals()[f"df_{term}"] = pd.DataFrame(self.data_rows)
        dfs = [self.process_dataframe(self.clean_data(globals()[f"df_{term}"])) for term in self.search_terms if f"df_{term}" in globals()]
        combined_df = pd.concat(dfs, axis=0, ignore_index=True)
        self.driver.quit()
        return combined_df

    def run_freet(self):
        freet_url = 'https://www.freet.co.kr/plan/ratePlan'
        driver.get(freet_url)
        self.click_buttons("a.btn-type3")
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.quit()
        df = self.extract_data_freet(soup)
        df = self.clean_data(df)
        pivot_df_freet = self.process_dataframe(df)
        return pivot_df_freet
    def run_mobing(self):
        mobing_url = 'https://www.mobing.co.kr/product/plan/promotion'
        self.driver.get(mobing_url)
        self.selenium_scroll_option()
        self.click_buttons("div.i-btn-more")
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        self.driver.quit()
        df_mobing = self.extract_data_mobing(soup)
        df_mobing = self.clean_data(df_mobing)
        pivot_df_mobing = self.process_dataframe(df_mobing)
        return pivot_df_mobing
    def run_smatel(self):
        df = self.extract_data_smatel(None)
        df = self.clean_data(df)
        pivot_df_smatel = self.process_dataframe(df)
        return pivot_df_smatel

    def run_eyagi(self):
        df = self.extract_data_eyagi_site()
        df = self.clean_data(df)
        pivot_df_eyagi = self.process_dataframe(df)
        return pivot_df_eyagi

    def run_tplus(self):
        df = self.extract_data_tplus(None)
        df = self.clean_data(df)
        pivot_df_tplus = self.process_dataframe(df)
        return pivot_df_tplus

    def run(self):
        if self.site_type == "eyes":
            return self.run_eyes()
        elif self.site_type == "moyoplan":
            return self.run_moyoplan()
        elif self.site_type == "freet":
            return self.run_freet()
        elif self.site_type == "mobing":
            return self.run_mobing()
        elif self.site_type == "smatel":
            return self.run_smatel()
        elif self.site_type == "eyagi":
            return self.run_eyagi()
        elif self.site_type == "tplus":
            return self.run_tplus()
        else:
            raise ValueError("지원하지 않는 site_type입니다. 'eyes', 'moyoplan', 'freet','mobing','smatel', 'eyagi' 또는 'tplus'를 선택하세요.")

    def click_target_link(self):
        try:
            print("Waiting for target link (eyes)...")
            target_link = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#wrap > main > div.content > div.payplan-main > ul > li:nth-child(11) > a")
                )
            )
            print("Found target link:", target_link.text)
            target_link.click()
            print("Target link clicked successfully!")
            time.sleep(3)
        except Exception as e:
            print("Error in click_target_link:", e)


In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

# 1. eyes 사이트 실행
processor_eyes = TelecomScraperProcessor(driver, site_type="eyes")
pivot_df_eyes = processor_eyes.run()
print("----- eyes -----")
print(pivot_df_eyes)
print("----- moyoplan -----")
moyo_list = ['인스','슈가','마블','KG','미니','고고']
driver = webdriver.Chrome(options=chrome_options)
processor_moyoplan = TelecomScraperProcessor(driver, site_type="moyoplan", search_terms=moyo_list)
pivot_df_moyoplan = processor_moyoplan.run()
print("----- moyoplan -----")
print(pivot_df_moyoplan)
print("----- freet -----")
# 3. freet 사이트 실행
driver = webdriver.Chrome(options=chrome_options)
processor_freet = TelecomScraperProcessor(driver, site_type="freet")
pivot_df_freet = processor_freet.run()
print("----- freet -----")
print(pivot_df_freet)
print("----- mobing -----")
driver = webdriver.Chrome(options=chrome_options)
processor_mobing = TelecomScraperProcessor(driver, site_type="mobing")
pivot_df_mobing = processor_mobing.run()
print("----- mobing -----")
print(pivot_df_mobing)
print("----- smatel -----")
# 4. smatel 사이트 실행
driver = webdriver.Chrome(options=chrome_options)
processor_smatel = TelecomScraperProcessor(None, site_type="smatel")
pivot_df_smatel = processor_smatel.run()
print("----- smatel -----")
print(pivot_df_smatel)
print("----- eyagi -----")
# 5. eyagi 사이트 실행 (검색어 또는 태그 리스트를 전달)
driver = webdriver.Chrome(options=chrome_options)
processor_eyagi = TelecomScraperProcessor(driver, site_type="eyagi", search_terms=["skt", "kt", "lgt"])
pivot_df_eyagi = processor_eyagi.run()
print("----- eyagi -----")
print(pivot_df_eyagi)
print("----- tplus -----")
# 6. tplus 사이트 실행
driver = webdriver.Chrome(options=chrome_options)
processor_tplus = TelecomScraperProcessor(driver, site_type="tplus")
pivot_df_tplus = processor_tplus.run()
print("----- tplus -----")
print(pivot_df_tplus)

Waiting for target link (eyes)...
Found target link: 아이즈모바일 요금제 전체보기
Target link clicked successfully!
Found button: 더보기 (
10
/
173
)
Clicked button successfully!
Current counts: cnt=20, totalCnt=173
Found button: 더보기 (
20
/
173
)
Clicked button successfully!
Current counts: cnt=30, totalCnt=173
Found button: 더보기 (
30
/
173
)
Clicked button successfully!
Current counts: cnt=40, totalCnt=173
Found button: 더보기 (
40
/
173
)
Clicked button successfully!
Current counts: cnt=50, totalCnt=173
Found button: 더보기 (
50
/
173
)
Clicked button successfully!
Current counts: cnt=60, totalCnt=173
Found button: 더보기 (
60
/
173
)
Clicked button successfully!
Current counts: cnt=70, totalCnt=173
Found button: 더보기 (
70
/
173
)
Clicked button successfully!
Current counts: cnt=80, totalCnt=173
Found button: 더보기 (
80
/
173
)
Clicked button successfully!
Current counts: cnt=90, totalCnt=173
Found button: 더보기 (
90
/
173
)
Clicked button successfully!
Current counts: cnt=100, totalCnt=173
Found button: 더보기 (
100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['통신사이름'] = df['통신사이름'].apply(lambda x: self.filter_valid_telecom(x) if x else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['데이터크기'] = df['데이터크기'].str.extract(r'(\d+)').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['데이터무제한속도'] = df['데이터무제한속도'].str.extract(r'(\d+)').astype(

   통신사이름  할인개월수  데이터크기  데이터무제한속도 통화제공량     원가  프로모션가격  데이터크기_범주    특별조건
0     KT      7     10         1  기본제공  31900   11000        10      기타
1     KT      7    100         5  기본제공  55000   25080      100+      기타
2     KT      7     11         3  기본제공  48400   20020  11 or 71      기타
3     KT      7     15         3  100분  39600   15070    15-100  15-100
4     KT      7     15         3  300분  41800   17050    15-300  15-300
5     KT      7      7         1  기본제공  26400    8030         7      기타
6     KT      7     15         3  기본제공  34100   13090        기타      기타
7   LGU+     36     10         1  기본제공  27500   18700        10      기타
8   LGU+     36      5         5  기본제공  53000   38700      100+      기타
9   LGU+     12     11         3  기본제공  44000   29700  11 or 71      기타
10  LGU+     12     15         3  100분  33900   21800    15-100  15-100
11  LGU+     36     15         3  300분  36900   27000    15-300  15-300
12  LGU+     36      7         1  기본제공  23900   15100         7 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['통신사이름'] = 'LGU+'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['데이터크기'] = df['데이터크기'].str.extract(r'(\d+)').astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['데이터무제한속도'] = df['데이터무제한속도'].str.extract(r'(\d+)').astype(int)
A value is trying to be set on a copy of a slice from a Da

   할인개월수  데이터크기  데이터무제한속도   통화제공량     원가  프로모션가격 통신사이름  데이터크기_범주    특별조건
0      7      5         5    기본제공  43700   26000  LGU+      100+      기타
1      7     11         3    기본제공  35700   20000  LGU+  11 or 71      기타
2      7     15         3  통화100분  29600   13000  LGU+    15-100  15-100
3      7      7         1    기본제공  18370    8000  LGU+         7      기타
4      7      5         1  통화200분  14300    6050  LGU+     Other      기타
   할인개월수  데이터크기  데이터무제한속도   통화제공량     원가  프로모션가격 통신사이름  데이터크기_범주    특별조건
0      7     10         1    기본제공  23970   11100  LGU+        10      기타
1      7      5         5    기본제공  48000   25010  LGU+      100+      기타
2      7     11         3    기본제공  40900   20000  LGU+  11 or 71      기타
3      7     15         3  통화100분  34490   13040  LGU+    15-100  15-100
4      7     15         3  통화300분  34000   15080  LGU+    15-300  15-300
5      7      7         1    기본제공  21980    8010  LGU+         7      기타
6      6     12         1    기본제공  41300   24800  L

KeyError: ['데이터무제한속도', '할인개월수']

#실행코드


In [19]:
pivot_df_moyoplan.index = ['inskorea', 'sugar', 'marvel', 'KG', 'mini', 'gogo']
combined_pivot_df = pd.concat(
    [pivot_df_eyes, pivot_df_freet,pivot_df_mobing,pivot_df_smatel,pivot_df_eyagi, pivot_df_tplus, pivot_df_moyoplan],
    keys=['eyes','freet','mobing','smatel','eyagi', 'tplus','moyoplan'],
    names=['site']
)
combined_pivot_df

Unnamed: 0_level_0,데이터크기_범주,7,7,7,10,10,10,15-100,15-100,15-100,15-300,15-300,15-300,11 or 71,11 or 71,11 or 71,100+,100+,100+
Unnamed: 0_level_1,범주,프로모션가격,할인개월수,원가,프로모션가격,할인개월수,원가,프로모션가격,할인개월수,원가,프로모션가격,할인개월수,원가,프로모션가격,할인개월수,원가,프로모션가격,할인개월수,원가
site,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
eyes,SKT,8000.0,7.0,23900.0,11000.0,7.0,35200.0,15000.0,7,33900.0,17000.0,7,36300.0,20000.0,7.0,44000.0,28000.0,7.0,53000.0
eyes,KT,8030.0,7.0,26400.0,11000.0,7.0,31900.0,15070.0,7,39600.0,17050.0,7,41800.0,20020.0,7.0,48400.0,25080.0,7.0,55000.0
eyes,LGU+,15100.0,36.0,23900.0,18700.0,36.0,27500.0,21800.0,12,33900.0,27000.0,36,36900.0,29700.0,12.0,44000.0,38700.0,36.0,53000.0
freet,SKT,8030.0,7.0,29700.0,11000.0,7.0,34100.0,15070.0,7.0,37400.0,,,,20020.0,7.0,44990.0,28030.0,7.0,53000.0
freet,KT,8030.0,7.0,29700.0,11000.0,7.0,34650.0,15070.0,7.0,37400.0,17050.0,7.0,39600.0,20020.0,7.0,44990.0,25080.0,7.0,55000.0
freet,LGU+,8800.0,7.0,29700.0,11990.0,7.0,34100.0,13750.0,12.0,36520.0,19250.0,24.0,39600.0,22330.0,12.0,43230.0,27720.0,24.0,52800.0
smatel,SKT,8000.0,7.0,24200.0,11000.0,7.0,31900.0,15000.0,7.0,33000.0,17000.0,7.0,34100.0,20000.0,7.0,40700.0,28000.0,7.0,47300.0
smatel,KT,8000.0,7.0,24200.0,11000.0,7.0,28600.0,15000.0,7.0,33000.0,,,,20000.0,7.0,40700.0,25000.0,7.0,47300.0
smatel,LGU+,14300.0,12.0,19800.0,11000.0,7.0,29040.0,20900.0,12.0,27500.0,23100.0,12.0,29700.0,28600.0,12.0,36300.0,34300.0,12.0,45300.0
eyagi,SKT,8000.0,7.0,24200.0,11000.0,7.0,37400.0,15000.0,7,31900.0,17600.0,7,36300.0,20000.0,7.0,39600.0,30800.0,7.0,49500.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [26]:
import smtplib
from email.message import EmailMessage
import pandas as pd
from datetime import datetime

today = datetime.now().strftime("%m%d")
excel_filename = f"프로모션_{today}.xlsx"


combined_pivot_df.to_excel(excel_filename, index=True)
file_path = '/content/drive/MyDrive/promotion/자동메일발송비밀번호.txt'
with open(file_path, 'r', encoding='utf-8') as f:
    password = f.read()

my_email = "joy981017@gmail.com"           # 발신자 이메일 주소
receiver_email = "ho98@lguplus.co.kr"     # 수신자 이메일 주소


msg = EmailMessage()
msg['Subject'] = f"프로모션_{today}"
msg['From'] = my_email
msg['To'] = receiver_email
msg.set_content("안녕하세요,\n\n25년 {today} 프로모션 현황입니다.")


with open(excel_filename, "rb") as f:
    file_data = f.read()
    msg.add_attachment(file_data,
                       maintype="application",
                       subtype="vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                       filename=excel_filename)


with smtplib.SMTP("smtp.gmail.com", 587) as connection:
    connection.starttls()
    connection.login(user=my_email, password=password)
    connection.send_message(msg)

print("이메일이 성공적으로 발송되었습니다.")


이메일이 성공적으로 발송되었습니다.
