In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd

def clean_text(text):
    """Collapse all whitespace in *text* into single spaces.

    Newlines, tabs and repeated spaces are treated alike; leading and
    trailing whitespace is dropped, so the result is one trimmed line.
    """
    words = text.split()
    return ' '.join(words)

# Read the existing CSV of previously crawled links BEFORE starting the
# browser: if the file is missing or malformed, read_csv raises here and no
# Chrome process has been launched yet, so nothing is left running.
file_path = 'HasilCrawl.csv'
df_links = pd.read_csv(file_path)

# Initialise the Chrome driver with a fixed window size
driver = webdriver.Chrome()
driver.set_window_size(1300, 800)

# One dict per scraped question summary is appended to this list
questions_data = []

def _text_or(node, default=''):
    """Return the stripped text of a BeautifulSoup node, or *default* if the node is None."""
    return node.get_text(strip=True) if node is not None else default


def _parse_summary(area):
    """Build the record dict for one question-summary element.

    Returns None when the summary has no title link (or the link has no
    href), because neither the title nor the question URL can be derived.
    Every other missing element falls back to a default instead of raising.
    """
    title_link = area.find('a', class_="s-link")
    if title_link is None or not title_link.get('href'):
        return None

    tags = [tag.get_text(strip=True) for tag in area.select('.s-post-summary--meta-tags .post-tag')]

    return {
        'Title': title_link.get_text(strip=True),
        'Excerpt': clean_text(_text_or(area.find('div', class_="s-post-summary--content-excerpt"))),
        'Votes': _text_or(
            area.select_one('.s-post-summary--stats-item__emphasized .s-post-summary--stats-item-number'), '0'),
        'Answers': _text_or(
            area.select_one('.s-post-summary--stats-item:nth-child(2) .s-post-summary--stats-item-number'), '0'),
        'Views': _text_or(
            area.select_one('.s-post-summary--stats-item:nth-child(3) .s-post-summary--stats-item-number'), '0'),
        'Tags': ', '.join(tags),
        'Author': _text_or(area.select_one('.s-user-card--info .flex--item'), 'Anonymous'),
        'Reputation': _text_or(area.select_one('.s-user-card--rep'), '0'),
        'Link': "https://stackoverflow.com" + title_link['href'],
    }


# Everything that drives the browser runs inside try/finally so the Chrome
# process is always shut down, even when a wait times out or parsing fails.
try:
    # Step 1: open the Stack Overflow home page
    driver.get("https://stackoverflow.com/")

    # Step 2: click the "Tags" link
    tags_link = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Tags"))
    )
    tags_link.click()

    # Step 3: click the "Python" tag
    python_tag_link = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.LINK_TEXT, "Python"))
    )
    python_tag_link.click()

    # Step 4: parse the question summaries on the Python tag page
    content = driver.page_source
    soup = BeautifulSoup(content, 'html.parser')

    for area in soup.find_all('div', class_="s-post-summary js-post-summary"):
        question_info = _parse_summary(area)
        if question_info is not None:
            questions_data.append(question_info)

    # Index the summaries by link for O(1) matching. setdefault keeps the
    # first record for a duplicated link, i.e. the one a front-to-back scan
    # of questions_data would find.
    questions_by_link = {}
    for question in questions_data:
        questions_by_link.setdefault(question['Link'], question)

    # Step 5: fetch the full question text for every link from the CSV that
    # corresponds to a scraped summary
    for link in df_links['Link_question']:
        question = questions_by_link.get(link)
        if question is None:
            # No summary record to attach the text to, so loading the page
            # would only produce a result that is thrown away.
            continue

        driver.get(link)
        question_soup = BeautifulSoup(driver.page_source, 'html.parser')

        body = question_soup.find('div', class_="s-prose js-post-body")
        if body is None:
            # The page did not contain a question body element; report it
            # and carry on with the remaining links instead of aborting.
            print(f"Question body not found, skipping: {link}")
            continue

        question['Question_Text'] = clean_text(body.text)
finally:
    # Close the driver once scraping is finished or has failed
    driver.quit()

# Turn the collected records into a DataFrame and write it to CSV,
# leaving out the DataFrame index column
df_questions = pd.DataFrame(questions_data)
output_file = 'HasilScrappingDenganCSV_Selenium.csv'
df_questions.to_csv(path_or_buf=output_file, index=False)
