In [None]:
import csv
import time
import random
import os
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout

# ==================================================
# CONFIG - adjust as needed
# ==================================================
INPUT_CSV = "google_links_all_pages.csv"   # must contain a 'url' column
OUTPUT_CSV = "quora_opinions.csv"
HEADLESS = False            # keep the browser window visible so the login/cookie state can be seen
WAIT_AFTER_ACTION = 10.0    # wait time after clicking XPATH_2 (seconds)
# (min, max) bounds in seconds; a random delay in this range is slept between URLs.
DELAY_BETWEEN_URLS = (1.0, 2.5)
SCROLL_WAIT = 10.0          # wait after each scroll (seconds)
ANSWERS_CLICK_WAIT = 5.0    # wait after clicking an "answers" button (seconds)

# Paste the complete cookie string below (it must be the full value, not truncated).
# NOTE(review): these look like live session credentials committed in source —
# consider loading them from an environment variable or an untracked file.
COOKIE_STRING = (
    "m-login=1; m-b=5f0I6CqTwhUnrA3TViu5kg==; m-b_lax=5f0I6CqTwhUnrA3TViu5kg==; "
    "m-b_strict=5f0I6CqTwhUnrA3TViu5kg==; m-s=iN1d8p2XVQlb_9HNhWBOIg==; m-uid=3109495667; "
    "m-ql10n_id=https%3A%2F%2Fqsbr.cf2.quoracdn.net%2F-4-l10n_main-30-id-5296391d0f939c4c.translation.json; "
    "m-theme=dark; m-dynamicFontSize=regular; m-themeStrategy=auto;"
)

# ==================================================
# XPaths and CSS selectors
# ==================================================
# Any <button> whose text contains "all related"; translate() lowercases A-Z so
# the match is case-insensitive.
XPATH_1 = "//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'all related')]"
# A <div> carrying the three listed classes whose text contains "answers"
# (case-insensitive, same translate() trick).
XPATH_2 = "//div[contains(@class, 'q-text') and contains(@class, 'qu-dynamicFontSize--small') and contains(@class, 'qu-color--gray_dark') and contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'answers')]"

# Elements clicked to expand truncated answers.
READ_MORE_SPAN_SELECTOR = ".qt_read_more"
# Container whose inner text is saved as one "opinion".
OPINION_DIV_SELECTOR = "div.q-box.spacing_log_answer_content.puppeteer_test_answer_content"

# The two literals are concatenated with no separator, so this is a single
# compound selector: one element that carries every listed class.
ANSWERS_CSS = (
    ".q-click-wrapper.c1nud10e.qu-display--inline-block.qu-tapHighlight--white"
    ".qu-cursor--pointer.qu-hover--textDecoration--underline"
)

# ==================================================
# Fungsi Utilitas
# ==================================================
def parse_cookie_string(cookie_string, domain=".quora.com"):
    """Turn a ``name=value; name=value`` cookie header into a list of cookie dicts.

    Args:
        cookie_string: Semicolon-separated ``name=value`` pairs.
        domain: Value stored under the ``"domain"`` key of every cookie.

    Returns:
        A list of dicts with the keys ``name``, ``value``, ``domain`` and
        ``path`` (always ``"/"``). Blank fragments and fragments without an
        ``=`` are dropped; only the first ``=`` separates name from value.
    """
    def _as_cookie(fragment):
        # partition() splits on the first '=' only, so values may contain '='.
        key, _, raw_value = fragment.partition("=")
        return {
            "name": key.strip(),
            "value": raw_value.strip(),
            "domain": domain,
            "path": "/",
        }

    fragments = (piece.strip() for piece in cookie_string.split(";"))
    return [_as_cookie(frag) for frag in fragments if frag and "=" in frag]

def read_input_urls(input_csv):
    """Read the non-empty values of the CSV's URL column.

    The URL column is matched case-insensitively (ignoring surrounding
    whitespace in the header), and the column name actually present in the
    file is used for the row lookup, so any capitalisation that passes
    validation is also read.

    Args:
        input_csv: Path to a CSV file with a header row.

    Returns:
        A list of stripped, non-empty URL strings in file order.

    Raises:
        FileNotFoundError: If ``input_csv`` does not exist.
        ValueError: If the file has no header or no ``url`` column.
    """
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"Input file not found: {input_csv}")

    urls = []
    # utf-8-sig drops a leading BOM (common in Excel exports) and reads plain
    # UTF-8 unchanged; without it the first header would be "\ufeffurl".
    with open(input_csv, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # fieldnames is None for an empty file.
        headers = reader.fieldnames or []
        url_columns = [h for h in headers if h and h.strip().lower() == "url"]
        if not url_columns:
            raise ValueError("Input CSV must contain 'url' column")
        for row in reader:
            # Take the first URL-like column that has a value in this row.
            for column in url_columns:
                value = (row.get(column) or "").strip()
                if value:
                    urls.append(value)
                    break
    return urls

def write_rows_append(output_csv, rows):
    """Append ``url``/``opinion`` rows to a CSV, writing the header when needed.

    Args:
        output_csv: Path of the CSV file to append to (created if missing).
        rows: Iterable of dicts that each have ``"url"`` and ``"opinion"`` keys.

    The header row is written when the file does not exist yet or exists but
    is empty (e.g. a pre-created or truncated file), so the output always
    starts with a header.
    """
    needs_header = (
        not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
    )
    with open(output_csv, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["url", "opinion"])
        writer.writerows([r["url"], r["opinion"]] for r in rows)

def safe_click_locator(page, locator_str, by_xpath=False, timeout=5000):
    """Wait for the first match of a selector and click it, never raising.

    Args:
        page: Object exposing ``locator(selector)``.
        locator_str: Selector text; prefixed with ``xpath=`` when ``by_xpath``.
        by_xpath: Treat ``locator_str`` as an XPath expression.
        timeout: Passed as ``timeout`` to both the wait and the click.

    Returns:
        True if both the wait and the click completed, False if any step
        raised an exception.
    """
    try:
        selector = f"xpath={locator_str}" if by_xpath else locator_str
        matches = page.locator(selector)
        matches.first.wait_for(timeout=timeout)
        matches.first.click(timeout=timeout)
    except Exception:
        return False
    return True

def scroll_to_bottom(page):
    """Scroll the window to the bottom of the document, then nudge it 50px up and back down.

    The two scripts are evaluated in order via ``page.evaluate``.
    NOTE(review): the up/down nudge presumably exists to trigger lazy-loading
    scroll handlers — confirm.
    """
    scripts = (
        "window.scrollTo(0, document.body.scrollHeight);",
        """() => { window.scrollBy(0, -50); window.scrollBy(0, 50); }""",
    )
    for script in scripts:
        page.evaluate(script)

def count_opinions_on_page(page):
    """Return how many elements currently match ``OPINION_DIV_SELECTOR``.

    Any exception raised while querying the page is swallowed and reported
    as a count of 0.
    """
    try:
        matches = page.locator(OPINION_DIV_SELECTOR)
        total = matches.count()
    except Exception:
        total = 0
    return total

def find_and_click_answers_buttons(page):
    """Find elements matching ``ANSWERS_CSS`` and click the first whose text contains 'answers'.

    The text comparison is done on the stripped, lower-cased inner text.

    Returns:
        True if a matching element was scrolled into view and clicked;
        False if none matched or any step raised an exception.
    """
    try:
        candidates = page.locator(ANSWERS_CSS)
        for position in range(candidates.count()):
            candidate = candidates.nth(position)
            label = candidate.inner_text(timeout=2000).strip().lower()
            if "answers" not in label:
                continue
            candidate.scroll_into_view_if_needed(timeout=3000)
            candidate.click(timeout=5000)
            return True
    except Exception:
        return False
    return False

def expand_all_read_more(page):
    """Click every element matching ``READ_MORE_SPAN_SELECTOR``, best effort.

    The match count is taken once up front; each element is scrolled into
    view, clicked, and followed by a random 0.3-0.9 s pause. The first
    exception ends the whole pass silently, leaving later elements unclicked.
    """
    try:
        toggles = page.locator(READ_MORE_SPAN_SELECTOR)
        for position in range(toggles.count()):
            toggle = toggles.nth(position)
            toggle.scroll_into_view_if_needed(timeout=3000)
            toggle.click(timeout=3000)
            pause = 0.3 + random.random() * 0.6
            time.sleep(pause)
    except Exception:
        return

def extract_all_opinions(page):
    """Collect the stripped inner text of every ``OPINION_DIV_SELECTOR`` match.

    Returns:
        A list of non-empty strings in document order. If an exception occurs
        part-way through, the texts gathered so far are returned.
    """
    collected = []
    try:
        blocks = page.locator(OPINION_DIV_SELECTOR)
        for position in range(blocks.count()):
            body = blocks.nth(position).inner_text(timeout=5000).strip()
            if not body:
                continue
            collected.append(body)
    except Exception:
        pass
    return collected

# ==================================================
# Fungsi Utama
# ==================================================
# Maximum number of page reloads attempted while waiting for XPATH_2 to
# become clickable; previously this retried forever.
_MAX_XPATH2_RELOADS = 3
# Maximum number of consecutive 'answers' clicks that reveal no additional
# opinions before the loading loop gives up; previously unbounded.
_MAX_FRUITLESS_ANSWER_CLICKS = 2


def _apply_answer_filters(page, url):
    """Click XPATH_1 (optional) then XPATH_2 (required), reloading a bounded number of times.

    Returns:
        True once XPATH_2 was clicked; False if it never became clickable
        within ``_MAX_XPATH2_RELOADS`` reloads or a reload failed.
    """
    for attempt in range(_MAX_XPATH2_RELOADS + 1):
        if safe_click_locator(page, XPATH_1, by_xpath=True, timeout=3000):
            print("  clicked XPATH_1")
        else:
            print("  XPATH_1 not found / skipped")
        time.sleep(1)
        if safe_click_locator(page, XPATH_2, by_xpath=True, timeout=3000):
            print("  clicked XPATH_2")
            return True
        if attempt == _MAX_XPATH2_RELOADS:
            break
        print("  XPATH_2 not found / retrying page reload...")
        try:
            page.goto(url)
        except Exception as e:
            # A failed reload must not abort the whole run.
            print("  ! Error reloading url:", e)
            return False
    print(f"  XPATH_2 not found after {_MAX_XPATH2_RELOADS} reloads; skipping URL.")
    return False


def _load_all_opinions(page):
    """Scroll and click 'answers' buttons until no further opinions appear."""
    prev_count = count_opinions_on_page(page)
    print(f"  initial opinion count: {prev_count}")

    fruitless_clicks = 0
    while True:
        scroll_to_bottom(page)
        print(f"  scrolled to bottom, waiting {SCROLL_WAIT:.1f}s...")
        time.sleep(SCROLL_WAIT)

        cur_count = count_opinions_on_page(page)
        print(f"  opinion count after scroll: {cur_count}")

        if cur_count > prev_count:
            print("  opinions increased; continue scrolling...")
            prev_count = cur_count
            fruitless_clicks = 0
            time.sleep(0.5 + random.random() * 0.8)
            continue

        print("  no increase after scroll; trying 'answers' buttons...")
        # The button may remain on the page after being clicked; cap clicks
        # that load nothing so this loop always terminates.
        if (fruitless_clicks < _MAX_FRUITLESS_ANSWER_CLICKS
                and find_and_click_answers_buttons(page)):
            print(f"  clicked 'answers' button. waiting {ANSWERS_CLICK_WAIT:.1f}s...")
            time.sleep(ANSWERS_CLICK_WAIT)
            count_after_click = count_opinions_on_page(page)
            if count_after_click > prev_count:
                fruitless_clicks = 0
            else:
                fruitless_clicks += 1
            prev_count = count_after_click
            continue

        print("  no more opinions/buttons found.")
        break


def run():
    """Scrape opinions from every URL in ``INPUT_CSV`` and append them to ``OUTPUT_CSV``.

    For each URL the page is opened in Firefox with the configured cookies,
    the XPATH_1/XPATH_2 elements are clicked, the page is scrolled until no
    more opinions load, 'read more' spans are expanded, and each opinion's
    text is appended as one CSV row. URLs that fail to load, or on which
    XPATH_2 never becomes clickable, are skipped.

    Raises:
        FileNotFoundError, ValueError: Propagated from ``read_input_urls``.
    """
    urls = read_input_urls(INPUT_CSV)
    print(f"Found {len(urls)} URLs in {INPUT_CSV}")

    cookies = parse_cookie_string(COOKIE_STRING)
    print(f"Parsed {len(cookies)} cookies (domain .quora.com).")

    with sync_playwright() as p:
        browser = p.firefox.launch(headless=HEADLESS)
        context = None
        try:
            # NOTE(review): a Chrome user-agent string is sent from a Firefox
            # browser — confirm this mismatch is intentional.
            context = browser.new_context(
                user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/120.0.0.0 Safari/537.36"),
                viewport={"width": 1280, "height": 900},
            )

            try:
                context.add_cookies(cookies)
                print("Cookies added to browser context.")
            except Exception as e:
                print("Warning: failed to add cookies to context:", e)

            page = context.new_page()
            time.sleep(1.0)

            total_saved = 0
            for idx, url in enumerate(urls, start=1):
                print(f"\n[{idx}/{len(urls)}] Opening: {url}")
                try:
                    page.goto(url, timeout=30000)
                except Exception as e:
                    print("  ! Error loading url:", e)
                    continue

                if not _apply_answer_filters(page, url):
                    continue

                print(f"  waiting {WAIT_AFTER_ACTION:.1f}s after XPATH clicks...")
                time.sleep(WAIT_AFTER_ACTION)

                _load_all_opinions(page)

                print("  expanding all 'read more' spans...")
                expand_all_read_more(page)
                time.sleep(0.8 + random.random() * 0.6)

                opinions = extract_all_opinions(page)
                print(f"  extracted {len(opinions)} opinions from page.")

                if opinions:
                    rows = [{"url": url, "opinion": op} for op in opinions]
                    write_rows_append(OUTPUT_CSV, rows)
                    total_saved += len(rows)
                    print(f"  saved {len(rows)} rows to {OUTPUT_CSV}.")
                else:
                    print("  no opinions found on this page.")

                wait = random.uniform(*DELAY_BETWEEN_URLS)
                print(f"  waiting {wait:.2f}s before next URL...")
                time.sleep(wait)

            print("\nDone. Total opinions saved:", total_saved)
        finally:
            # Close the browser even when scraping fails part-way through.
            try:
                if context is not None:
                    context.close()
                browser.close()
            except Exception:
                pass


# ==================================================
# Entry Point
# ==================================================
# Start the scraper only when this module is the program entry point.
if __name__ == "__main__":
    run()


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string

# === Download NLTK data ===
print("Downloading NLTK data...")
# nltk.download() returns False on failure; collect failures instead of
# ignoring them so a missing resource is visible before tokenizing starts.
_failed_downloads = [
    resource
    for resource in ('stopwords', 'punkt')
    if not nltk.download(resource, quiet=True)
]
# punkt_tab is only fetched when it is not already installed.
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    if not nltk.download('punkt_tab', quiet=True):
        _failed_downloads.append('punkt_tab')
if _failed_downloads:
    print(f"⚠ Gagal mengunduh data NLTK: {', '.join(_failed_downloads)}\n")
else:
    print("✓ Download selesai!\n")

# === Fungsi Preprocessing ===
# Patterns and the punctuation table are built once instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Filled on first use so importing this code does not require NLTK data.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return ``(stop_words, stemmer)``, loading them on the first call only."""
    if not _NLP_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        # Populate in one step so a failed load never leaves a partial cache.
        _NLP_RESOURCES.update(stop_words=stop_words, stemmer=stemmer)
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['stemmer']


# === Preprocessing function ===
def preprocess_text(text):
    """Clean, tokenize, stop-word-filter and stem one piece of English text.

    Steps, in order: strip HTML tags, URLs, @mentions, #hashtags and digits;
    delete ASCII punctuation; collapse whitespace; lower-case; tokenize with
    ``word_tokenize``; drop NLTK English stop words; Porter-stem each token.

    Args:
        text: The raw text. Any non-string value (None, NaN, numbers,
            lists, ...) is treated as missing.

    Returns:
        The processed tokens joined by single spaces, or ``""`` for
        non-string input.
    """
    # Every non-str value (including NaN/None) maps to "", so a plain type
    # check is enough and avoids pd.isna() raising on list/array cells.
    if not isinstance(text, str):
        return ""

    # Strip HTML tags and noise
    text = _HTML_TAG_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)
    # NOTE(review): punctuation is deleted (not replaced by a space) before
    # stop-word removal, so "don't" becomes "dont" and is not filtered, and
    # hyphenated words are fused — confirm this is intended.
    text = text.translate(_PUNCTUATION_TABLE)
    text = _WHITESPACE_RE.sub(' ', text).strip()

    # Case folding
    text = text.lower()

    # Tokenizing
    tokens = word_tokenize(text)

    stop_words, stemmer = _get_nlp_resources()

    # Stop-word removal followed by stemming
    return ' '.join(
        stemmer.stem(word) for word in tokens if word not in stop_words
    )

# === Read the Excel file ===
input_file = '/content/quora_opinion_israel.xlsx'
print(f"📂 Membaca file: {input_file}")
df = pd.read_excel(input_file)
print(f"✓ Berhasil membaca {len(df)} baris data\n")

# === Choose the text column ===
# Fall back to the first column when there is no 'opinion' column.
text_column = 'opinion' if 'opinion' in df.columns else df.columns[0]
print(f"Menggunakan kolom: '{text_column}' untuk preprocessing\n")

# === Run preprocessing ===
print("🔄 Memproses teks...")
df['cleaned_text'] = df[text_column].apply(preprocess_text)
print("✓ Preprocessing selesai!\n")

# === Show sample results ===
print("="*80)
print("CONTOH HASIL PREPROCESSING:")
print("="*80)
_preview = zip(df[text_column].head(3), df['cleaned_text'].head(3))
for i, (original, cleaned) in enumerate(_preview, start=1):
    print(f"\n📝 Data ke-{i}:")
    # str() guards against NaN / non-string cells, which cannot be sliced.
    print(f"ORIGINAL : {str(original)[:150]}...")
    print(f"CLEANED  : {cleaned[:150]}...")
    print("-"*80)

# === Summary before saving ===
print(f"\n📊 Statistik:")
print(f"   - Total baris: {len(df)}")
print(f"   - Kolom asli: {text_column}")
print(f"   - Kolom baru: cleaned_text")
print(f"   - Baris dengan teks kosong setelah cleaning: {(df['cleaned_text'] == '').sum()}")

# === Save to Excel under a different name ===
output_file = '/content/quora_opinion_israel_CLEANED.xlsx'
print(f"\n💾 Menyimpan hasil ke: {output_file}")

# Keep only the original text column and the cleaned column in the output.
df_output = df[[text_column, 'cleaned_text']].copy()
df_output.to_excel(output_file, index=False, engine='openpyxl')

print("✓ File berhasil disimpan!")

# === Verify the saved file by reading it back ===
print("\n🔍 Memverifikasi file yang tersimpan...")
df_verify = pd.read_excel(output_file)
print(f"✓ File terverifikasi dengan {len(df_verify)} baris")
print(f"✓ Kolom yang tersimpan: {list(df_verify.columns)}")

# Show a sample of the file that was saved
print("\n📄 Sample dari file yang TERSIMPAN:")
print(df_verify.head(2).to_string())

print("\n" + "="*80)
print("✅ PROSES SELESAI!")
print("="*80)

Downloading NLTK data...
✓ Download selesai!

📂 Membaca file: /content/quora_opinion_israel.xlsx
✓ Berhasil membaca 3658 baris data

Menggunakan kolom: 'opinion' untuk preprocessing

🔄 Memproses teks...
✓ Preprocessing selesai!

CONTOH HASIL PREPROCESSING:

📝 Data ke-1:
ORIGINAL : While I agree that there are morally consistent Jewish-Israelis in Israel. Who don't support abhorrent conduct of Israel towards Palestinians.

But th...
CLEANED  : agre moral consist jewishisra israel dont support abhorr conduct israel toward palestinian societi major percent vocal genocid toward palestinian dont...
--------------------------------------------------------------------------------

📝 Data ke-2:
ORIGINAL : Bahai Garden in Haifa

Well! Israel is a strong country in defense, safety, research, education, development, tourism, and, of course, Jewish culture....
CLEANED  : bahai garden haifa well israel strong countri defens safeti research educ develop tourism cours jewish cultur 