In [None]:
from google_play_scraper import reviews, Sort
import pandas as pd
import time
import random
from urllib.parse import urlparse, parse_qs

def extract_app_id(playstore_url):
    """
    Return the application ID carried by a Google Play Store URL.

    The ID is the first value of the ``id`` query parameter.

    Raises:
        ValueError: if the URL has no ``id`` query parameter.
    """
    id_values = parse_qs(urlparse(playstore_url).query).get("id")
    # Guard clause: bail out early when the parameter is absent.
    if id_values is None:
        raise ValueError("Invalid Play Store URL. Could not find 'id' parameter.")
    return id_values[0]

def scrape_reviews_from_url(playstore_url, lang='id', country='id', num_reviews=100, max_retries=3):
    """
    Scrape up to ``num_reviews`` unique reviews for a Google Play Store app.

    Reviews are fetched newest-first. A review is considered a duplicate when
    its (userName, content, score) triple has already been collected. If the
    newest-first feed does not yield enough unique reviews, the function tops
    up from the most-relevant feed, pausing 1-3 seconds before each request.

    Unlike an unbounded retry loop, this function always terminates: it stops
    as soon as a feed returns nothing, reports that it is exhausted, or
    produces ``max_retries`` consecutive pages without a single new review.
    The result may therefore contain fewer than ``num_reviews`` rows when the
    app simply does not have that many unique reviews.

    Args:
        playstore_url: Play Store URL containing an ``id`` query parameter.
        lang: Language code passed to the scraper.
        country: Country code passed to the scraper.
        num_reviews: Maximum number of unique reviews to return.
        max_retries: Number of consecutive fetches that add no new review
            tolerated before giving up on a feed.

    Returns:
        pandas.DataFrame with columns ``user``, ``komentar`` and ``rating``,
        in the order the reviews were fetched, with at most ``num_reviews``
        rows.

    Raises:
        ValueError: propagated from ``extract_app_id`` for an invalid URL.
    """
    app_id = extract_app_id(playstore_url)
    print(f"Scraping reviews for App ID: {app_id}")

    # A dict (insertion-ordered) instead of a set: duplicates are still
    # dropped, but the rows keep a deterministic fetch order.
    unique_reviews = {}

    def fetch_until_full(sort, is_top_up):
        """Page through one sort order until the target is met or the feed stalls."""
        token = None
        stalled = 0
        while len(unique_reviews) < num_reviews and stalled < max_retries:
            if is_top_up:
                print(f"Jumlah ulasan unik masih kurang ({len(unique_reviews)}/{num_reviews}), mengambil ulang...")
                time.sleep(random.uniform(1, 3))  # random pause to avoid being blocked

            result, token = reviews(
                app_id,
                lang=lang,
                country=country,
                sort=sort,
                count=num_reviews - len(unique_reviews),
                continuation_token=token,
            )
            if not result:
                break  # nothing came back: the feed has no more reviews

            before = len(unique_reviews)
            for r in result:
                unique_reviews.setdefault((r['userName'], r['content'], r['score']))
            stalled = 0 if len(unique_reviews) > before else stalled + 1

            # The scraper appears to hand back either None or a token object
            # whose ``token`` attribute is None once the feed is exhausted;
            # handle both (TODO confirm against the installed version).
            if getattr(token, 'token', token) is None:
                break

    fetch_until_full(Sort.NEWEST, is_top_up=False)
    if len(unique_reviews) < num_reviews:
        fetch_until_full(Sort.MOST_RELEVANT, is_top_up=True)

    # Trim in case a fetch delivered more unique reviews than requested.
    rows = list(unique_reviews)[:max(num_reviews, 0)]
    if len(rows) < num_reviews:
        print(f"Hanya {len(rows)} dari {num_reviews} ulasan unik yang tersedia.")

    return pd.DataFrame(rows, columns=['user', 'komentar', 'rating'])

def add_labels_to_reviews(df):
    """
    Attach rating-based sentiment labels to a reviews DataFrame.

    Ratings 1-2 map to 'negatif' and ratings 3-5 map to 'positif'; any other
    value gets no label (None). The frame is modified in place: columns
    ``label`` and ``sentimen`` (a copy of ``label``) are added, and the same
    DataFrame object is returned.
    """
    negative_scores = (1, 2)
    positive_scores = (3, 4, 5)

    def to_sentiment(score):
        if score in negative_scores:
            return 'negatif'
        if score in positive_scores:
            return 'positif'
        return None

    sentiments = df['rating'].apply(to_sentiment)
    df['label'] = sentiments
    # 'sentimen' mirrors 'label'.
    df['sentimen'] = df['label']
    return df

def main_data_latih(playstore_url, num_reviews, output_file):
    """
    Build the training data set: scrape, label, and save to Excel.

    Scrapes ``num_reviews`` reviews for the app behind ``playstore_url``,
    adds the sentiment label columns, and writes the result (without the
    index) to ``output_file``.
    """
    print(f"Memulai scraping {num_reviews} ulasan untuk data latih dari: {playstore_url}")

    # Scrape first, then attach the sentiment labels to the scraped frame.
    labelled_df = add_labels_to_reviews(
        scrape_reviews_from_url(playstore_url, num_reviews=num_reviews)
    )

    # Persist the labelled training data as an Excel workbook.
    labelled_df.to_excel(output_file, index=False)
    print(f"Data latih disimpan di: {output_file}")

# Example usage
if __name__ == "__main__":
    # Training-set configuration: Play Store listing, number of reviews to
    # collect, and the workbook the training data is written to.
    playstore_url = "https://play.google.com/store/apps/details?id=premium.gotube.adblock.utube&hl=id"
    num_reviews = 200
    output_file = "web/static/data/gotube/data_latih.xlsx"

    main_data_latih(
        playstore_url=playstore_url,
        num_reviews=num_reviews,
        output_file=output_file,
    )


In [None]:
from google_play_scraper import reviews, Sort
import pandas as pd
import random
import time
from urllib.parse import urlparse, parse_qs

def extract_app_id(playstore_url):
    """
    Pull the application ID out of a Google Play Store URL.

    Reads the URL's query string and returns the first ``id`` value.

    Raises:
        ValueError: when no ``id`` query parameter is present.
    """
    query_string = urlparse(playstore_url).query
    candidates = parse_qs(query_string).get("id", [])
    if candidates:
        return candidates[0]
    raise ValueError("Invalid Play Store URL. Could not find 'id' parameter.")

def scrape_reviews_from_url(playstore_url, lang='id', country='id', num_reviews=100, offset=0, max_retries=3):
    """
    Scrape up to ``num_reviews`` unique reviews, skipping the newest ``offset``.

    The newest-first feed is paged past the first ``offset`` reviews so the
    returned test data does not repeat reviews that were taken (with the same
    ordering) for the training data. Collection then continues from that
    position only; the function never restarts from the head of the feed,
    because doing so would re-fetch exactly the reviews the offset is meant
    to exclude.

    A review is considered a duplicate when its (userName, content, score)
    triple has already been collected. The function always terminates: it
    stops when the feed returns nothing, reports that it is exhausted, or
    yields ``max_retries`` consecutive pages without a new review. The result
    may therefore hold fewer than ``num_reviews`` rows (possibly none) when
    there are not enough reviews beyond the offset.

    Args:
        playstore_url: Play Store URL containing an ``id`` query parameter.
        lang: Language code passed to the scraper.
        country: Country code passed to the scraper.
        num_reviews: Maximum number of unique reviews to return.
        offset: Number of newest reviews to skip before collecting.
        max_retries: Number of consecutive fetches that add no new review
            tolerated before giving up.

    Returns:
        pandas.DataFrame with columns ``user``, ``komentar`` and ``rating``,
        in fetch order, with at most ``num_reviews`` rows.

    Raises:
        ValueError: propagated from ``extract_app_id`` for an invalid URL.
    """
    app_id = extract_app_id(playstore_url)
    print(f"Scraping reviews for App ID: {app_id}")

    def feed_exhausted(token):
        # The scraper appears to hand back either None or a token object
        # whose ``token`` attribute is None once there are no more pages;
        # handle both (TODO confirm against the installed version).
        return getattr(token, 'token', token) is None

    continuation_token = None
    has_more = True

    # Phase 1: page past the first `offset` reviews without keeping them.
    skipped = 0
    while skipped < offset:
        result, continuation_token = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.NEWEST,
            count=offset - skipped,
            continuation_token=continuation_token,
        )
        skipped += len(result)
        if not result or feed_exhausted(continuation_token):
            # The feed ended during the skip: nothing lies beyond the offset.
            has_more = False
            break

    # Phase 2: collect unique reviews from the position reached above.
    # A dict (insertion-ordered) drops duplicates while keeping fetch order.
    unique_reviews = {}
    stalled = 0
    while has_more and len(unique_reviews) < num_reviews and stalled < max_retries:
        result, continuation_token = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.NEWEST,
            count=num_reviews - len(unique_reviews),
            continuation_token=continuation_token,
        )
        if not result:
            break

        before = len(unique_reviews)
        for r in result:
            unique_reviews.setdefault((r['userName'], r['content'], r['score']))
        stalled = 0 if len(unique_reviews) > before else stalled + 1

        if feed_exhausted(continuation_token):
            break

    # Trim in case a fetch delivered more unique reviews than requested.
    rows = list(unique_reviews)[:max(num_reviews, 0)]
    if len(rows) < num_reviews:
        print(f"Hanya {len(rows)} dari {num_reviews} ulasan unik yang tersedia setelah offset {offset}.")

    return pd.DataFrame(rows, columns=['user', 'komentar', 'rating'])

def main_data_uji(playstore_url, num_reviews, offset, output_file):
    """
    Build the test data set: scrape past an offset and save to Excel.

    Scrapes ``num_reviews`` reviews for the app behind ``playstore_url``,
    passing ``offset`` to the scraper so the test data differs from the
    training data, then writes the result (without the index) to
    ``output_file``.
    """
    print(f"Memulai scraping {num_reviews} ulasan untuk data uji dari: {playstore_url}")

    test_df = scrape_reviews_from_url(
        playstore_url,
        num_reviews=num_reviews,
        offset=offset,
    )

    # Persist the test data as an Excel workbook.
    test_df.to_excel(output_file, index=False)
    print(f"Data uji disimpan di: {output_file}")

# Example usage
if __name__ == "__main__":
    # Test-set configuration: same Play Store listing, a smaller sample, and
    # an offset so the scraper skips that many reviews before collecting.
    playstore_url = "https://play.google.com/store/apps/details?id=premium.gotube.adblock.utube&hl=id"
    num_reviews = 50
    offset = 200
    output_file = "web/static/data/gotube/data_uji_test.xlsx"

    main_data_uji(
        playstore_url=playstore_url,
        num_reviews=num_reviews,
        offset=offset,
        output_file=output_file,
    )
