In [2]:
from google_play_scraper import reviews, Sort
import pandas as pd
import os
from urllib.parse import urlparse, parse_qs

def extract_app_id(playstore_url):
    """Return the app ID stored in the ``id`` query parameter of a URL.

    Args:
        playstore_url: URL string whose query string is expected to carry
            an ``id`` parameter (e.g. ``...details?id=<app id>&hl=id``).

    Returns:
        The first value of the ``id`` query parameter.

    Raises:
        ValueError: If the query string has no ``id`` parameter.
    """
    id_values = parse_qs(urlparse(playstore_url).query).get("id")
    if id_values is None:
        raise ValueError("Invalid Play Store URL. Could not find 'id' parameter.")
    # A repeated parameter yields several values; only the first is used.
    return id_values[0]

def scrape_reviews_from_url(playstore_url, lang='id', country='id', num_reviews=100):
    """Fetch the newest reviews of the app referenced by a Play Store URL.

    Args:
        playstore_url: Play Store URL carrying the app ID in its ``id``
            query parameter.
        lang: Language code forwarded to ``google_play_scraper.reviews``.
        country: Country code forwarded to ``google_play_scraper.reviews``.
        num_reviews: Maximum number of reviews to return.

    Returns:
        A DataFrame with the columns ``user``, ``komentar`` and ``rating``
        holding at most ``num_reviews`` rows. When no review was fetched the
        frame is empty but still has those three columns.

    Raises:
        ValueError: Propagated from ``extract_app_id`` when the URL has no
            ``id`` query parameter.
    """
    app_id = extract_app_id(playstore_url)
    print(f"Scraping reviews for App ID: {app_id}")
    all_reviews = []
    # Carried from one call to the next so every request continues after the
    # previous batch instead of re-fetching the first page (duplicates).
    continuation_token = None

    while len(all_reviews) < num_reviews:
        batch_size = min(1000, num_reviews - len(all_reviews))
        result, continuation_token = reviews(
            app_id,
            lang=lang,
            country=country,
            sort=Sort.NEWEST,
            count=batch_size,
            continuation_token=continuation_token,
        )
        all_reviews.extend(result)
        # A batch shorter than requested (including an empty one) means
        # there is nothing left to fetch.
        if len(result) < batch_size:
            break

    # NOTE(review): a continued call may return more rows than `batch_size`
    # (the library appears to reuse the first call's count) - hence the trim.
    # Passing `columns` keeps the schema stable even when nothing was fetched.
    df = pd.DataFrame(
        all_reviews[:num_reviews],
        columns=['userName', 'content', 'score'],
    )
    return df.rename(
        columns={'userName': 'user', 'content': 'komentar', 'score': 'rating'}
    )

def add_labels_to_reviews(df):
    """Add sentiment columns derived from the ``rating`` column.

    Ratings 1-2 are labelled ``'negatif'`` and ratings 3-5 ``'positif'``;
    any other value gets no label (``None``).

    Args:
        df: DataFrame with a ``rating`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the columns ``label`` and
        ``sentimen`` (both hold the same values).
    """
    negative_scores = (1, 2)
    positive_scores = (3, 4, 5)

    def to_sentiment(score):
        # Guard clauses; anything outside 1-5 falls through to None.
        if score in negative_scores:
            return 'negatif'
        if score in positive_scores:
            return 'positif'
        return None

    sentiment = df['rating'].apply(to_sentiment)
    df['label'] = sentiment
    # 'sentimen' is a plain duplicate of 'label'.
    df['sentimen'] = df['label']
    return df

# Example usage: label an existing review file, or scrape reviews first when
# that file does not exist yet.
playstore_url = "https://play.google.com/store/apps/details?id=premium.gotube.adblock.utube&hl=id"
data_latih_file = "web/static/data/gotube/data_uji.xlsx"
data_latih_labeled_file = "web/static/data/gotube/data_test.xlsx"

if os.path.exists(data_latih_file):
    print(f"File '{data_latih_file}' ditemukan. Menambahkan label pada data yang ada.")
    reviews_df = pd.read_excel(data_latih_file)
    reviews_df = add_labels_to_reviews(reviews_df)
    # Create the target directory first so the write cannot fail on a
    # missing folder.
    os.makedirs(os.path.dirname(data_latih_labeled_file) or ".", exist_ok=True)
    reviews_df.to_excel(data_latih_labeled_file, index=False)
    print(f"Data dengan label disimpan di '{data_latih_labeled_file}'.")
else:
    print(f"File '{data_latih_file}' tidak ditemukan. Melakukan scraping.")
    reviews_df = scrape_reviews_from_url(playstore_url, num_reviews=200)
    reviews_df = add_labels_to_reviews(reviews_df)
    # The same labelled frame is written to both paths; make sure each
    # parent directory exists so the scraped data is not lost to a missing
    # folder.
    for output_path in (data_latih_file, data_latih_labeled_file):
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        reviews_df.to_excel(output_path, index=False)
    print(f"Scraping selesai. Data disimpan di '{data_latih_file}' dan '{data_latih_labeled_file}'.")

# Final status message
print("Proses selesai.")


File 'web/static/data/gotube/data_uji.xlsx' tidak ditemukan. Melakukan scraping.
Scraping reviews for App ID: premium.gotube.adblock.utube
Scraping selesai. Data disimpan di 'web/static/data/gotube/data_uji.xlsx' dan 'web/static/data/gotube/data_test.xlsx'.
Proses selesai.
