## Import Library yang akan digunakan

In [55]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
Installing collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [58]:
from google_play_scraper import reviews
import pandas as pd
import string
import requests
from io import StringIO
import csv
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [59]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Scraping Komentar pada Aplikasi Genshin Impact

In [198]:
# Scraping komentar dengan bintang 1-5
# Content, Score, At
_content = []
_score = []
_date = []

for i in range(1,6):
    print(f"Mulai Scraping com.miHoYo.GenshinImpact Review Bintang {i}")
    results, _ = reviews(
    "com.miHoYo.GenshinImpact",
    count=3000,
    filter_score_with= i,
    lang="id",
    country="id"
)
    print(f"Append hasil ke dalam list")
    for result in results:
        _content.append(result["content"])
        _score.append(result["score"])
        _date.append(result["at"])
    print(f"Scraping com.miHoYo.GenshinImpact Review Bintang {i} Selesai..")

Mulai Scraping com.miHoYo.GenshinImpact Review Bintang 1
Append hasil ke dalam list
Scraping com.miHoYo.GenshinImpact Review Bintang 1 Selesai..
Mulai Scraping com.miHoYo.GenshinImpact Review Bintang 2
Append hasil ke dalam list
Scraping com.miHoYo.GenshinImpact Review Bintang 2 Selesai..
Mulai Scraping com.miHoYo.GenshinImpact Review Bintang 3
Append hasil ke dalam list
Scraping com.miHoYo.GenshinImpact Review Bintang 3 Selesai..
Mulai Scraping com.miHoYo.GenshinImpact Review Bintang 4
Append hasil ke dalam list
Scraping com.miHoYo.GenshinImpact Review Bintang 4 Selesai..
Mulai Scraping com.miHoYo.GenshinImpact Review Bintang 5
Append hasil ke dalam list
Scraping com.miHoYo.GenshinImpact Review Bintang 5 Selesai..


In [199]:
# Membuat dataframe untuk hasil scraping
df_scraping = pd.DataFrame({"Review Date": _date, "Ratings" : _score, "Comment" : _content})
df_scraping["Ratings"].value_counts()

Ratings
1    3000
3    3000
4    3000
5    3000
Name: count, dtype: int64

In [200]:
df_scraping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Review Date  12000 non-null  datetime64[ns]
 1   Ratings      12000 non-null  int64         
 2   Comment      12000 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 281.4+ KB


In [201]:
df_scraping['Comment'].to_csv("Data.csv", index=False, header=False)

## Pre-Processing Text

#### Text Slang

In [202]:
text_slang = {
    'gue' : 'saya',
    'gw' : 'saya',
    'gua' : 'saya',
    'ak' : 'saya',
    'aq' : 'saya',
    'gak' : 'tidak',
    'gk' : 'tidak',
    'ga' : 'tidak',
    'g' : 'tidak',
    'cpek' : 'lelah',
    'cpk' : 'lelah',
    'capk' : 'lelah',
    'gini' : 'ini',
    'kek' : "seperti",
    'kayak' : 'seperti',
    'kyk' : 'seperti',
    'kyknya' : 'sepertinya', 
    'kayaknya' : 'sepertinya',
    'udh' : 'sudah',
    'dhlh' : 'sudahlah',
    'sdh' : 'sudah',
    'dh' : 'sudah',
    'yg' : 'yang',
    'yng' : 'yang',
    'pdhl' : 'padahal',
    'kikir' : 'pelit',
    'bgt' : 'bangent',
    'masi' : 'masih',
    'gitu' : 'begitu',
    'gtu' : 'begitu',
    'gt' : 'begitu',
    'mw' : 'mau',
    'smuanya' : 'semuanya',
    'smua' : 'semua',
    'bosenin' : 'membosankan',
    'ngebosenin' : 'membosankan',
    'gtau' : 'tidak tahu',
    'gatau' : 'tidak tahu',
    'gaada' : 'tidak ada',
    'gda' : 'tidak ada',
    'gd' : 'tidak ada',
    'gamau' : 'tidak mau',
    'gmau' : 'tidak mau',
    'gmw' : 'tidak mau',
    'skrg' : 'sekarang',
    'skrng' : 'sekarang',
    'kikir' : 'pelit',
    'kli' : 'sekali',
    'banget' : 'sekali',
    'kali' : 'sekali', 
    'lu' : 'kamu',
    'lo' : 'kamu',
    'ampas' : 'jelek',
    'slalu' : 'selalu',
    'sllu' : 'selalu',
    'ngedengerin' : 'mendengarkan',
    'dengerin' : 'mendengarkan',
    'denger' : 'dengar',
    'ancur' : 'hancur',
    'tpi' : 'tapi',
    'tp' : 'tapi',
    'klo' : 'jika',
    'kl' : 'jika',
    'minim' : 'sedikit',
    'budeg' : 'tuli',
    'ilang' : 'hilang',
    'balikin' : 'kembalikan',
    'f2p' : 'free to play',
    'p2w' : 'pay to win',
    'apes' : 'sial',
    'lg' : 'lagi',
    'lgi' : 'lagi',
    'bentar' : 'sebentar',
    'ngikuti' : 'mengikuti', 
    'dev' : 'developer',
    'bener' : 'benar',
    'guna' : 'berguna',
    'mc' : 'main character',
    'npc' : 'non player character',
    'makin' : 'semakin',
    'ngotak' : 'pakai otak',
    'nunggu' : 'menunggu'
    }

In [203]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
listStopwords = set(stopwords.words('indonesian')) | set(stopwords.words('english'))
listStopwords.update(['bet','iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy", "anjir", "jir", "anjirr", "bjir", "anjirrr", "njirrr"])

In [204]:
def regex_text(text):
    print(f"Processing Regex Pattern on :  \n{text}")
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'@[A-Za-z0-9]+|#[A-Za-z0-9]+|RT[\s]|http\S+|\d+', ' ', text)
    text = text.strip()
    print(f"Result of Regex:  \n{text}\n")
    return text

def slang_word(text):
    print(f"Processing Slang Word on : \n{text}")
    words = text.split()
    text = []
    for word in words:
        if word in text_slang:
            text.append(text_slang[word])
        else :
            text.append(word)
    text = " ".join(text)
    print(f"Result of Slang:  \n{text}\n")
    return text

def stemming_word(text):
    print(f"Processing Stemming and stop words on :  \n{text}")
    words = word_tokenize(text)
    words = [word for word in words if word not in listStopwords]
    words = [stemmer.stem(word) for word in words]
    text = " ".join(words)
    print(f"Done Processing Text \n{text}\n")
    return text


def cleaning_data(text):
    regex = regex_text(text)
    slang_words = slang_word(regex)
    stemming = stemming_word(slang_words)
    return stemming

In [None]:
df_scraping["Comment Cleaned"] = df_scraping["Comment"].apply(cleaning_data)
df_scraping["Comment Cleaned"].to_csv("Data Cleaned.csv", header=False, index=False)    

Processing Regex Pattern on :  
udh game kikir,,game yapping,,muka NPC nya template pula,,masih bagusan game adek nya si zzz🫵😂
Result of Regex:  
udh game kikir  game yapping  muka npc nya template pula  masih bagusan game adek nya si zzz

Processing Slang Word on : 
udh game kikir  game yapping  muka npc nya template pula  masih bagusan game adek nya si zzz
Result of Slang:  
sudah game pelit game yapping muka non player character nya template pula masih bagusan game adek nya si zzz

Processing Stemming and stop words on :  
sudah game pelit game yapping muka non player character nya template pula masih bagusan game adek nya si zzz
Done Processing Text 
game pelit game yapping muka non player character template bagus game adek si zzz

Processing Regex Pattern on :  
Saran buat dev dioptimalkan lagi game ya, 2020 Edit th 2024: ini ulasan saya setelah 3 tahun lebih di PV 4.2. Buset dah, story nya.... Tidak menyangka game yang saya kasih bintang satu ternyata bagus bener Lore storynya. J

In [None]:
def labelling_by_score(score):
    if score == 5:
        return "Positive"
    elif score == 3 or score == 4:
        return "Neutral"
    else :
        return "Negative"
    
df_scraping["Label"] = df_scraping["Ratings"].apply(labelling_by_score)

In [None]:
df_scraping.head()

Unnamed: 0,Review Date,Ratings,Comment,Comment Cleaned,Label
0,2025-04-21 06:34:55,1,"game apa ini , Gacha Scam , katanya 50/50 , as...",game gacha scam asli rate aja kasih menang huu...,Negative
1,2025-04-16 06:01:17,1,"udh game kikir,,game yapping,,muka NPC nya tem...",game pelit game yapping muka non player charac...,Negative
2,2025-04-09 21:08:42,1,"Saran buat dev dioptimalkan lagi game ya, 2020...",saran developer optimal game edit th ulas pv b...,Negative
3,2025-04-09 21:06:01,1,Keren,keren,Negative
4,2025-04-09 20:53:58,1,bisa tidak sih kasih mata buat F2P? mereka ter...,kasih mata f p siksa cari chest kalah goodbye,Negative
