In [1]:
from joblib import dump, load
import numpy as np

from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull the Google Play review corpus from the Hugging Face Hub; the download is
# cached locally, so later runs reuse the prepared copy.
gpr = load_dataset(path="jakartaresearch/google-play-review")

Downloading builder script: 100%|██████████| 3.09k/3.09k [00:00<00:00, 1.31MB/s]
Downloading metadata: 100%|██████████| 1.48k/1.48k [00:00<00:00, 538kB/s]
Downloading readme: 100%|██████████| 2.84k/2.84k [00:00<00:00, 2.54MB/s]


Downloading and preparing dataset google-play-review/default (download: 646.80 KiB, generated: 681.31 KiB, post-processed: Unknown size, total: 1.30 MiB) to /home/andreas/.cache/huggingface/datasets/jakartaresearch___google-play-review/default/1.0.0/df84e67f495cc6639ab0bbf74ff0190498a0b22294fdaca26a5b25e090671c29...


Downloading data: 100%|██████████| 456k/456k [00:00<00:00, 1.23MB/s]
Downloading data: 100%|██████████| 206k/206k [00:00<00:00, 797kB/s] 
                                                                                     

Dataset google-play-review downloaded and prepared to /home/andreas/.cache/huggingface/datasets/jakartaresearch___google-play-review/default/1.0.0/df84e67f495cc6639ab0bbf74ff0190498a0b22294fdaca26a5b25e090671c29. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 331.91it/s]


In [3]:
# Show the DatasetDict to inspect its splits, their columns and their row counts.
gpr

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'stars'],
        num_rows: 7028
    })
    validation: Dataset({
        features: ['text', 'label', 'stars'],
        num_rows: 3012
    })
})

In [4]:
# Switch both splits to the pandas output format so that indexing them returns
# pandas objects instead of plain Python dicts.
for split_name in ("train", "validation"):
    gpr[split_name].set_format('pandas')

In [5]:
# TF-IDF text vectoriser with library defaults; it is fitted on the training
# reviews further down and reused unchanged for validation and inference.
tfidf = TfidfVectorizer()

In [6]:
# Materialise each split in full (the `[:]` slice) as its own DataFrame.
d_train, d_valid = (gpr[split_name][:] for split_name in ('train', 'validation'))

In [7]:
# Peek at the first five training rows: review text, sentiment label, star rating.
d_train.head(n=5)

Unnamed: 0,text,label,stars
0,Halo\n blibli. Sedikit saran untuk gratis ong...,pos,4
1,So far so good. Respon cepat.,pos,5
2,thank,neg,3
3,Aplikasi sering not responding di hp saya (as...,neg,2
4,Gak ada komentar.,pos,5


In [8]:
# Peek at the first five validation rows to confirm they share the training schema.
d_valid.head(n=5)

Unnamed: 0,text,label,stars
0,bagussss,pos,5
1,Pengiriman gratis dan produk bervariasi,pos,5
2,Praktis dan mudah aplikasinya digunakan,pos,5
3,No comment,neg,2
4,Aplikasi Belanja Online terbaik di Indonesia ...,pos,5


In [21]:
# Character length of every validation review; the next cell uses it to pick
# mid-length examples.
# `.str.len()` is the vectorised pandas accessor: unlike `.apply(len)` it does not
# run a Python-level call per row, and it yields NaN instead of raising TypeError
# when a review is missing.
d_valid['text_len'] = d_valid['text'].str.len()

In [30]:
# List the ten longest validation reviews whose length lies between 40 and 80
# characters (both bounds inclusive), longest first.
(
    d_valid.loc[d_valid['text_len'].between(40, 80)]
    .sort_values('text_len', ascending=False)['text']
    .head(10)
    .tolist()
)

[' Really easy to order. Free installment. Free delivery. Soooo recomended. \nGreat',
 ' Good Application with good service...need improvement for the varian of product',
 ' Perbanyak lagi item penjualannya, soale masih kalah jauh dengan olshop tetangga',
 'Very good app Loadingnya cepat, cukup mudah penggunaannya, filter kurang lengkap',
 ' Better service than previous year. No more missing or lacking items in package.',
 ' sebenarnya sangat mudah pengoperasiannya, tp untuk pemula mungkin agak bingung',
 ' kerjasama promo dgn vendornya lbh variatif lagi dong, intip2 ecommerce sebelah',
 ' ok bgt.. gratis ongkir.. cuma barang blm begitu banyak, blom begitu bervariasi',
 ' aplikasi shoping trhancur yng pernah sya install... verifikasi hp bloon banget',
 ' application was good but still my phone lagging very often pls fixed it thanks']

In [9]:
# Learn the vocabulary and IDF weights from the training reviews and encode them
# in the same pass.
d_train_mat = tfidf.fit_transform(d_train['text'])

In [10]:
# Encode the validation reviews with the already-fitted vectoriser (transform
# only, so nothing is re-learned from the validation split).
d_valid_mat = tfidf.transform(d_valid['text'])

In [11]:
# A random forest is stochastic (bootstrap sampling and random feature selection),
# so an unseeded estimator gives different metrics, and a different saved model,
# on every run. Fixing the seed keeps the notebook reproducible under
# Restart & Run All.
SEED = 42
rfc = RandomForestClassifier(random_state=SEED)

In [12]:
# Train the forest on the TF-IDF training matrix against the sentiment labels.
rfc.fit(X=d_train_mat, y=d_train['label'])

In [13]:
# Predict a sentiment label for every encoded validation review.
y_pred = rfc.predict(X=d_valid_mat)

In [14]:
# Per-class precision / recall / F1 on the validation split. The report is a
# pre-formatted multi-line string, so it is printed rather than shown as a repr.
print(classification_report(y_true=d_valid['label'], y_pred=y_pred))

              precision    recall  f1-score   support

         neg       0.81      0.63      0.71       480
         pos       0.93      0.97      0.95      2532

    accuracy                           0.92      3012
   macro avg       0.87      0.80      0.83      3012
weighted avg       0.91      0.92      0.91      3012



In [15]:
# Persist the trained classifier to disk for the inference section below.
dump(value=rfc, filename='model.joblib')

['model.joblib']

In [16]:
# Persist the fitted vectoriser as well: the model can only score text that was
# encoded with the same vocabulary it was trained on.
dump(value=tfidf, filename='tfidf.joblib')

['tfidf.joblib']

## Inference

In [None]:
# Restore the classifier saved above.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
model = load(filename='model.joblib')

In [None]:
# Restore the fitted TF-IDF vectoriser that goes with the saved model.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
tfidf = load(filename='tfidf.joblib')

In [None]:
# Encode one sample review; the vectoriser expects an iterable of documents,
# hence the single-element list.
vector = tfidf.transform(
    ["So far so good. Respon cepat."],
)

In [None]:
# Score the encoded sample; the result is an array with one predicted label per
# input document.
model.predict(X=vector)