<a href="https://colab.research.google.com/github/henrytantyo/Test2/blob/main/Review_Product_Analysis_W_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Library
!pip install fastapi nest-asyncio pyngrok uvicorn
!pip install transformers
!pip install mysql-connector-python
!pip install pymysql

Collecting fastapi
  Downloading fastapi-0.103.1-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyngrok
  Downloading pyngrok-7.0.0.tar.gz (718 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.7/718.7 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting uvicorn
  Downloading uvicorn-0.23.2-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting starlette<0.28.0,>=0.27.0 (from fastapi)
  Downloading starlette-0.27.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
# Clone Data Repository
!git clone https://github.com/IndoNLP/indonlu.git
%cd indonlu/
%ls

Cloning into 'indonlu'...
remote: Enumerating objects: 500, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 500 (delta 115), reused 139 (delta 110), pack-reused 316[K
Receiving objects: 100% (500/500), 9.45 MiB | 15.53 MiB/s, done.
Resolving deltas: 100% (235/235), done.
/content/indonlu
CODE_OF_CONDUCT.md    [0m[01;34mexamples[0m/  predict.py    requirements.txt      [01;34mtutorial[0m/
CONTRIBUTING.md       LICENSE    predict.sh    [01;32mrun_all_tasks.sh[0m*     [01;34mutils[0m/
[01;34mdataset[0m/              main.py    README.id.md  [01;32mrun_single_task.sh[0m*
[01;34mdata_utils_notebook[0m/  [01;34mmodules[0m/   README.md     [01;34msubmission_examples[0m/


In [3]:
# Import library & module
import os, sys
sys.path.append('/content/indonlu')
os.chdir('/content/indonlu')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from utils.forward_fn import forward_sequence_classification
from utils.metrics import document_sentiment_metrics_fn
from utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

from fastapi import FastAPI
import nest_asyncio
from pyngrok import ngrok
import uvicorn
import pymysql.cursors
import mysql.connector

In [4]:
  # Sentiment Analysis function (ETA:12-15 minutes)
  def set_seed(seed):
      """Apply one seed to every random number generator used in this notebook.

      Calls, in order: `random.seed`, `np.random.seed`, `torch.manual_seed`
      and `torch.cuda.manual_seed`, each with the same `seed` value.
      """
      seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
      for apply_seed in seeders:
          apply_seed(seed)

  def count_param(module, trainable=False):
      """Count the scalar elements held by a module's parameters.

      Args:
          module: Object exposing `parameters()`; each parameter must provide
              `numel()` (and `requires_grad` when `trainable` is truthy).
          trainable: When truthy, only parameters with `requires_grad` set
              are counted; otherwise every parameter is counted.

      Returns:
          The summed `numel()` of the selected parameters.
      """
      total = 0
      for param in module.parameters():
          # Skip frozen parameters only when the caller asked for trainable ones.
          if trainable and not param.requires_grad:
              continue
          total += param.numel()
      return total

  def get_lr(optimizer):
      """Return the `'lr'` value of the optimizer's first parameter group.

      Only the first entry of `optimizer.param_groups` is inspected.
      Returns None when there are no parameter groups.
      """
      first_group = next(iter(optimizer.param_groups), None)
      if first_group is None:
          return None
      return first_group['lr']

  def metrics_to_string(metric_dict):
      """Render a metric mapping as a single space-separated string.

      Each item becomes `name:value` with the value formatted to two decimal
      places; items keep the iteration order of `metric_dict`.
      """
      return ' '.join('{}:{:.2f}'.format(name, score) for name, score in metric_dict.items())

  # Set random seed
  # Done before the model is instantiated below, whose classifier weights are
  # newly initialized (see the "newly initialized" warning in the cell output).
  set_seed(26092020)

  # Load Tokenizer and Config
  tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
  config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
  # Size the classification head to the label set declared by the dataset class.
  config.num_labels = DocumentSentimentDataset.NUM_LABELS

  # Instantiate model
  model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

  # Dataset files from the cloned indonlu checkout (absolute Colab paths).
  train_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
  valid_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
  test_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

  # All three splits share the same tokenizer and lowercase setting.
  train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
  valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
  test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

  # Only the training loader shuffles; validation and test keep file order.
  # NOTE(review): num_workers=16 is hard-coded; confirm the runtime has enough CPU cores for it.
  train_loader = DocumentSentimentDataLoader(dataset=train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)
  valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)
  test_loader = DocumentSentimentDataLoader(dataset=test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=False)

  # Label <-> index lookup tables; i2w is later used to turn predicted indices into label names.
  w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
  print(w2i)
  print(i2w)

  # Move the model to the GPU first, then build the optimizer, so the optimizer
  # is constructed over the parameters in the location where they are trained.
  model = model.cuda()
  optimizer = optim.Adam(model.parameters(), lr=3e-6)

  # Train
  # Each epoch runs one full pass over train_loader with gradient updates,
  # followed by one evaluation pass over valid_loader without gradients.
  n_epochs = 5
  for epoch in range(n_epochs):
      model.train()
      torch.set_grad_enabled(True)

      # Running totals for this epoch's training pass.
      total_train_loss = 0
      list_hyp, list_label = [], []

      train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
      for i, batch_data in enumerate(train_pbar):
          # Forward model
          # The last element of batch_data is not passed to the forward function.
          loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

          # Update model
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

          tr_loss = loss.item()
          total_train_loss = total_train_loss + tr_loss

          # Calculate metrics
          # Accumulate predictions and labels; metrics are computed once after the pass.
          list_hyp += batch_hyp
          list_label += batch_label

          # (i+1) is the number of batches seen so far, so this shows the running mean loss.
          train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
              total_train_loss/(i+1), get_lr(optimizer)))

      # Calculate train metric
      metrics = document_sentiment_metrics_fn(list_hyp, list_label)
      print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
          total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

      # Evaluate on validation
      # Gradient tracking is switched off globally here and only re-enabled at
      # the top of the next epoch, so it stays off after the final epoch.
      model.eval()
      torch.set_grad_enabled(False)

      # NOTE(review): total_correct and total_labels are assigned but never read in the visible code.
      total_loss, total_correct, total_labels = 0, 0, 0
      list_hyp, list_label = [], []

      pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
      for i, batch_data in enumerate(pbar):
          # NOTE(review): batch_seq is assigned but never read in the visible code.
          batch_seq = batch_data[-1]
          loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

          # Calculate total loss
          valid_loss = loss.item()
          total_loss = total_loss + valid_loss

          # Calculate evaluation metrics
          # Metrics are recomputed over everything accumulated so far on every
          # batch so that the progress bar can show them live.
          list_hyp += batch_hyp
          list_label += batch_label
          metrics = document_sentiment_metrics_fn(list_hyp, list_label)

          pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

      # Final validation metrics for the epoch over the full accumulated lists.
      metrics = document_sentiment_metrics_fn(list_hyp, list_label)
      print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
          total_loss/(i+1), metrics_to_string(metrics)))

  # Evaluate on test
  # Only predictions are collected: the loss and label outputs of the forward
  # function are discarded for this split.
  model.eval()
  torch.set_grad_enabled(False)

  list_hyp = []

  pbar = tqdm(test_loader, leave=True, total=len(test_loader))
  for batch_data in pbar:
      # The last element of batch_data is not passed to the forward function.
      _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
      list_hyp += batch_hyp

  # Save prediction
  # reset_index() adds an explicit `index` column next to `label` in the file.
  df = pd.DataFrame({'label':list_hyp}).reset_index()
  df.to_csv('pred.txt', index=False)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


(Epoch 1) TRAIN LOSS:0.3510 LR:0.00000300: 100%|██████████| 344/344 [02:36<00:00,  2.20it/s]


(Epoch 1) TRAIN LOSS:0.3510 ACC:0.87 F1:0.81 REC:0.78 PRE:0.85 LR:0.00000300


VALID LOSS:0.1923 ACC:0.92 F1:0.89 REC:0.89 PRE:0.89: 100%|██████████| 40/40 [00:08<00:00,  4.83it/s]


(Epoch 1) VALID LOSS:0.1923 ACC:0.92 F1:0.89 REC:0.89 PRE:0.89


(Epoch 2) TRAIN LOSS:0.1606 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 2) TRAIN LOSS:0.1606 ACC:0.95 F1:0.93 REC:0.92 PRE:0.93 LR:0.00000300


VALID LOSS:0.1812 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91: 100%|██████████| 40/40 [00:08<00:00,  4.80it/s]


(Epoch 2) VALID LOSS:0.1812 ACC:0.93 F1:0.90 REC:0.90 PRE:0.91


(Epoch 3) TRAIN LOSS:0.1199 LR:0.00000300: 100%|██████████| 344/344 [02:45<00:00,  2.08it/s]


(Epoch 3) TRAIN LOSS:0.1199 ACC:0.96 F1:0.95 REC:0.94 PRE:0.95 LR:0.00000300


VALID LOSS:0.1802 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:08<00:00,  4.92it/s]


(Epoch 3) VALID LOSS:0.1802 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92


(Epoch 4) TRAIN LOSS:0.0928 LR:0.00000300: 100%|██████████| 344/344 [02:45<00:00,  2.08it/s]


(Epoch 4) TRAIN LOSS:0.0928 ACC:0.97 F1:0.96 REC:0.96 PRE:0.97 LR:0.00000300


VALID LOSS:0.1932 ACC:0.93 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:08<00:00,  4.61it/s]


(Epoch 4) VALID LOSS:0.1932 ACC:0.93 F1:0.91 REC:0.91 PRE:0.92


(Epoch 5) TRAIN LOSS:0.0681 LR:0.00000300: 100%|██████████| 344/344 [02:46<00:00,  2.06it/s]


(Epoch 5) TRAIN LOSS:0.0681 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00000300


VALID LOSS:0.1873 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:08<00:00,  4.65it/s]


(Epoch 5) VALID LOSS:0.1873 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92


100%|██████████| 16/16 [00:02<00:00,  5.44it/s]


In [None]:
from transformers.utils.hub import has_file
# Server side API
import json
app = FastAPI()

# Database connection shared by all requests handled by `app`.
# Settings are read from environment variables so credentials do not have to
# live in the notebook; the previous literal values remain as fallbacks so the
# cell behaves exactly as before when no variable is set.
# NOTE(review): the fallback credentials are still in the notebook's history —
# rotate them and drop the defaults once the environment variables are configured.
conn = mysql.connector.connect(
    host=os.environ.get('REVIEW_DB_HOST', 'sql6.freesqldatabase.com'),
    user=os.environ.get('REVIEW_DB_USER', 'sql6641527'),
    password=os.environ.get('REVIEW_DB_PASSWORD', 'P8DTs5Wvsc'),
    db=os.environ.get('REVIEW_DB_NAME', 'sql6641527'),
)


@app.get('/analysis')
async def analysis(url: str):
    """Return the stored reviews for a product URL, each with a predicted sentiment label.

    Args:
        url: Value compared against the `url` column of the `Reviews` table.

    Returns:
        A dict with two parallel lists: "Review" (the review texts) and
        "Label" (the label name looked up in `i2w` for each review). Both
        lists are empty when the query returns no rows.
    """
    # The URL comes straight from the request, so it is bound as a query
    # parameter instead of being interpolated into the SQL text.
    query = "SELECT `review` FROM `Reviews` WHERE `url` = %s"

    cursor1 = conn.cursor()
    try:
        cursor1.execute(query, (url,))
        review_rows = cursor1.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor1.close()

    hasil = {"Review": [], "Label": []}

    for row in review_rows:
        text = row[0]  # the query selects a single column

        # Cap the input at 512 tokens, the same max_seq_len the data loaders
        # were built with, so an over-long review cannot overflow the model.
        subwords = tokenizer.encode(text, truncation=True, max_length=512)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

        # Inference only: do not rely on an earlier cell having disabled autograd.
        with torch.no_grad():
            logits = model(subwords)[0]
        label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

        hasil["Review"].append(text)
        hasil["Label"].append(i2w[label])

    return hasil


# Patch asyncio before starting the server from inside the notebook.
nest_asyncio.apply()

# Open a public ngrok tunnel to local port 8000 and report its address.
ngrok_tunnel = ngrok.connect(8000)
print(f'Public URL: {ngrok_tunnel.public_url}')

# Serve the FastAPI app on the tunnelled port; this call blocks the cell.
uvicorn.run(app, port=8000)



Public URL: https://b479-35-239-2-50.ngrok.io


INFO:     Started server process [297]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     103.119.140.111:0 - "GET / HTTP/1.1" 404 Not Found
INFO:     103.119.140.111:0 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO:     103.119.140.111:0 - "GET /analysis?url=https://www.tokopedia.com/rosanshop/samsung-galaxy-note-10-plus-512gb-12gb-bnib-original-note10-256gb/review HTTP/1.1" 200 OK
INFO:     103.119.140.111:0 - "GET /analysis?url=https://www.tokopedia.com/rosanshop/samsung-galaxy-note-10-plus-512gb-12gb-bnib-original-note10-256gb/review HTTP/1.1" 200 OK
INFO:     103.119.140.111:0 - "GET /analysis?url=https://www.tokopedia.com/rosanshop/samsung-galaxy-note-10-plus-512gb-12gb-bnib-original-note10-256gb/review HTTP/1.1" 200 OK
INFO:     103.119.140.111:0 - "GET /analysis?url=https://www.tokopedia.com/rosanshop/samsung-galaxy-note-10-plus-512gb-12gb-bnib-original-note10-256gb/review HTTP/1.1" 200 OK
INFO:     103.119.140.111:0 - "GET /analysis?url=https://www.tokopedia.com/007shop-n/xiaomi-redmi-4a-ram-2-16gb-garansi-distributor/review HTTP/1.1" 200 OK
INFO:     1