In [1]:
from Parallelizer import make_parallel
from common import  Common
import re
import pandas as pd
import time
from urllib.parse import unquote
from utils import *
import json
from math import ceil


# TIKTOK

In [2]:
import requests
from bs4 import BeautifulSoup
from time import sleep
count = 0

@make_parallel
def request_tiktok(url, max_retries=10, timeout=30, retry_delay=2):
    """Download one TikTok Shop product page and return its embedded RENDER_DATA JSON.

    The page is requested with browser-like headers. The text of the
    ``<script id="RENDER_DATA">`` element is URL-unquoted and parsed as JSON.
    A failed attempt (network error, missing script tag, invalid or empty
    JSON) is retried after a short pause.

    NOTE(review): the function is wrapped by the project-local ``make_parallel``
    decorator. In this notebook the decorated function is called with a list of
    URLs; the decorator itself is not visible here, so confirm how it forwards
    extra arguments before passing the optional parameters below.

    Args:
        url: Product page URL, e.g. ``https://oec-api.tiktokv.com/view/product/<id>``.
        max_retries: Maximum number of attempts. ``None`` retries forever,
            which was the behaviour before this limit was introduced.
        timeout: Per-request timeout in seconds, passed to ``requests``.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The parsed JSON object. If every attempt fails, a placeholder
        ``{"_location": url}`` is returned instead, so that ``transform`` can
        still derive the product id from ``_location`` and emit a "failed" row.
    """
    headers = {
        'authority': 'oec-api.tiktokv.com',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en;q=0.9,en-US;q=0.8',
        'cache-control': 'max-age=0',
        'sec-ch-ua': '"Microsoft Edge";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47'
    }
    try_time = 0
    while max_retries is None or try_time < max_retries:
        try_time += 1
        print(f"{try_time}: {url}")
        response_text = ""
        try:
            # The request lives inside the try block so that a network error
            # or timeout is retried like any other failed attempt.
            response = requests.get(url, headers=headers, timeout=timeout)
            response_text = response.text
            soup = BeautifulSoup(response_text, 'html.parser')

            # The product data is the content of <script id="RENDER_DATA">,
            # stored as URL-quoted JSON.
            script = soup.find('script', id='RENDER_DATA')
            json_obj = json.loads(unquote(script.contents[0]))
            if json_obj:
                return json_obj
            print(f"fail: {url} empty RENDER_DATA payload")
        except Exception as e:
            print(e)
            # Only a prefix of the body is logged; a full HTML page per failed
            # attempt floods the notebook output.
            print(f"fail: {url} {response_text[:500]}")

        # No point in sleeping after the final attempt.
        if max_retries is None or try_time < max_retries:
            sleep(retry_delay)

    print(f"giving up after {try_time} attempts: {url}")
    return {"_location": url}

In [3]:
# After the dots are stripped, every number in a price string is a plain run of digits.
_PRICE_NUMBER = re.compile(r'\d+')


def _parse_price_range(price_text):
    """Parse a displayed price (single value or two-value range) into ``(min, max)``.

    Dots are removed before the numbers are extracted, i.e. they are treated
    as thousands separators (``"1.200.000"`` -> ``1200000``).

    Returns:
        ``(None, None)`` when the text contains no number,
        ``(first, second)`` when it contains exactly two numbers,
        ``(None, first)`` otherwise.
    """
    numbers = _PRICE_NUMBER.findall(price_text.replace('.', ''))
    if not numbers:
        return None, None
    if len(numbers) == 2:
        return int(numbers[0]), int(numbers[1])
    return None, int(numbers[0])


@make_parallel
def transform(response, results=None):
    """Flatten one RENDER_DATA payload into a product row and append it to ``results``.

    The row has the keys ``product_id``, ``name``, ``link``, ``type``,
    ``min_price``, ``max_price``, ``sold_count``, ``min_original_price``,
    ``max_original_price`` and ``image_1`` .. ``image_5``. When the payload
    cannot be read, every field keeps the placeholder ``"failed"`` (``type``
    keeps its constant label), and ``product_id``/``link`` are recovered from
    ``response['_location']`` when possible.

    For flash-sale products the exported ``min_price``/``max_price`` are
    replaced by the original prices whenever those are available.

    NOTE(review): the function is wrapped by the project-local ``make_parallel``
    decorator; in this notebook it is called with a list of payloads plus the
    shared ``results`` list. The decorator itself is not visible here.

    Args:
        response: Parsed RENDER_DATA JSON. The code reads
            ``response['2']['initialData']['productInfo']`` (keys
            ``product_base``, ``product_id`` and the optional ``flash_sale``)
            and, on failure, ``response['_location']``.
        results: List that receives the row. Pass it explicitly; it is the
            only way the row is handed back. The default is ``None`` rather
            than a shared mutable list, so omitted calls no longer accumulate
            rows in hidden state.

    Returns:
        None. The row is appended to ``results``.
    """
    if results is None:
        results = []

    product_transformed = {
        "product_id": "failed",
        "name": "failed",
        "link": "failed",
        "type": "Sp lẻ",
        "min_price": "failed",
        "max_price": "failed",
        "sold_count": "failed",
        "min_original_price": "failed",
        "max_original_price": "failed",
    }

    for i in range(5):
        product_transformed[f"image_{i+1}"] = "failed"

    try:
        product_info = response['2']['initialData']['productInfo']
        product_base = product_info['product_base']
        mapping_product_base = {
            "real_price": ["price", "real_price"],
            "original_price": ["price", "original_price"],
            "images": ["images"],
            "sold_count": ["sold_count"],
            "title": ["title"]
        }
        # NOTE(review): Common.mapping_data is project-local; presumably it
        # resolves each key path above inside product_base - confirm in `common`.
        transformed_product_base = Common.mapping_data(
            product_base, mapping_product_base)
        product_id = product_info['product_id']
        print(product_id)
        name = transformed_product_base['title']

        # Defaults are set up front so every name is bound even if price
        # parsing fails half-way through.
        is_flash_sale = False
        min_price = max_price = None
        min_original_price = max_original_price = None
        try:
            is_flash_sale = bool(product_info.get("flash_sale", False))
            # Both price strings go through the same parser. Previously only
            # real_price had its dots stripped, so an original price such as
            # "1.200.000" was misread as the range (1200, 0).
            min_price, max_price = _parse_price_range(
                transformed_product_base['real_price'])
            min_original_price, max_original_price = _parse_price_range(
                transformed_product_base['original_price'])
            if max_price is None:
                print("not found price")
        except Exception as e:
            print("not found price")
            min_price = max_price = None
            min_original_price = max_original_price = None

        sold_count = transformed_product_base['sold_count']

        # Up to five thumbnails, each wrapped as an =image("...") formula
        # string (presumably for a spreadsheet import); unused slots stay None.
        images = [None]*5
        for i, image in enumerate(transformed_product_base["images"][:5]):
            images[i] = f'=image("{image["thumb_url_list"][0]}")'

        product_transformed = {
            "product_id": product_id,
            "name": name,
            "link": f"https://oec-api.tiktokv.com/view/product/{product_id}",
            "type": "Sp lẻ",
            "min_price": min_price,
            "max_price": max_price,
            "sold_count": sold_count,
            "min_original_price": min_original_price,
            "max_original_price": max_original_price,
        }

        if is_flash_sale:
            if min_original_price is not None:
                product_transformed["min_price"] = min_original_price
            if max_original_price is not None:
                product_transformed["max_price"] = max_original_price

        for i in range(5):
            product_transformed[f"image_{i+1}"] = images[i]

    except Exception as e:
        print(f"fail: {e}")
        # The fallback lookup must not raise from inside the handler (for
        # example when the payload is None or has no '_location').
        try:
            product_id = response['_location'].split('/')[-1]
        except (KeyError, TypeError, AttributeError):
            product_id = None
        if product_id:
            product_transformed["product_id"] = product_id
            product_transformed['link'] =  f"https://oec-api.tiktokv.com/view/product/{product_id}"
            print(f"failed {product_id}")

    results.append(product_transformed)

In [57]:
# TikTok Shop product ids to crawl; each id is expanded into its product-page URL.
product_ids = [
    "1729595232898288185",
    "1729596002892220985",
    "1729599230965811769",
    "1729604441810111033",
    "1729604445784279609",
    "1729604448073124409",
    "1729692411441351225",
    "1729702031042054713",
    "1729713427685804601",
    "1729721594483477049",
    "1729743774574348857",
    "1729743808155257401",
    "1729778293230635577",
]
links = [
    f"https://oec-api.tiktokv.com/view/product/{product_id}"
    for product_id in product_ids
]

In [58]:
product_infos = request_tiktok(links)

1: https://oec-api.tiktokv.com/view/product/1729604445784279609
1: https://oec-api.tiktokv.com/view/product/1729604448073124409
1: https://oec-api.tiktokv.com/view/product/1729604441810111033
1: https://oec-api.tiktokv.com/view/product/1729595232898288185
1: https://oec-api.tiktokv.com/view/product/1729743808155257401
1: https://oec-api.tiktokv.com/view/product/1729596002892220985
1: https://oec-api.tiktokv.com/view/product/1729713427685804601
1: https://oec-api.tiktokv.com/view/product/1729743774574348857
1: https://oec-api.tiktokv.com/view/product/1729721594483477049
1: https://oec-api.tiktokv.com/view/product/1729692411441351225
1: https://oec-api.tiktokv.com/view/product/1729702031042054713
1: https://oec-api.tiktokv.com/view/product/1729778293230635577
1: https://oec-api.tiktokv.com/view/product/1729599230965811769


In [59]:
len(product_infos), len(links)

(13, 13)

In [60]:
# product_infos = []
# for link in links:
#     product_infos.append(request_tiktok(link))
#     print(len(product_infos))

In [61]:
results = []
transform(product_infos, results)

fail: 'NoneType' object is not subscriptable
failed 1729595232898288185
1729596002892220985
1729599230965811769
fail: 'NoneType' object is not subscriptablefail: 'NoneType' object is not subscriptable
fail: 'NoneType' object is not subscriptable
failed 1729604448073124409
failed 1729604445784279609
1729692411441351225
1729702031042054713
1729713427685804601

failed 1729604441810111033
1729721594483477049
1729743774574348857
1729743808155257401
1729778293230635577


[]

In [62]:
brand = "longthanhsport22"

In [63]:
# Column order used when the frame is exported to CSV. The misspelled name
# `colums` is kept on purpose: the export cell refers to it.
colums = [
    'product_id', 'name', 'link', 'type',
    'min_price', 'max_price', 'sold_count',
    'image_1', 'image_2', 'image_3', 'image_4', 'image_5',
    'min_original_price', 'max_original_price',
    'product_code',
]

# One row per transformed product, with the constant 'type' label applied to every row.
df = pd.DataFrame(results).assign(type='Sp lẻ')
df.shape

(13, 14)

In [64]:
import re

# Product-code pattern: starting at a word boundary, a run of word characters
# (or '+') that is backtracked so the match ends in at least one digit.
# Two earlier candidate patterns were assigned and immediately overwritten;
# only this final value ever took effect, so the dead assignments are gone.
PATTERN = r"\b[\w+]+\d+"


def _first_product_code(name):
    """Return the first PATTERN match in `name`, upper-cased, or None if there is none."""
    # PATTERN has no capture groups, so search().group(0) equals findall()[0];
    # the regex now runs once per row instead of twice.
    match = re.search(PATTERN, name)
    return match.group(0).upper() if match else None


# Underscores are turned into spaces before the code is extracted.
df['name'] = df['name'].apply(lambda x: x.replace('_', ' '))
df['product_code'] = df['name'].apply(_first_product_code)


In [65]:
df.to_csv(f"{brand}/tiktok.csv", index=False, columns=colums)


In [66]:
# count product_code not null
df[df['product_code'].notnull()].shape

(8, 15)

# Shopee

In [40]:
from pathlib import Path

In [41]:
all_items = []

In [42]:
# Rebuild the list from scratch so that re-running this cell does not append
# the same crawled items a second time.
all_items = []

with open(f"{brand}/all.json", "r", encoding="utf8") as f:
    all_cralwed = json.load(f)

for crawled in all_cralwed:
    # Some crawled entries wrap the payload in a "data" object, others keep
    # "items" at the top level.
    data = crawled.get('data')
    all_items += data['items'] if data else crawled['items']

In [43]:
results = []

In [44]:


def transform_product(item):
    """Flatten one crawled Shopee item into the flat product row used for export.

    If the item carries an ``item_basic`` object, that object is used instead
    of the item itself. Raw price fields are divided by 100000. A price range
    is only reported (``min_*`` set) when the minimum differs from the
    maximum, and "before discount" prices are only reported when positive.
    Sold counts of 1000 or more are rounded with ``round`` to a multiple of
    100. Up to five image ids become ``=image("...")`` strings; unused image
    slots are None.

    ``transform_link`` is not defined in this cell (presumably it comes from
    the ``utils`` star import).

    Returns:
        The product row as a dict, or None after printing the error and the
        offending item when anything goes wrong.
    """
    price_scale = 100000
    try:
        item = item['item_basic'] if "item_basic" in item else item

        product_id, shop_id, name = item['itemid'], item['shopid'], item['name']
        link = transform_link(name, shop_id, product_id)

        highest = item['price_max']
        max_price = highest / price_scale
        lowest = item['price_min']
        if lowest and lowest != highest:
            min_price = lowest / price_scale
        else:
            min_price = None

        historical_sold = item['historical_sold']
        if historical_sold >= 1000:
            sold_count = int(round(historical_sold / 100) * 100)
        else:
            sold_count = historical_sold

        highest_before = item['price_max_before_discount']
        max_original_price = highest_before / price_scale if highest_before > 0 else None
        lowest_before = item['price_min_before_discount']
        if lowest_before > 0 and lowest_before != highest_before:
            min_original_price = lowest_before / price_scale
        else:
            min_original_price = None

        # At most five images, padded with None up to exactly five slots.
        image_cells = [
            f'=image("https://down-vn.img.susercontent.com/file/{image}")'
            for image in item["images"][:5]
        ]
        image_cells.extend([None] * (5 - len(image_cells)))

        product_transformed = {
            "product_id": product_id,
            "name": name,
            "link": link,
            "type": "Sp lẻ",
            "min_price": min_price,
            "max_price": max_price,
            "sold_count": sold_count,
            "min_original_price": min_original_price,
            "max_original_price": max_original_price,
        }
        for slot, cell in enumerate(image_cells, start=1):
            product_transformed[f"image_{slot}"] = cell

        return product_transformed
    except Exception as e:
        print(e)
        print(json.dumps(item, indent=4))
        return None
    

In [45]:
len(all_items)

618

In [46]:
# transform_product returns None (after printing the error) for an item it
# cannot parse. Such entries are skipped so that the DataFrame built from this
# list and the later string clean-up on 'name' never see a None row.
transformed_products = []
for item in all_items:
    product = transform_product(item)
    if product is not None:
        transformed_products.append(product)

In [47]:
len(transformed_products)

618

In [48]:
# PATTERN = r"\b[A-Z]\d+|\sQ\d+|\sC\d+|\sQ\d+|\s\d{3,}\b"


In [49]:
df = pd.DataFrame(transformed_products)
columns = ["product_id","name","link","type","min_price","max_price","sold_count","image_1","image_2","image_3","image_4","image_5","min_original_price","max_original_price","product_code"]


def _last_product_code(name):
    """Return the last PATTERN match in `name`, upper-cased, or None if there is none."""
    # The regex runs once per row; the result is reused for both the emptiness
    # check and the value.
    codes = re.findall(PATTERN, name)
    return codes[-1].upper() if codes else None


df['type'] = 'Sp lẻ'
df['name'] = df['name'].apply(lambda x: x.replace('_', ' '))
# NOTE: PATTERN is not defined in this cell; it comes from the product-code
# cell of the TikTok section, which must have been run first. Unlike that
# section, the LAST match in the name is taken here.
df['product_code'] = df['name'].apply(_last_product_code)
print(df.shape)
df.to_csv(f"{brand}/shopee.csv", index=False, columns=columns)

(618, 15)


In [36]:
# Overwrites product_code with the FIRST pattern match of each name, without
# upper-casing. This only changes the in-memory frame: shopee.csv was already
# written by the export cell above.
# PATTERN has no capture groups, so search().group(0) equals findall()[0], and
# the regex runs once per row instead of twice.
df['product_code'] = df['name'].apply(
    lambda x: found.group(0) if (found := re.search(PATTERN, x)) else None
)

In [37]:
df.shape

(1266, 15)