In [19]:
import os
from loguru import logger
from apis.pc_apis import XHS_Apis
from xhs_utils.common_utils import init
from xhs_utils.data_util import handle_note_info, download_note, save_to_xlsx, timestamp_to_str
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import time
import pandas as pd
from itertools import cycle
import logging
from tqdm.auto import tqdm 


# logging as before…
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
class Data_Spider():
    def __init__(self):
        self.xhs_apis = XHS_Apis()

    def spider_search_note_all_comment(self, note_info: dict, cookies_str: str, proxies: dict = None):
        """
            Crawling all comments of a note
            :param note_info: Note information
            :param cookies_str: Cookies string
            :param proxies: Proxy settings
            :return: success, msg, list_all_comment
        """
        list_all_comment = []
        try:
            time.sleep(random.uniform(1, 3))
            note_url = note_info["note_url"]
            success, msg, list_all_comment = self.xhs_apis.get_note_all_comment(note_url, cookies_str, proxies)
            if not success:
                raise Exception(msg)
        except Exception as e:
            success = False
            msg = e
        logger.info(f'Retrieving all comments - {note_url}: {success}, msg: {msg}')
        return success, msg, list_all_comment
    
    def spider_note(self, note_url: str, cookies_str: str, proxies=None):
        """
            爬取一个笔记的信息
            :param note_url:
            :param cookies_str:
            :return:
        """
        note_info = None
        try:
            success, msg, note_info = self.xhs_apis.get_note_info(note_url, cookies_str, proxies)
            if success:
                note_info = note_info['data']['items'][0]
                note_info['url'] = note_url
                note_info = handle_note_info(note_info)
        except Exception as e:
            success = False
            msg = e
        logger.info(f'Retrieving note info - {note_url}: {success}, msg: {msg}')

        try:
            # Add on comments in note_info (think like scrape 1 note_info and its all comments in next steps)
            time.sleep(random.uniform(1, 3))
            success, msg, list_all_comment = self.spider_search_note_all_comment(note_info, cookies_str, proxies)
            if success:
                note_info['comments'] = list_all_comment
            else:
                note_info['comments'] = []
        except Exception as e:
            success = False
            msg = e
        return success, msg, note_info

    def spider_some_note(self, list_notes_url: list, query:str, cookies_str: str, proxies=None):
        """
        爬取一些笔记的信息
        :param list_notes:
        :param cookies_str:
        :param base_path:
        :return:
        """
        list_note_info = []
        outer_pbar = tqdm(total=len(list_notes_url), desc=f"Processing note_info on keyword '{query}'", unit="note_info", leave=False)
        for note_url in list_notes_url:
            success, msg, note_info = self.spider_note(note_url, cookies_str, proxies)
            if note_info is not None and success:
                list_note_info.append(note_info)
            else:
                logger.error(f"Failed to retrieve note info: {msg}")
            outer_pbar.update(1)
        outer_pbar.close()
        logger.info(f'Processed {len(list_notes_url)} notes.')
        return list_note_info

    def spider_some_search_note(self, query: str, require_num: int, cookies_str: str, sort="general", proxies=None, set_url=None):
        """
            Search and crawl notes, according to the keyword and number of notes
            Args:
                query (str): search keyword
                require_num (int): number of notes to be crawled
                cookies_str (str): Cookies
                sort (str, optional): general
                proxies (dict | None, optional): proxies settings, to avoid IP blockage
                set_url (set | None, optional): To remove duplicate URLs

            Returns:
                tuple[bool, str, list[dict]]: (success, msg, list_note_info)
        """
        time.sleep(3)
        list_note_url = []
        max_retries = 3
        attempt = 1
        while attempt <= max_retries:
            try:
                success, msg, list_notes = self.xhs_apis.search_some_note(query, require_num, cookies_str, sort, proxies)
                if success:
                    list_notes = list(filter(lambda x: x['model_type'] == "note", list_notes))
                    logger.info(f'Searching keyword "{query}", note count: {len(list_notes)}')
                    for note in list_notes:
                        note_url = f"https://www.xiaohongshu.com/explore/{note['id']}?xsec_token={note['xsec_token']}"
                        if isinstance(set_url, set):
                            if note_url in set_url:
                                continue
                            set_url.add(note_url)
                        list_note_url.append(note_url)
                list_note_info = self.spider_some_note(list_note_url, query, cookies_str, proxies)
                logger.info(f'Searching keyword "{query}", note: {success}, msg: {msg}')
                return success, msg, list_note_info
            
            except Exception as e:
                success = False
                msg = e
                attempt += 1
                logger.warning(f"Attempt {attempt} failed: {msg}")
                time.sleep(2 ** attempt)


In [None]:
# # Flatten helper
# def handle_list_comment(note_comment):
#     list_dict_comment = []
#     list_dict_subcomment = []
#     for comment in note_comment:
#         list_dict_comment.append(comment)
#         if "sub_comments" in comment:
#             for sub in comment["sub_comments"]:
#                 sub["comment_id"] = comment["id"]
#                 list_dict_subcomment.append(sub)
#     return list_dict_comment, list_dict_subcomment

# # Retry boilerplate
# MAX_RETRIES = 3
# BACKOFF_BASE = 2

# def with_retry(fn):
#     def wrapped(*args, **kwargs):
#         for attempt in range(1, MAX_RETRIES + 1):
#             try:
#                 return fn(*args, **kwargs)
#             except Exception as e:
#                 if attempt < MAX_RETRIES:
#                     delay = BACKOFF_BASE ** attempt + random.random()
#                     logger.warning(f"[{fn.__name__}] attempt {attempt} failed: {e!r}. retrying in {delay:.1f}s")
#                     time.sleep(delay)
#                 else:
#                     logger.error(f"[{fn.__name__}] all {MAX_RETRIES} attempts failed: {e!r}")
#                     raise
#     return wrapped

# @with_retry
# def delayed_spider_search_note(keyword, cookies_str, xhs_spider, num_search, set_url):
#     # fixed 7s pre-delay
#     time.sleep(random.uniform(4, 7))
#     success, msg, notes = xhs_spider.spider_some_search_note(keyword, num_search, cookies_str, set_url=set_url)
#     if not success:
#         raise RuntimeError(f"API returned success=False, msg={msg!r}")
#     # fixed 7s post-delay
#     time.sleep(random.uniform(4, 7))
#     return notes

# @with_retry
# def delayed_spider_search_comments(note_info, cookies_str, xhs_spider):
#     # fixed 10s pre-delay
#     time.sleep(random.uniform(4, 7))
#     success, msg, comments = xhs_spider.spider_search_note_all_comment(note_info, cookies_str)
#     if not success:
#         raise RuntimeError(f"API returned success=False, msg={msg!r}")
#     # fixed 10s post-delay
#     time.sleep(random.uniform(4, 7))
#     return comments

### XHS Note scraping

In [20]:
from xhs_utils.common_utils import init

cookies_str="abRequestId=b09f6390-3782-5b9a-8a12-527192621d8d; webBuild=4.62.3; a1=1963764c6d7ib7qqfcmnh47ilgwy6mmrhrbosup1830000356016; webId=841fd19f1e873a1b805ec456c22d6b01; gid=yjKqWK4i2YxqyjKqWK4SKF7UfW3DWAAiS6IxDyY4W31kElq8vK66FM888q2K8yK8dYJyqK8q; xsecappid=xhs-pc-web; acw_tc=0a4a88f517459845072174917e3606078ee10324bf2fa004970c47f5c17cd5; websectiga=2a3d3ea002e7d92b5c9743590ebd24010cf3710ff3af8029153751e41a6af4a3; sec_poison_id=1911d4df-e6d5-4311-b50c-2984efb14d91; web_session=040069b68989bc79b3e637d4243a4b4eaae933; unread={%22ub%22:%2267fb28f0000000001202c922%22%2C%22ue%22:%2267f641ec000000001c0304f0%22%2C%22uc%22:33}"
def trans_cookies(cookies_str):
    if '; ' in cookies_str:
        ck = {i.split('=')[0]: '='.join(i.split('=')[1:]) for i in cookies_str.split('; ')}
    else:
        ck = {i.split('=')[0]: '='.join(i.split('=')[1:]) for i in cookies_str.split(';')}
    return ck

def reverse_trans_cookies(cookies_dict):
    return '; '.join([f'{k}={v}' for k, v in cookies_dict.items()])

cookies_dict = {
    'a1': '196374a2390wm4erg4g8raaupcvf5iyfq8gnp3kqy30000144617',
    'web_session': '040069b96c5aba102af269522d3a4b34ec0697'
}
cookies_str = reverse_trans_cookies(cookies_dict)

In [None]:
url = "https://www.xiaohongshu.com/explore/67e30ae1000000000903b3b7?xsec_token=ABLXM1oEHnZmA6xiBLs4CDB8iResK8T1E-0-xHTZdbrig=&xsec_source=pc_search&source=web_explore_feed"


def reverse_trans_cookies(cookies_dict):
    return '; '.join([f'{k}={v}' for k, v in cookies_dict.items()])

cookies_dict = {
    'a1': '196374a2390wm4erg4g8raaupcvf5iyfq8gnp3kqy30000144617',
    'web_session': '040069b96c5aba102af269522d3a4b34ec0697'
}
cookies_str = reverse_trans_cookies(cookies_dict)

xhs_api = Data_Spider()
set_url = set()
data = xhs_api.spider_some_search_note("ifast global bank", 20, cookies_str, set_url=set_url)
# data = xhs_api.spider_search_note_all_comment("ifast global bank", cookies_str, xhs_api, 3, set_url=set_url)


INFO:__main__:Searching keyword "ifast global bank", note count: 3


Processing note_info on keyword 'ifast global bank':   0%|          | 0/3 [00:00<?, ?note_info/s]

INFO:__main__:Retrieving note info - https://www.xiaohongshu.com/explore/681858d0000000002202edb1?xsec_token=AB0IsqDuvVKfeix1TVyEZyJ3nQQgEzN9dqQehkuFJM-EM=: True, msg: success
INFO:__main__:Retrieving all comments - https://www.xiaohongshu.com/explore/681858d0000000002202edb1?xsec_token=AB0IsqDuvVKfeix1TVyEZyJ3nQQgEzN9dqQehkuFJM-EM=: True, msg: success
INFO:__main__:Retrieving note info - https://www.xiaohongshu.com/explore/65fc0a27000000001203d8b4?xsec_token=ABzmyuLp8zek497CohhVbRO0vL2LFuo07F8MjhXopGmfU=: True, msg: success
INFO:__main__:Retrieving all comments - https://www.xiaohongshu.com/explore/65fc0a27000000001203d8b4?xsec_token=ABzmyuLp8zek497CohhVbRO0vL2LFuo07F8MjhXopGmfU=: True, msg: success
INFO:__main__:Retrieving note info - https://www.xiaohongshu.com/explore/67e29d0a000000000703645e?xsec_token=AB3Ky9JehAYxfqKlRugirr9npRQslw-WQaV3AlSe9ceNc=: True, msg: success
INFO:__main__:Retrieving all comments - https://www.xiaohongshu.com/explore/67e29d0a000000000703645e?xsec_token=AB

In [23]:
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Initialize
    xhs_spider = Data_Spider()
    cookies_str, base_path = init()

    list_keywords = [
        "ifast global bank", "ifast gb", #"ifast debit card", "ifast银行", "ifast借记卡",
        #  "ifast cashback", "奕丰环球银行", "ifast开户", "ifast入金", "ifast返现"
    ]
    num_search = 3
    list_search_note = []
    set_note_url = set()
    # === Sequential note searches ===
    for kw in list_keywords:
        try:
            list_note_info_per_keyword = xhs_api.spider_some_search_note(kw, num_search, cookies_str, set_url=set_note_url)
            for note_info in list_note_info_per_keyword:
                note_info["keyword"] = kw
            list_search_note.extend(list_note_info_per_keyword)
        except Exception as e:
            logger.error(f"[ERROR] Note search for '{kw}' aborted: {e}")


    # Define folder_path 
    folder_path = "datas/parquet_datas"

    # Note info dataframe
    schema_name = "mimir_igb_dpb_external_data.igb_xhs_scrape_data"
    df_pandas_note_info = pd.DataFrame(list_search_note)
    # df_pandas_note_info = df_pandas_note_info.drop(columns=["avatar"])
    df_pandas_note_info = df_pandas_note_info.rename(columns={"upload_time": "create_time", "nickname": "author_nickname", "user_url": "author_url"})
    df_pandas_note_info.to_parquet(f"{folder_path}/xhs_api_note_info.parquet", engine="pyarrow", index=False)


INFO:__main__:Searching keyword "ifast global bank", note count: 0


Processing note_info on keyword 'ifast global bank': 0note_info [00:00, ?note_info/s]

INFO:__main__:Processed 0 notes.
INFO:__main__:Searching keyword "ifast global bank", note: True, msg: success
ERROR:__main__:[ERROR] Note search for 'ifast global bank' aborted: 'bool' object does not support item assignment
INFO:__main__:Searching keyword "ifast gb", note count: 0


Processing note_info on keyword 'ifast gb': 0note_info [00:00, ?note_info/s]

INFO:__main__:Processed 0 notes.
INFO:__main__:Searching keyword "ifast gb", note: True, msg: success
ERROR:__main__:[ERROR] Note search for 'ifast gb' aborted: 'bool' object does not support item assignment


In [None]:
from pydantic import BaseModel
from openai import AzureOpenAI
from typing import List
import os
import json
from dotenv import load_dotenv

load_dotenv()

# Azure OpenAI configuration
AOAI_API_TYPE = os.getenv("AOAI_API_TYPE")
AOAI_API_VERSION = os.getenv("AOAI_API_VERSION")
AOAI_API_KEY = os.getenv("AOAI_API_KEY")
AOAI_API_ENDPOINT = os.getenv("AOAI_API_ENDPOINT")
AOAI_DEPLOYMENT_NAME = os.getenv("AOAI_DEPLOYMENT_NAME")


In [None]:
client = AzureOpenAI(
    api_key=AOAI_API_KEY,
    api_version=AOAI_API_VERSION,
    azure_endpoint=AOAI_API_ENDPOINT,
    azure_deployment=AOAI_DEPLOYMENT_NAME
)

class RelevancyScoreOutput(BaseModel):
    relevancy_score: float
    relevancy_reasoning: str

class BatchRelevancyOutput(BaseModel):
    results: List[RelevancyScoreOutput]

def evaluate_relevance_batch(posts: List[dict], batch_size: int = 10)-> List[RelevancyScoreOutput]:
    system_prompt = """
You are a bilingual Chinese–English language model focused on identifying relevance to iFAST's financial services.

Each post includes:
    - keywords
    - hashtags
    - title
    - description

Your task:
	- Ignore posts about jobs, careers, interviews, or working at iFAST (score = 0.00)
    - For all others, assess relevance to iFAST’s offerings, including:
    - Account opening or management
    - Investment products/platforms
    - Digital banking & financial services
    - Currency, cash management, or customer service

Evaluate using:
	1.	Keywords and hashtags
	2.	Semantic match of title + description

For each post:
	1.	Assign a relevancy_score (0.00–1.00, two decimals)
	2.	Provide a brief relevancy_reasoning (1–2 sentences)

"""
    all_outputs: List[RelevancyScoreOutput] = []

    for i in range(0, len(posts), batch_size):
        batch = posts[i : i + batch_size]
        payload = json.dumps({"posts": batch}, ensure_ascii=False)

        # Use the .parse endpoint to get Pydantic parsing
        resp = client.beta.chat.completions.parse(
            model=AOAI_DEPLOYMENT_NAME,
            messages=[
                {"role": "system",  "content": system_prompt},
                {"role": "user",    "content": payload}
            ],
            temperature=0.2,
            max_tokens=2000,
            response_format=BatchRelevancyOutput
        )

        # resp.choices[0].message.parsed will be a BatchRelevancyOutput
        batch_output: BatchRelevancyOutput = resp.choices[0].message.parsed
        all_outputs.extend(batch_output.results)

    return all_outputs

# Example usage
if __name__ == "__main__":
    posts = [
        {
            "keywords":    row["keyword"],
            "hashtag":     row["tags"],
            "title":       row["title"],
            "description": row["desc"]
        }
        for _, row in df_pandas_note_info.iterrows()
    ]

    outputs = evaluate_relevance_batch(posts, batch_size=10)

    df_pandas_note_info_output = df_pandas_note_info.copy()

    df_pandas_note_info_output = pd.concat([
        df_pandas_note_info.reset_index(drop=True),
        pd.DataFrame([o.dict() for o in outputs])
    ], axis=1)

    df_pandas_note_info_output.to_csv("test.csv", encoding="utf_8_sig")

In [None]:
df_pandas_note_info_output = pd.concat([
    df_pandas_note_info.reset_index(drop=True),
    pd.DataFrame([o.dict() for o in outputs])
], axis=1)

df_pandas_note_info_output = df_pandas_note_info_output[df_pandas_note_info_output["relevancy_score"] >= 0.6]

df_pandas_note_info_output

### XHS Note comment scraping

In [None]:
df = pd.read_parquet("/Users/enghui.lau/Documents/Work/igb_scrape/xhs_spider/datas/parquet_datas/xhs_api_note_info.parquet")

len(df[df["comment_count"].astype(int) > 0])

In [None]:
if __name__ == "__main__":
    list_search_comment = []
    list_search_subcomment = []

    # === Sequential comment fetches ===
    for note in list_search_note:
        # cookie = next(cookie_cycle_comments)
        try:
            raw_comments = delayed_spider_search_comments(note, cookies_str, xhs_spider)
            comments, subcomments = handle_list_comment(raw_comments)
            list_search_comment.extend(comments)
            list_search_subcomment.extend(subcomments)
        except Exception as e:
            logger.error(f"[ERROR] Comments fetch aborted for note ID {note.get('id')}: {e}")


    # Main Comment dataframe
    df_pandas_main_comment = pd.DataFrame(list_search_comment)
    df_pandas_main_comment = df_pandas_main_comment.drop(columns=["pictures", "user_info", "at_users", "status", "sub_comment_cursor", "sub_comment_has_more", "show_tags", "sub_comments"])
    df_pandas_main_comment["create_time"] = df_pandas_main_comment["create_time"].apply(lambda x: timestamp_to_str(x))
    df_pandas_main_comment = df_pandas_main_comment.rename(columns={"comment_id": "main_comment_id", "content": "main_comment"})
    df_pandas_main_comment.to_parquet(f"{folder_path}/xhs_api_main_comment.parquet", engine="pyarrow", index=False)

    # Sub Comment dataframe
    df_pandas_sub_comment = pd.DataFrame(list_search_subcomment)
    df_pandas_sub_comment = df_pandas_sub_comment.drop(columns=["status", "user_info", "show_tags", "target_comment", "at_users", "pictures"])
    df_pandas_sub_comment["create_time"] = df_pandas_sub_comment["create_time"].apply(lambda x: timestamp_to_str(x))
    df_pandas_sub_comment = df_pandas_sub_comment.rename(columns={"comment_id": "main_comment_id", "content": "sub_comment"})
    df_pandas_sub_comment.to_parquet(f"{folder_path}/xhs_api_sub_comment.parquet", engine="pyarrow", index=False)
