In [None]:
import argparse
import json
import os
import random
import re
import sys
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md


In [None]:
# Base URL of the site; scraped hrefs are combined with it to build request URLs.
root_url = 'https://www.biomart.cn/'
# Directory under which the category folders and Markdown files are written.
output_dir = '/public/home/liujunwu/workdir/MEPC/ExperimentMethod'
# Retry settings read by try_request: number of attempts, and pause between attempts (seconds)
max_retries = 5
retry_interval = 2

In [None]:
def try_request(url):
    """Download ``url`` and return it parsed as HTML, retrying on request errors.

    Up to ``max_retries`` GET attempts are made; after a failed attempt the
    function sleeps ``retry_interval`` seconds before trying again. Every
    attempt is made with a timeout so a stalled server cannot block the crawl.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` if no attempt succeeded. Callers must handle ``None``.
    """
    # Default result: keeps the return well defined even when max_retries <= 0.
    soup = None
    for attempt in range(max_retries):
        try:
            # Per-attempt timeout of 3 seconds.
            response = requests.get(url, timeout=3)
            # Turn HTTP error statuses (4xx/5xx) into an exception so they are retried too.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            break  # success, stop retrying
        except requests.exceptions.RequestException:
            if attempt < max_retries - 1:
                time.sleep(retry_interval)
            else:
                print("Max retries reached. Giving up.")
    return soup

In [None]:
def get_method(url3,title3,dir3):
    """Download one method page and save its content boxes as a Markdown file.

    The page's ``rich-box`` divs are converted to Markdown and written, in page
    order, to ``{dir3}/{title3}.md``. Each box is preceded by a ``##`` heading
    taken from the heading div at the same position, when one exists.

    Args:
        url3: href of the method page, resolved against ``root_url``.
        title3: Output file name without the ``.md`` extension.
        dir3: Existing directory the file is written into.

    Returns:
        None. If the page cannot be fetched a message is printed and no file
        is written.
    """
    # urljoin avoids the "//" that plain concatenation produces for hrefs starting with "/".
    full_url = urljoin(root_url, url3)
    soup = try_request(full_url)
    if soup is None:
        # try_request already retried; skip this page instead of crashing the crawl.
        print(f'Failed to fetch {full_url}; skipping')
        return

    # Section headings, positionally aligned with the content boxes below.
    # None marks a heading div that has no <p> to read a title from.
    headings = []
    for div in soup.find_all('div', class_='tw-flex tw-items-center tw-my-20'):
        p_tag = div.find('p')
        if p_tag is None:
            headings.append(None)
            continue
        heading = re.sub(r'[^\w\u4e00-\u9fff\sa-zA-Z]', '', p_tag.get_text()).strip()
        headings.append(heading)

    # Convert every content box to Markdown.
    sections = [md(str(div)) for div in soup.find_all('div', class_='rich-box')]

    outfile_name = f"{dir3}/{title3}"
    with open(f"{outfile_name}.md", "w", encoding="utf-8") as f:
        for i, markdown_content in enumerate(sections):
            # A page may have fewer headings than boxes; write the box without a heading then.
            heading = headings[i] if i < len(headings) else None
            if heading is not None:
                f.write(f"## {heading}\n\n")
            f.write(markdown_content)
            f.write("\n\n---\n\n")
    # Pause between pages to limit the request rate.
    time.sleep(2)

In [None]:
# Smoke test: fetch a single method page and write it as test.md inside output_dir.
get_method('/lab-web/method/34ndj3ogo2e00.html','test',output_dir)

In [None]:
def exp_level2(url2,dir2):
    """Crawl one experiment page and save every method linked from it.

    Each method card (an ``<a>`` with the long Tailwind class list below) is
    passed to ``get_method`` with the card's sanitized title as the file name.

    Args:
        url2: href of the experiment page, resolved against ``root_url``.
        dir2: Existing directory the method Markdown files are written into.

    Returns:
        None. If the page cannot be fetched a message is printed and nothing
        is crawled.
    """
    # urljoin avoids the "//" that plain concatenation produces for hrefs starting with "/".
    full_url = urljoin(root_url, url2)
    soup = try_request(full_url)
    if soup is None:
        # try_request already retried; skip this page instead of crashing the crawl.
        print(f'Failed to fetch {full_url}; skipping')
        return

    # Locate the method cards.
    links  = soup.find_all('a', class_='tw-mb-10 tw-inline-block tw-w-full tw-cursor-pointer tw-rounded-8 tw-bg-other-400 tw-p-20 last:tw-mb-none hover:tw-bg-other-300 hover:tw-text-current md:tw-mb-20')
    for link in links:
        # The title normally sits in a <b>; fall back to the whole link text when it is absent.
        title_tag = link.find('b')
        raw_title = title_tag.get_text() if title_tag is not None else link.get_text()
        # Drop punctuation so the title is usable as a file name.
        method_title = re.sub(r'[^\w\u4e00-\u9fff\sa-zA-Z]', '', raw_title).strip()
        href = link.get('href')
        if href:
            print(f'{href}' + '\t' + f'{method_title}')
            get_method(href,method_title,dir2)
        else:
            print ('not found')
        # Pause between cards to limit the request rate.
        time.sleep(1)

In [None]:
def exp_level1(url1,dir1):
    """Crawl every listing page of one category and descend into each experiment.

    The number of listing pages is read from the pagination ``<li title=...>``
    items of the first page. For every linked experiment a sub-directory named
    after its sanitized link text is created under ``dir1`` and ``exp_level2``
    is called on it.

    Args:
        url1: href of the category's first listing page, resolved against
            ``root_url``. Further pages are requested as ``<url>-<n>``.
        dir1: Directory under which the experiment sub-directories are created.

    Returns:
        None. If the first page cannot be fetched nothing is crawled; a later
        page that cannot be fetched is reported and skipped.
    """
    # urljoin avoids the "//" that plain concatenation produces for hrefs starting with "/".
    full_url = urljoin(root_url, url1)
    soup = try_request(full_url)
    if soup is None:
        # try_request already retried; skip this category instead of crashing the crawl.
        print(f'Failed to fetch {full_url}; skipping')
        return

    # Highest numeric page label among the pagination items; items without a
    # link or with a non-numeric label are ignored.
    max_page_id = 1
    for page in soup.find_all('li', title=True):
        anchor = page.find('a')
        if anchor is None:
            continue
        try:
            page_no = int(anchor.get_text())
        except ValueError:
            continue
        max_page_id = max(max_page_id, page_no)
    print (max_page_id)
    print ('\n')

    for i in range(1, max_page_id + 1):
        if i == 1:
            # The first page is already downloaded.
            soup_new = soup
        else:
            new_url = full_url.rstrip('/') + '-' + str(i)
            print (new_url)
            soup_new = try_request(new_url)
            if soup_new is None:
                print(f'Failed to fetch {new_url}; skipping')
                continue
        divs = soup_new.find_all('div', class_='tw-overflow-hidden tw-bg-white tw-rounded-8')
        for div in divs:
            for link in div.find_all('a'):
                href = link.get('href')
                # Drop punctuation so the link text is usable as a directory name.
                text = re.sub(r'[^\w\u4e00-\u9fff\sa-zA-Z]', '', link.get_text()).strip()
                if href:
                    print(f'{href}' + '\t' + f'{text}')
                    outdir = dir1 + '/' + text
                    os.makedirs(outdir,exist_ok=True)
                    exp_level2(href,outdir)
            # Pause between cards and between pages to limit the request rate.
            time.sleep(2)
        time.sleep(2)


In [None]:
# Crawl a single category (all of its listing pages) directly into output_dir.
exp_level1('/lab-web/exp/316nrk8go403k/',output_dir)

In [None]:
# Full crawl: open the experiment index page and descend into every category
# tab found on it, writing each category into its own folder under output_dir.
basic_url = root_url + 'lab-web/exp/'
# Explicit timeout (seconds): without one, requests can wait indefinitely on a stalled server.
response = requests.get(basic_url, timeout=10)
if response.status_code == 200:
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each category tab is a <div class="ant-tabs-tab"> containing the category link(s).
    divs = soup.find_all('div', class_='ant-tabs-tab')
    for div in divs:
        links = div.find_all('a')
        # Walk every <a> in the tab and follow its href
        for link in links:
            href = link.get('href')
            # Drop punctuation so the link text is usable as a directory name.
            text = re.sub(r'[^\w\u4e00-\u9fff\sa-zA-Z]', '', link.get_text()).strip()
            if href:
                print(f'{href}' + '\t' + f'{text}')
                outdir = output_dir + '/' + text
                os.makedirs(outdir,exist_ok=True)
                exp_level1(href,outdir)
                time.sleep(1)
        # Longer pause between categories to limit the request rate.
        time.sleep(5)
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')

In [None]:
def rerun(new_url,dir1):
    """Re-crawl a single listing page whose earlier crawl was incomplete.

    Every experiment linked from the page gets a sub-directory under ``dir1``
    (named after its sanitized link text) and is crawled with ``exp_level2``.

    Args:
        new_url: Absolute URL of the listing page to fetch again.
        dir1: Directory under which the experiment sub-directories are created.

    Returns:
        None. If the page still cannot be fetched a message is printed and
        nothing is crawled.
    """
    # Use the shared fetch helper: it checks the status of *this* request,
    # retries for real and applies a timeout.
    soup_new = try_request(new_url)
    if soup_new is None:
        print(f'Failed to fetch {new_url}; skipping')
        return

    divs = soup_new.find_all('div', class_='tw-overflow-hidden tw-bg-white tw-rounded-8')
    for div in divs:
        for link in div.find_all('a'):
            href = link.get('href')
            # Drop punctuation so the link text is usable as a directory name.
            text = re.sub(r'[^\w\u4e00-\u9fff\sa-zA-Z]', '', link.get_text()).strip()
            if href:
                print(f'{href}' + '\t' + f'{text}')
                outdir = dir1 + '/' + text
                os.makedirs(outdir,exist_ok=True)
                exp_level2(href,outdir)
    print ('done')

In [None]:
# Load the list of pages to crawl again: a tab-separated file with a header row.
# The loop that consumes this frame reads its 'url' and 'dir' columns.
rerun_data = pd.read_csv('/public/home/liujunwu/workdir/MEPC/ExperimentMethod/rerun.list3',header=0,sep='\t')
#rerun_data['dir'] = rerun_data['dir'].apply(lambda x:'/public/home/liujunwu/workdir/MEPC/ExperimentMethod/'+str(x))
# Quick sanity check of the loaded table: row/column count, then the first rows.
print (rerun_data.shape)
rerun_data.head()

In [None]:
# Re-crawl every listed page, one row at a time.
for index, row in rerun_data.iterrows():
    # Progress output: which page is being processed and its row index.
    print (row['url'],row['dir'])
    print (index)
    rerun(row['url'],row['dir'])
    # Pause between pages to limit the request rate.
    time.sleep(10)

In [None]:
### Re-crawl the documents under empty directories
