### Load the CR dataspace
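Only CRs that meet both of the following requirements are considered:
- TSG approved: the approved result can roughly be treated as ground truth
- WG Tdoc available: the `WG Tdoc` field holds a document number (contains letters and a hyphen, at least 5 characters)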

In [None]:
import pandas as pd

# Read the CSV file
csv_path = "/path/to/your/csv/file.csv"
df = pd.read_csv(csv_path)

# Keep only the CRs whose TSG-level status is 'approved'.
# .copy() turns the filtered result into an independent DataFrame, so the
# column assignment below does not write into a slice of the frame that was
# read from disk (the pattern pandas reports as SettingWithCopyWarning).
df = df[df['TSG-level status'] == 'approved'].copy()

# Convert 'WG Tdoc' column to string type so that string checks can be
# applied to every row, including rows where the value is missing.
df['WG Tdoc'] = df['WG Tdoc'].astype(str)

def has_letters_and_sufficient_length(input_str):
    """Check whether a string is long enough, hyphenated and contains a letter.

    Args:
        input_str: The string to examine.

    Returns:
        True only when ``input_str`` has at least 5 characters, contains a
        ``-`` and contains at least one alphabetic character; otherwise False.
    """
    # Cheap structural checks first: minimum length and the hyphen.
    if len(input_str) < 5:
        return False
    if "-" not in input_str:
        return False

    # Finally require at least one alphabetic character anywhere in the string.
    for char in input_str:
        if char.isalpha():
            return True
    return False

# Evaluate the identifier check once per row, then keep the rows that pass
tdoc_is_valid = df['WG Tdoc'].apply(has_letters_and_sufficient_length)
df = df[tdoc_is_valid]

# Show the rows that remain after filtering
print(df)


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.auto import tqdm
import multiprocessing
import os

# Define a function to get the FTP path
def get_ftp_path(wg_tdoc, timeout=30):
    """Look up the link path(s) of a WG Tdoc via the netovate.com document search.

    Args:
        wg_tdoc: Tdoc identifier to search for; sent as the ``fname`` query
            parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            single stalled request cannot block a worker indefinitely.

    Returns:
        A comma-separated string of link targets (with the
        ``http://3gpp.org/ftp`` prefix removed) taken from the result rows
        whose second column mentions ``wg_tdoc``. An empty string when the
        result table is missing or no row matches.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
    """
    # Let requests build the query string so unusual characters in the Tdoc
    # identifier are URL-encoded instead of being pasted into the URL raw.
    response = requests.get(
        "http://netovate.com/doc-search/",
        params={"fname": wg_tdoc},
        timeout=timeout,
    )
    # Without this, an error page would parse as "no results" and the failure
    # would be indistinguishable from a Tdoc that simply has no match.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='crtable')
    if table is None:
        return ''

    ftp_paths = []
    # The first <tr> is skipped (presumably the header row).
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) < 2 or wg_tdoc not in cols[1].text:
            continue
        # A matching cell without a usable link is skipped instead of raising,
        # so one malformed row does not discard the other paths.
        link = cols[1].find('a')
        href = link.get('href') if link is not None else None
        if not href:
            continue
        ftp_paths.append(href.replace('http://3gpp.org/ftp', ''))
    return ','.join(ftp_paths)

# Define a function to handle retries
def get_ftp_path_with_retries(wg_tdoc, retries=3, backoff=1.0):
    """Call ``get_ftp_path`` up to ``retries`` times, pausing between attempts.

    Args:
        wg_tdoc: Tdoc identifier passed through to ``get_ftp_path``.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds before the next attempt; the delay
            doubles after every failure. Use ``0`` to retry immediately.

    Returns:
        The value returned by ``get_ftp_path`` on the first successful
        attempt, or ``None`` when every attempt raised.
    """
    import time  # local import: only needed for the pause between attempts

    for attempt in range(retries):
        try:
            return get_ftp_path(wg_tdoc)
        except Exception as e:
            # Deliberately broad: this is a best-effort lookup and one failing
            # Tdoc must not abort the whole batch.
            print(f"Error fetching {wg_tdoc}: {e}")
            # Wait before trying again, but not after the final attempt.
            if backoff > 0 and attempt < retries - 1:
                time.sleep(backoff * 2 ** attempt)
    return None

# Function to apply multiprocessing
def apply_multiprocessing(df, func, num_processes):
    """Run ``func`` over the 'WG Tdoc' column in a process pool.

    Args:
        df: DataFrame with a 'WG Tdoc' column; it is modified in place.
        func: Callable applied to each 'WG Tdoc' value in the worker processes.
        num_processes: Number of worker processes in the pool.

    Returns:
        The same ``df`` object, with the results stored in a new or
        overwritten 'FTP Path' column, in the same order as the input rows.
    """
    tdocs = df['WG Tdoc']
    with multiprocessing.Pool(num_processes) as workers:
        # NOTE(review): kept from the original; nothing in this function uses
        # the pandas integration that this call sets up.
        tqdm.pandas()
        # imap yields results in input order, so they line up with the rows.
        pending_results = workers.imap(func, tdocs)
        progress = tqdm(pending_results, total=len(df))
        df['FTP Path'] = list(progress)
    return df

# Load your DataFrame
# df = pd.read_csv('your_file.csv')

# Reverse the DataFrame so processing starts from the last row
df = df[::-1].reset_index(drop=True)

# Resume support: check for a previous output file BEFORE doing any lookups.
# Tdocs already present in that file are reused as-is, so only the remaining
# ones are fetched over the network.
output_file = 'output_with_ftp_paths.csv'
if os.path.exists(output_file):
    existing_df = pd.read_csv(output_file)
    # .copy() so that adding the 'FTP Path' column below does not write into
    # a slice of df.
    pending_df = df[~df['WG Tdoc'].isin(existing_df['WG Tdoc'])].copy()
else:
    existing_df = None
    pending_df = df

# Apply the function to each remaining row to get the FTP path using multiprocessing
num_processes = multiprocessing.cpu_count() * 10
if pending_df.empty:
    # Nothing left to fetch: skip starting the pool, but still create the
    # column so the save and print steps below work.
    pending_df['FTP Path'] = None
else:
    pending_df = apply_multiprocessing(pending_df, get_ftp_path_with_retries, num_processes)

# Previously saved rows first, followed by the newly fetched ones
if existing_df is not None:
    df = pd.concat([existing_df, pending_df], ignore_index=True)
else:
    df = pending_df

df.to_csv(output_file, index=False)

# Print results
print(df[['WG Tdoc', 'FTP Path']])

