# 下载所有Release文件

In [None]:
import os
import requests
import shutil
import zipfile

def download_and_extract(download_url, filename, extract_dir):
    """Download a zip archive, extract it, and delete the archive.

    Args:
        download_url: URL of the zip file to fetch.
        filename: Local path the archive is temporarily written to.
        extract_dir: Directory the archive contents are extracted into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    print("download_url", download_url)
    print("filename", filename)
    print("extract_dir", extract_dir)

    # Stream the download; the context manager releases the connection even
    # when the body is not fully consumed.
    with requests.get(download_url, stream=True, timeout=60) as response:
        # Fail early on 4xx/5xx instead of saving an error page as a ".zip".
        response.raise_for_status()
        try:
            with open(filename, 'wb') as archive:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        archive.write(chunk)

            # Extract the archive contents.
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
        finally:
            # Always remove the (possibly partial) archive, even if the
            # download or the extraction failed.
            if os.path.exists(filename):
                os.remove(filename)

def _iter_releases(url, headers):
    """Yield each release dict from the GitHub releases listing, page by page."""
    # GitHub returns only 30 releases per page by default; ask for the maximum.
    params = {'per_page': 100}
    while url:
        response = requests.get(url, headers=headers, params=params, timeout=30)
        if not response.ok:
            # Report the failure instead of silently producing no output.
            print("request failed", response.status_code, url)
            return
        yield from response.json()
        # requests parses the Link header; the "next" URL already carries the
        # query string, so no extra params are sent on follow-up requests.
        url = response.links.get('next', {}).get('url')
        params = None


def _copy_output_csvs(data_path, dest_file):
    """Copy every ``output*.csv`` found under ``data_path`` to ``dest_file``."""
    # NOTE(review): all matches share one destination name, so if a release
    # contains several matching files a later one overwrites an earlier one.
    for root, _dirs, files in os.walk(data_path):
        for file in files:
            if file.endswith('.csv') and file.startswith('output'):
                shutil.copyfile(os.path.join(root, file), dest_file)


def main(username, repo_name, auth_token):
    """Download the output CSVs of matching releases into the ``out`` folder.

    Lists the releases of ``username/repo_name`` through the GitHub API and,
    for every release whose tag starts with ``movielens-1m-detail``, downloads
    each ``application/zip`` asset, extracts it into a temporary directory, and
    copies the ``output*.csv`` files found under ``app/data`` to
    ``out/{repo_name}_{tag_name}_{release_id}_outputs.csv``.

    Args:
        username: Owner of the GitHub repository.
        repo_name: Name of the GitHub repository.
        auth_token: Token sent in the ``Authorization`` header.
    """
    url = f'https://api.github.com/repos/{username}/{repo_name}/releases'
    headers = {'Authorization': f'token {auth_token}'}

    for release in _iter_releases(url, headers):
        release_id = release['id']
        tag_name = release['tag_name']
        if not tag_name.startswith('movielens-1m-detail'):
            continue

        # Download and unpack the data files shipped as zip assets.
        for asset in release['assets']:
            if asset['content_type'] != 'application/zip':
                continue

            extract_dir = f'{repo_name}_extracted_{release_id}'
            os.makedirs(extract_dir, exist_ok=True)
            try:
                download_and_extract(
                    asset['browser_download_url'], asset['name'], extract_dir
                )
                data_path = os.path.join(extract_dir, 'app/data')
                print("data_path", data_path)
                os.makedirs('out', exist_ok=True)
                if os.path.exists(data_path):
                    dest_file = f'out/{repo_name}_{tag_name}_{release_id}_outputs.csv'
                    _copy_output_csvs(data_path, dest_file)
            finally:
                # Remove the extraction directory even if a step above failed.
                shutil.rmtree(extract_dir)

# Read the GitHub API token from the local AUTH_TOKEN file (surrounding
# whitespace is stripped), then run the download for the target repository.
auth_token = ""
with open('AUTH_TOKEN', 'r') as token_file:
    auth_token = token_file.read().strip()
main(username="includeno", repo_name="spider_movielens_1m", auth_token=auth_token)


# 合并out文件夹内所有csv

In [1]:
import os
import pandas as pd

input_folder = 'out'  # folder holding the per-release CSV files

# Sort the listing: os.listdir returns entries in arbitrary order, and the
# keep='last' de-duplication below depends on the order the files are read.
csv_files = sorted(
    os.path.join(input_folder, f)
    for f in os.listdir(input_folder)
    if f.endswith('.csv')
)

dfs = [pd.read_csv(csv_file, encoding='utf-8') for csv_file in csv_files]
count = len(dfs)
print(count)

FOLDER = 'data'
os.makedirs(FOLDER, exist_ok=True)

if dfs:
    # Stack all files, then keep only the last row seen for each url.
    df = pd.concat(dfs, axis=0, ignore_index=True)
    df = df.drop_duplicates(subset=['url'], keep='last')
    df.to_csv(f'{FOLDER}/concat.csv', index=False, encoding='utf-8')
else:
    # pd.concat raises on an empty list; report the situation instead.
    print(f'no csv files found in {input_folder}')


16
