In [13]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import time
import re
from google.colab import drive
from tqdm import tqdm
drive.mount('/content/drive')
import zipfile
import os
import gc

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# 定义路径
zip_path = '/content/drive/MyDrive/academic_tree/data/Scinet/SciSciNet_PaperDetails.tsv.zip'
extract_dir = '/content/drive/MyDrive/academic_tree/data/Scinet/'  # 解压到同一目录

In [None]:
# 解压到指定目录
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)
    print("文件已解压，包含以下文件：")
    print(zip_ref.namelist())

文件已解压，包含以下文件：
['SciSciNet_PaperDetails.tsv']


In [8]:
# ========== 配置路径 ==========
input_path = '/content/drive/MyDrive/academic_tree/data/Scinet/SciSciNet_PaperDetails.tsv'
output_dir = '/content/drive/MyDrive/academic_tree/data/Scinet/chunks/'  # 新输出目录

# 创建输出目录（如果不存在）
os.makedirs(output_dir, exist_ok=True)
output_prefix = os.path.join(output_dir, 'chunk_')

chunk_size = 5_000_000  # 每次处理500万行
# ========== 分块读取、过滤并保存 ==========
total_saved = 0
total_dropped = 0

for i, chunk in enumerate(pd.read_csv(input_path, sep='\t', chunksize=chunk_size, low_memory=False)):
    # 过滤：删除 RetractionType 为空的行
    before_filter = len(chunk)
    chunk_filtered = chunk[chunk['RetractionType'].notna()]  # 保留非空值
    after_filter = len(chunk_filtered)
    dropped = before_filter - after_filter
    total_saved += after_filter
    total_dropped += dropped

    # 保存过滤后的数据
    output_path = f"{output_prefix}{i:03d}.csv"
    chunk_filtered.to_csv(output_path, index=False)

    print(f"✓ 第 {i+1} 块: 原{before_filter:,}行 → 保留{after_filter:,}行, 删除{dropped:,}行 | 保存至: {output_path}")

    del chunk, chunk_filtered  # 显式删除
    gc.collect()

print(f"\n{'='*50}")
print(f"处理完成！")
print(f"总计保留: {total_saved:,} 行")
print(f"总计删除: {total_dropped:,} 行")
print(f"文件保存在: {output_dir}")

✓ 第 1 块: 原5,000,000行 → 保留1,240行, 删除4,998,760行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_000.csv
✓ 第 2 块: 原5,000,000行 → 保留1,212行, 删除4,998,788行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_001.csv
✓ 第 3 块: 原5,000,000行 → 保留1,193行, 删除4,998,807行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_002.csv
✓ 第 4 块: 原5,000,000行 → 保留1,107行, 删除4,998,893行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_003.csv
✓ 第 5 块: 原5,000,000行 → 保留1,142行, 删除4,998,858行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_004.csv
✓ 第 6 块: 原5,000,000行 → 保留1,198行, 删除4,998,802行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_005.csv
✓ 第 7 块: 原5,000,000行 → 保留1,184行, 删除4,998,816行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_006.csv
✓ 第 8 块: 原5,000,000行 → 保留1,207行, 删除4,998,793行 | 保存至: /content/drive/MyDrive/academic_tree/data/Scinet/chunks/chunk_007.csv
✓ 第 9 块: 原5,000,

In [20]:
from tqdm import tqdm
# ========== 1. 合并所有分块文件 ==========
input_dir = '/content/drive/MyDrive/academic_tree/data/Scinet/chunks/'
output_file = '/content/drive/MyDrive/academic_tree/data/Scinet/retraction.csv'

# 获取所有chunk文件并排序
csv_files = sorted([f for f in os.listdir(input_dir) if f.startswith('chunk') and f.endswith('.csv')])

print(f"找到 {len(csv_files)} 个文件，开始合并...")

# 首次写入（包含表头）
first_file = True
for file in tqdm(csv_files, desc="合并进度"):
    chunk = pd.read_csv(os.path.join(input_dir, file))
    chunk.to_csv(output_file, mode='a', index=False, header=first_file)
    first_file = False  # 后续文件不重复写入表头

print(f"\n✓ 合并完成！保存至: {output_file}")

# ========== 2. 统计 PaperID 匹配数量 ==========
authorship = pd.read_csv('/content/drive/MyDrive/academic_tree/data/authorship.csv')
target_paper_ids = set(authorship['MAGPaperID'].unique())
print(f"target_paper_ids 数量: {len(target_paper_ids):,}")

# 直接读取完整合并文件
print("读取合并文件...")
df_merged = pd.read_csv(output_file)

# 统计匹配
total_rows = len(df_merged)
matched_rows = df_merged['PaperID'].isin(target_paper_ids).sum()

print(f"\n统计结果:")
print(f"  合并文件总行数: {total_rows:,}")
print(f"  匹配 PaperID 数: {matched_rows:,}")
print(f"  匹配比例: {matched_rows/total_rows*100:.2f}%")

找到 28 个文件，开始合并...


合并进度: 100%|██████████| 28/28 [00:01<00:00, 24.36it/s]



✓ 合并完成！保存至: /content/drive/MyDrive/academic_tree/data/Scinet/merged_papers.csv
target_paper_ids 数量: 16,942,415
读取合并文件...

统计结果:
  合并文件总行数: 32,218
  匹配 PaperID 数: 3,033
  匹配比例: 9.41%


In [24]:
df=df_merged[df_merged['PaperID'].isin(target_paper_ids)]

In [25]:
df.columns

Index(['PaperID', 'DOI', 'DocType', 'PaperTitle', 'BookTitle', 'Year', 'Date',
       'Publisher', 'JournalID', 'ConferenceSeriesID', 'Volume', 'Issue',
       'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount',
       'OriginalVenue', 'FamilyID', 'RetractionType'],
      dtype='object')

In [34]:
df.drop(['DOI', 'Publisher', 'JournalID', 'ConferenceSeriesID', 'Volume', 'Issue',
       'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount', 'FamilyID'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['DOI', 'Publisher', 'JournalID', 'ConferenceSeriesID', 'Volume', 'Issue',


In [28]:
df.rename({'PaperID':'MAGPaperID'},axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({'PaperID':'MAGPaperID'},axis=1,inplace=True)


In [38]:
df=authorship.merge(df,on='MAGPaperID')

In [39]:
df

Unnamed: 0,PID,MAGPaperID,DocType,PaperTitle,BookTitle,Year,Date,OriginalVenue,RetractionType
0,135,2054246163,Journal,Synaptic Changes in Layer 2/3 Underlying Map P...,,2004.0,2004-04-30,Science,"""Retracted Publication"""
1,169,2069203092,Journal,"Dual Signaling Regulated by Calcyon, a D1 Dopa...",,2000.0,2000-03-03,Science,"""Retracted Publication"""
2,299,2018622922,Journal,Organization of connections of the basal and a...,,2000.0,2000-11-01,European Journal of Neuroscience,"""Retracted Publication"""
3,314,1999062247,Journal,Combinatorial effects of odorant mixes in olfa...,,2006.0,2006-03-10,Science,"""Retracted Publication"""
4,314,2075088261,Journal,Genetic tracing reveals a stereotyped sensory ...,,2001.0,2001-11-08,Nature,"""Retracted Publication"",""Retraction Notice"""
...,...,...,...,...,...,...,...,...,...
5096,838765,2168321568,Journal,Three BUB1 and BUBR1/MAD3-related spindle asse...,,2015.0,2015-01-01,New Phytologist,"""Retracted Publication"""
5097,839360,2897418729,Journal,Breast Cancer Chemo-immunotherapy through Lipo...,,2018.0,2018-10-16,ACS Nano,"""Retracted Publication"",""Retraction Notice"""
5098,839487,2057911103,Journal,RETRACTED: Microwave-activated direct synthesi...,,2010.0,2010-04-06,Applied Catalysis B-environmental,"""Retracted Publication"""
5099,839487,2548061639,Journal,Retracted Article: Efficient microwave-promote...,,2009.0,2009-07-06,Green Chemistry,"""Retracted Publication"""


In [40]:
pid_retraction = df.groupby('PID').agg({
    'MAGPaperID': lambda x: list(x.unique()),  # 所有被撤稿的论文ID
    'Year': lambda x: list(x.dropna().unique()) if 'Year' in df.columns else []  # 撤稿年份
}).reset_index()

In [44]:
pid_retraction.rename({'Year':'RetractionYear'},axis=1,inplace=True)

In [31]:
mentorship=pd.read_csv('/content/drive/MyDrive/academic_tree/data/mentorship.csv')

In [45]:
# ========== 为 mentorship 添加导师和学员的撤稿信息 ==========

# 创建 PID -> 撤稿信息的字典，方便快速查找
pid_to_papers = dict(zip(pid_retraction['PID'], pid_retraction['MAGPaperID']))
pid_to_years = dict(zip(pid_retraction['PID'], pid_retraction.get('RetractionYear', [[]]*len(pid_retraction))))

# 函数：获取撤稿信息（处理无撤稿记录的情况）
def get_retraction_info(pid, info_dict):
    if pd.isna(pid):
        return []
    return info_dict.get(int(pid), []) if isinstance(info_dict.get(int(pid)), list) else []

# 导师撤稿信息
mentorship['mentor_retra'] = mentorship['MentorID'].apply(
    lambda x: pid_to_papers.get(x, [])
)
mentorship['mentor_retra_year'] = mentorship['MentorID'].apply(
    lambda x: pid_to_years.get(x, [])
)

# 学员撤稿信息
mentorship['mentee_retra'] = mentorship['MenteeID'].apply(
    lambda x: pid_to_papers.get(x, [])
)
mentorship['mentee_retra_year'] = mentorship['MenteeID'].apply(
    lambda x: pid_to_years.get(x, [])
)

# 统计信息
print("导师有撤稿记录的行数:", (mentorship['mentor_retra'].apply(len) > 0).sum())
print("学员有撤稿记录的行数:", (mentorship['mentee_retra'].apply(len) > 0).sum())
print("导师或学员有撤稿记录的行数:", (
    (mentorship['mentor_retra'].apply(len) > 0) |
    (mentorship['mentee_retra'].apply(len) > 0)
).sum())

mentorship.head(10)

导师有撤稿记录的行数: 12972
学员有撤稿记录的行数: 3131
导师或学员有撤稿记录的行数: 15552


Unnamed: 0,CID,MenteeID,MentorID,MentorshipType,Institution,InstitutionMAGID,StartYear,StopYear,mentor_retra,mentor_retra_year,mentee_retra,mentee_retra_year
0,2,2,3,1,"University of California, Berkeley",95457486.0,2000,2005,[],[],[],[]
1,3,4,3,2,"University of California, Berkeley",95457486.0,2003,2006,[],[],[],[]
2,5,6,3,1,"University of California, Berkeley",95457486.0,2002,2008,[],[],[],[]
3,6,18761,9,1,"University of California, Berkeley",95457486.0,-1,1984,[],[],[],[]
4,7,10,16,2,"Washington University, Saint Louis",204465549.0,-1,-1,[],[],[],[]
5,8,3,16,2,"Washington University, Saint Louis",204465549.0,-1,-1,[],[],[],[]
6,9,3,26,1,Yale University,32971472.0,-1,-1,[],[],[],[]
7,13,13,14,2,Massachusetts Institute of Technology,63966007.0,-1,-1,[],[],[],[]
8,14,14,23,1,Princeton University,20089843.0,-1,-1,[],[],[],[]
9,15,14,25,2,National Institute of Mental Health,,-1,-1,[],[],[],[]


In [48]:
# ========== 快速查看统计 ==========

print("=" * 60)
print("撤稿影响统计")

print("\n【导师撤稿】")
print(f"  有撤稿记录的导师数量: {mentorship[mentorship['mentor_retra'].apply(len) > 0]['MentorID'].nunique():,}")
print(f"  受影响的指导关系数量: {(mentorship['mentor_retra'].apply(len) > 0).sum():,}")

print("\n【学员撤稿】")
print(f"  有撤稿记录的学员数量: {mentorship[mentorship['mentee_retra'].apply(len) > 0]['MenteeID'].nunique():,}")
print(f"  受影响的指导关系数量: {(mentorship['mentee_retra'].apply(len) > 0).sum():,}")

print("\n【按指导类型统计导师撤稿】")
mentorship[mentorship['mentor_retra'].apply(len) > 0]['MentorshipType'].value_counts()

撤稿影响统计

【导师撤稿】
  有撤稿记录的导师数量: 1,901
  受影响的指导关系数量: 12,972

【学员撤稿】
  有撤稿记录的学员数量: 1,975
  受影响的指导关系数量: 3,131

【按指导类型统计导师撤稿】


Unnamed: 0_level_0,count
MentorshipType,Unnamed: 1_level_1
1,8763
2,3271
0,403
4,279
3,256


In [51]:
mentorship

Unnamed: 0,CID,MenteeID,MentorID,MentorshipType,Institution,InstitutionMAGID,StartYear,StopYear,mentor_retra,mentor_retra_year,mentee_retra,mentee_retra_year
0,2,2,3,1,"University of California, Berkeley",95457486.0,2000,2005,[],[],[],[]
1,3,4,3,2,"University of California, Berkeley",95457486.0,2003,2006,[],[],[],[]
2,5,6,3,1,"University of California, Berkeley",95457486.0,2002,2008,[],[],[],[]
3,6,18761,9,1,"University of California, Berkeley",95457486.0,-1,1984,[],[],[],[]
4,7,10,16,2,"Washington University, Saint Louis",204465549.0,-1,-1,[],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
743171,1784798,839679,287445,1,"University of Massachusetts, Amherst",24603500.0,-1,1991,[],[],[],[]
743172,1784800,796638,839680,1,University of Wales,97429440.0,-1,1986,[],[],[],[]
743173,1784802,839681,699264,1,"University of California, Santa Barbara",154570441.0,2017,-1,[],[],[],[]
743174,1784803,699264,38114,1,,,-1,-1,[],[],[],[]


In [53]:
# ========== 计算共同撤稿 ==========
mentorship = mentorship[mentorship['MentorshipType'].isin([1, 2])].copy()
# 共同撤稿的论文
mentorship['together_retra_set'] = mentorship.apply(
    lambda row: set(row['mentor_retra']) & set(row['mentee_retra']),
    axis=1
)
# 共同撤稿的年份（导师和学员在同一年有撤稿）
mentorship['together_retra_year_set'] = mentorship.apply(
    lambda row: set(row['mentor_retra_year_set']) & set(row['mentee_retra_year_set']),
    axis=1
)

# 转换为输出格式
mentorship['together_retra'] = mentorship['together_retra_set'].apply(lambda x: list(x))
mentorship['together_retra_year'] = mentorship['together_retra_year_set'].apply(lambda x: sorted(list(x)))
mentorship['together_retra_count'] = mentorship['together_retra_set'].apply(len)

# 删除临时集合列
mentorship = mentorship.drop(['mentor_retra_set', 'mentor_retra_year_set',
                              'mentee_retra_set', 'mentee_retra_year_set',
                              'together_retra_set', 'together_retra_year_set'], axis=1)

# ========== 统计信息 ==========

print("\n" + "="*60)
print("共同撤稿统计")

has_together = mentorship['together_retra_count'] > 0
print(f"有共同撤稿的指导关系数量: {has_together.sum():,}")
print(f"共同撤稿涉及的总论文数: {mentorship.loc[has_together, 'together_retra_count'].sum():,}")

print(f"\n共同撤稿数量分布:")
print(mentorship['together_retra_count'].value_counts().sort_index())

# 查看具体例子
print("\n" + "="*60)
print("共同撤稿示例:")
mentorship[has_together][['CID', 'MenteeID', 'MentorID', 'MentorshipType',
                          'mentor_retra', 'mentee_retra',
                          'together_retra', 'together_retra_year', 'together_retra_count']].head(10)

KeyError: 'mentor_retra_year_set'