In [1]:
import pandas as pd
import warnings


# Suppress every warning of category UserWarning from this point on.
warnings.filterwarnings("ignore", category=UserWarning)

# 1. Load the raw export. With header=None the first line of the file is
#    treated as data, so the six columns selected by position are named
#    explicitly afterwards.
origin_data = pd.read_csv('csv-export.csv', header=None, usecols=[1,3,6,7,8,9])
origin_data.columns = ['id', 'abstract', 'term', 'journal', 'title', 'time']

# 2. First pass: keep rows whose term is 'geology' or 'thisisref' and whose
#    abstract is not one of the two placeholder strings.
wanted_term = origin_data['term'].isin(['geology', 'thisisref'])
placeholder_abstract = origin_data['abstract'].isin(['no abstract', 'no abstracts'])
clean_data = origin_data[wanted_term & ~placeholder_abstract]

# 3. Regular expression for the boilerplate notice to exclude
#    (the copyright year is matched as any four digits).
pattern = r"This article is in Free Access Publication and may be downloaded using the “Download Full Text PDF” link at right\. © \d{4}, by the Association for the Sciences of Limnology and Oceanography, Inc\."

# 4. Boolean mask flagging abstracts that contain the notice anywhere in the
#    text; na=False makes missing abstracts count as "no match".
mask = clean_data['abstract'].str.contains(pattern, regex=True, na=False)

# 5. Final cleaning: drop the flagged rows and keep only abstracts that are
#    at least 100 characters long.
long_enough = clean_data['abstract'].str.len() >= 100
clean_data2 = clean_data[~mask & long_enough]

# 6. Display the cleaned frame as the cell output.
clean_data2

  from pandas.core import (


Unnamed: 0,id,abstract,term,journal,title,time
2,2-s2.0-85109210680,Abstract: Natural gas hydrate has great develo...,geology,Geomechanics and Geophysics for Geo-Energy and...,Mechanical behavior and constitutive model of ...,2021
5,2-s2.0-85105584695,The spatial and temporal interaction of riftin...,geology,Tectonophysics,Cenozoic structure and tectonics of North subb...,2021
7,2-s2.0-85111688511,The Groningen gas field in the Netherlands is ...,geology,Remote Sensing,In-reservoir waveform retrieval for monitoring...,2021
11,2-s2.0-85106252695,The current tectonic activity at the frontal p...,geology,Tectonophysics,Active out-of-sequence thrusting in the Molass...,2021
12,2-s2.0-85058209803,"Since launch in November 2018, the VIIRS on-bo...",geology,International Geoscience and Remote Sensing Sy...,Early results from NOAA-20 (JPSS-1) VIIRS on-o...,2018
...,...,...,...,...,...,...
3164072,2-s2.0-84984979172,Controlling sub-10 nm ligament sizes and open-...,thisisref,Small,Nanoporous Gold Bowls: A Kinetic Approach to C...,2016
3164073,2-s2.0-84903513056,This paper reports a strategy to assemble apop...,thisisref,Biosensors and Bioelectronics,Peptide-based electrochemical approach for apo...,2014
3164074,2-s2.0-85013192697,"Nanoporous gold (NPG) structures, which posses...",thisisref,Nanoscale,Kinetically controlled synthesis of nanoporous...,2017
3164075,2-s2.0-85028411103,Wearable electronics are essential for the con...,thisisref,Nanoscale,Single wearable sensing energy device based on...,2017


In [2]:
# Load the citation table; later cells read its 'eid' and 'ref_eid' columns.
cite = pd.read_csv(filepath_or_buffer='ref.csv')

# Display the frame as the cell output.
cite

Unnamed: 0,eid,ref_eid
0,2-s2.0-85111011520,2-s2.0-0000638369
1,2-s2.0-85111011520,2-s2.0-0028405041
2,2-s2.0-85111011520,2-s2.0-0027331194
3,2-s2.0-85111011520,2-s2.0-85083822098
4,2-s2.0-85111011520,2-s2.0-79961139006
...,...,...
8227318,2-s2.0-33846530516,2-s2.0-0032661056
8227319,2-s2.0-33846530516,2-s2.0-20444398941
8227320,2-s2.0-33846530516,2-s2.0-33645974170
8227321,2-s2.0-33846530516,2-s2.0-0042411003


In [3]:
# Keep only the citation rows whose two endpoints ('eid' and 'ref_eid') are
# both present among the ids of the cleaned papers.
clean_cite = cite.loc[
    cite['eid'].isin(clean_data2['id'])
    & cite['ref_eid'].isin(clean_data2['id'])
]

# Persist the filtered rows without the index column, then display them.
clean_cite.to_csv('clean_cite.csv', index=False)
clean_cite

Unnamed: 0,eid,ref_eid
1,2-s2.0-85111011520,2-s2.0-0028405041
2,2-s2.0-85111011520,2-s2.0-0027331194
3,2-s2.0-85111011520,2-s2.0-85083822098
4,2-s2.0-85111011520,2-s2.0-79961139006
5,2-s2.0-85111011520,2-s2.0-85067258849
...,...,...
8227185,2-s2.0-84921959907,2-s2.0-0023477638
8227186,2-s2.0-84921959907,2-s2.0-84864556957
8227187,2-s2.0-84921959907,2-s2.0-0021405378
8227188,2-s2.0-84921959907,2-s2.0-0023119813


In [4]:
# Select the cleaned papers whose id occurs in either column of the retained
# citation rows, and write them to disk without the index column.
appears_as_eid = clean_data2['id'].isin(clean_cite['eid'])
appears_as_ref = clean_data2['id'].isin(clean_cite['ref_eid'])
clean_data2[appears_as_eid | appears_as_ref].to_csv('clean_data.csv', index=False)


In [5]:
# Display the cleaned papers whose id occurs in either column of the retained
# citation rows (the same selection that is written to 'clean_data.csv').
clean_data2.loc[
    clean_data2['id'].isin(clean_cite['eid'])
    | clean_data2['id'].isin(clean_cite['ref_eid'])
]


Unnamed: 0,id,abstract,term,journal,title,time
2,2-s2.0-85109210680,Abstract: Natural gas hydrate has great develo...,geology,Geomechanics and Geophysics for Geo-Energy and...,Mechanical behavior and constitutive model of ...,2021
5,2-s2.0-85105584695,The spatial and temporal interaction of riftin...,geology,Tectonophysics,Cenozoic structure and tectonics of North subb...,2021
7,2-s2.0-85111688511,The Groningen gas field in the Netherlands is ...,geology,Remote Sensing,In-reservoir waveform retrieval for monitoring...,2021
11,2-s2.0-85106252695,The current tectonic activity at the frontal p...,geology,Tectonophysics,Active out-of-sequence thrusting in the Molass...,2021
12,2-s2.0-85058209803,"Since launch in November 2018, the VIIRS on-bo...",geology,International Geoscience and Remote Sensing Sy...,Early results from NOAA-20 (JPSS-1) VIIRS on-o...,2018
...,...,...,...,...,...,...
3163983,2-s2.0-0036028564,During the biodegradation of crude oil in the ...,thisisref,Organic Geochemistry,Formation of carboxylic acids during aerobic b...,2002
3163985,2-s2.0-33644969398,Synthetic corundum (Al<sub>2</sub>O<sub>3</sub...,thisisref,Journal of Colloid and Interface Science,XPS study of the major minerals in bauxite: Gi...,2006
3163986,2-s2.0-0034804123,High-quality Raman spectra were used for the c...,thisisref,Journal of Raman Spectroscopy,Comparison of Raman spectra in characterizing ...,2001
3163997,2-s2.0-0027796238,"In a Terrebonne Basin (Louisiana) marsh, <sup>...",thisisref,Marine Ecology Progress Series,Relationship between vegetation and soil forma...,1993
