In [61]:
import re
import pandas as pd

with open('/Users/renke/Desktop/NLPT/project/med/all_med.txt', 'r', encoding='utf-8') as file:
    text = file.read()

paragraphs = text.split('\n\n')

extracted_info = []

for para in paragraphs:
    pmid_match = re.search(r'PMID- (\d+)', para)
    pmid = pmid_match.group(1) if pmid_match else "No PMID"


    title_match = re.search(r'TI  - (.+?)(?=\n[A-Z]{2}  - |\Z)', para, re.DOTALL)
    title = ' '.join(title_match.group(1).split()).strip() if title_match else "No title"
    if title and title.endswith('.'):
        title = title[:-1]

    abstract_match = re.search(r'AB  - (.+?)(?=\n[A-Z]{2}  - |\Z)', para, re.DOTALL)
    abstract = ' '.join(abstract_match.group(1).split()).strip() if abstract_match else "No abstract"

    first_author_match = re.search(r'FAU - (.+?)\n', para)
    first_author = first_author_match.group(1).strip() if first_author_match else "No author"

    journal_title_match = re.search(r'JT  - (.+)', para) or re.search(r'BTI - ([\w\s\d]+)', para)
    journal_title = journal_title_match.group(1).strip() if journal_title_match else "No book or journal"

    extracted_info.append({
        'PMID': pmid,
        'Title': title,
        'Abstract': abstract,
        'First Author': first_author,
        'Journal Title': journal_title
    })

In [63]:
manually_extract_df = pd.DataFrame(extracted_info)
manually_extract_df.drop_duplicates(subset='PMID', inplace=True)

output_csv_path = '/Users/renke/Desktop/NLPT/project/med/manually_extract_data.csv'
manually_extract_df.to_csv(output_csv_path, index=False, encoding='utf-8-sig')

In [3]:
file_names = []
for i in range(2,10):
    file_names.append("/Users/renke/Desktop/NLPT/project/csv/csv-intelligen-set-" + str(i) + ".csv")

automatic_generate_df = [pd.read_csv(file) for file in file_names]

automatic_generate_df = pd.concat(automatic_generate_df)

automatic_generate_df.drop_duplicates(subset='PMID', inplace=True)

automatic_generate_df.to_csv('/Users/renke/Desktop/NLPT/project/csv/automatic_generate_data.csv', index=False)


In [None]:
pmids_df1 = set(manually_extract_df['PMID'])
pmids_df2 = set(automatic_generate_df['PMID'])

unique_to_df1 = pmids_df1 - pmids_df2

unique_to_df2 = pmids_df2 - pmids_df1

print("Unique to df1:", unique_to_df1)
print("Unique to df2:", unique_to_df2)

Unique to df1: set()
Unique to df2: set()


In [9]:
merged_df = pd.merge(automatic_generate_df, manually_extract_df, on='PMID', how='outer')

In [None]:
filtered_df = merged_df[['PMID', 'First Author', 'Title_x', 'Authors', 'Journal/Book', 'Publication Year', 'Abstract']]

In [12]:
filtered_df.rename(columns={'Title_x': 'Title'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [14]:
filtered_df['Abstract'] = filtered_df['Abstract'].fillna('No abstract available')
filtered_df['Authors'] = filtered_df['Authors'].fillna('No authors available')
filtered_df['First Author'] = filtered_df['First Author'].fillna('No first author available')
filtered_df['Journal/Book'] = filtered_df['Journal/Book'].fillna('No journal/book available')
filtered_df['Publication Year'] = filtered_df['Publication Year'].fillna('No publication year available')
filtered_df['Title'] = filtered_df['Title'].fillna('No title available')

filtered_df = filtered_df[filtered_df['Abstract'] != "No abstract available"]
filtered_df = filtered_df[filtered_df['Abstract'] != "No abstract available."]

filtered_df.to_csv('/Users/renke/Desktop/NLPT/project/all_med_data.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['Abstract'] = filtered_df['Abstract'].fillna('No abstract available')
