## Sample Data from the Whole Dataset

Sample the records dated from 2019 through 2023, reading the source CSV files in chunks.

In [None]:
import pandas as pd
import numpy as np
import os
import sys
from datetime import datetime

# Get the directory of the script. Notebook kernels and interactive sessions
# do not define __file__, so fall back to the current working directory there.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

def process_chunk(chunk):
    """Return the rows of *chunk* dated between 2019-01-01 and 2023-12-31.

    The 'Date' column is parsed with the ``%Y-%b`` format (e.g. ``2019-Jan``).
    Values that fail to parse become NaT and are then forward-filled from the
    closest preceding valid date in the same chunk. Rows that are still NaT
    (nothing valid precedes them in the chunk) never satisfy the date filter
    and are dropped.

    Args:
        chunk: DataFrame with a 'Date' column. The column is overwritten in
            place with the parsed, forward-filled datetimes.

    Returns:
        The filtered rows of ``chunk``. If anything goes wrong (for example
        the 'Date' column is missing) the error is printed and an empty
        DataFrame is returned, so the caller skips the chunk.
    """
    try:
        # Convert 'Date' to datetime; unparseable values become NaT
        chunk['Date'] = pd.to_datetime(chunk['Date'], format='%Y-%b', errors='coerce')

        # Fill missing dates with the last valid date.
        # Series.ffill() replaces fillna(method='ffill'), which has been
        # deprecated since pandas 2.1.
        chunk['Date'] = chunk['Date'].ffill()

        # Filter for dates between 2019 and 2023 (both bounds inclusive)
        mask = (chunk['Date'] >= '2019-01-01') & (chunk['Date'] <= '2023-12-31')
        return chunk.loc[mask]
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on
        print(f"Error processing chunk: {e}")
        return pd.DataFrame()

def process_file(input_file, output_file):
    """Filter *input_file* chunk by chunk and write the kept rows to *output_file*.

    The input CSV is read 100,000 rows at a time with the Python parser
    engine; malformed lines are skipped. Each chunk goes through
    ``process_chunk`` and any non-empty result is appended to the output CSV.
    The header row is written only with the first non-empty chunk.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to create. It is truncated if it
            already exists, and is created even when no rows are kept.
    """
    chunksize = 100000  # Increased chunk size

    # newline='' stops the text layer from translating the line terminators
    # pandas writes (otherwise Windows output gets blank rows between
    # records); the explicit encoding keeps the output independent of the
    # platform's default locale.
    with open(output_file, 'w', newline='', encoding='utf-8') as temp_file:
        header_written = False
        for i, chunk in enumerate(pd.read_csv(input_file, chunksize=chunksize, engine='python', on_bad_lines='skip')):
            try:
                processed_chunk = process_chunk(chunk)
                if not processed_chunk.empty:
                    processed_chunk.to_csv(temp_file, header=not header_written, index=False)
                    header_written = True  # Ensure header is written only once
                print(f"Processed chunk {i} from {input_file}")
            except Exception as e:
                # Keep going: one bad chunk should not abort the whole file
                print(f"Error processing chunk {i} from {input_file}: {e}")

    print(f"Processed data saved to {output_file}")

# Names of the two input CSV files
large_file = 'large_file.csv'
small_file = 'small_file.csv'

# Their full paths inside the script directory
large_file_path = os.path.join(script_dir, large_file)
small_file_path = os.path.join(script_dir, small_file)

# Stop at the first input that is missing
for name, path in ((large_file, large_file_path), (small_file, small_file_path)):
    if not os.path.exists(path):
        print(f"Error: {name} not found in the script directory.")
        sys.exit(1)

# Filter each input into its own numbered output file
for i, file in enumerate([large_file_path, small_file_path]):
    output_file = os.path.join(script_dir, f'processed_file_{i+1}.csv')
    process_file(file, output_file)

print("Processing complete. Please check the output CSV files.")

Since the datasets are quite large, we processed the full dataset in chunks and saved the filtered results in separate, smaller files. Now we need to combine them to get the final sampled data.

In [None]:
import pandas as pd
import os
from tqdm import tqdm

def process_chunk(chunk):
    """Add derived text, term-list, and datetime columns to *chunk*.

    Every step is applied only when the columns it needs are present:

    * 'title' and 'abstract' -> 'full_text': the two joined by one space and
      lower-cased. Missing values on either side count as empty strings, so
      a row with only a title or only an abstract still gets its text.
    * 'meshheading' -> 'mesh_terms': the value split on ';'.
    * 'keywords': replaced by its value split on ';'.
    * 'date': parsed to datetime; unparseable values become NaT.

    Args:
        chunk: DataFrame to process. It is modified in place.

    Returns:
        The same DataFrame object, with the columns above added or replaced.
    """
    # Combine title and abstract
    if 'title' in chunk.columns and 'abstract' in chunk.columns:
        # Fill both sides: a NaN on either side would make the whole
        # concatenation NaN and lose the text that is present.
        chunk['full_text'] = chunk['title'].fillna('') + ' ' + chunk['abstract'].fillna('')
        chunk['full_text'] = chunk['full_text'].str.lower()

    # Process MeSH terms and keywords if they exist
    if 'meshheading' in chunk.columns:
        chunk['mesh_terms'] = chunk['meshheading'].str.split(';')
    if 'keywords' in chunk.columns:
        chunk['keywords'] = chunk['keywords'].str.split(';')

    # Convert date to datetime if it exists
    if 'date' in chunk.columns:
        chunk['date'] = pd.to_datetime(chunk['date'], errors='coerce')

    return chunk

def process_file(file_path, is_large=False):
    """Load a CSV file and run ``process_chunk`` over its contents.

    Args:
        file_path: Path of the CSV file to load.
        is_large: When true, the file is read in 100,000-row chunks (with a
            tqdm progress bar), each chunk is processed separately, and the
            results are concatenated. Otherwise the file is read and
            processed in one go.

    Returns:
        The processed DataFrame, or None when the file does not exist or any
        error occurs while reading or processing it (a message is printed).
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        if not is_large:
            # Small file: a single read, a single processing pass
            return process_chunk(pd.read_csv(file_path))

        # Large file: process piece by piece, then stitch the pieces together
        reader = pd.read_csv(file_path, chunksize=100000)  # Adjust chunksize as needed
        parts = [
            process_chunk(part)
            for part in tqdm(reader, desc=f"Processing {file_path}")
        ]
        return pd.concat(parts, ignore_index=True)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Process both files.
# The sampling step writes processed_file_1.csv from the large input and
# processed_file_2.csv from the small one, so the first file is the one
# that should be read in chunks.
file_paths = ['processed_file_1.csv', 'processed_file_2.csv']
dataframes = []

for i, file_path in enumerate(file_paths):
    print(f"Processing {file_path}...")
    df = process_file(file_path, is_large=(i == 0))  # The first file is the large one
    if df is not None:
        dataframes.append(df)
        # Quick sanity report for each file that loaded
        print(f"Shape of {file_path}: {df.shape}")
        print(f"Columns in {file_path}: {df.columns.tolist()}")
        print(f"Sample data from {file_path}:\n{df.head()}")
        print(f"Missing values in {file_path}:\n{df.isnull().sum()}")
    print("="*50)

# Combine the dataframes if any were successfully processed
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print(f"Shape of combined dataframe: {combined_df.shape}")

    # Save the processed and combined data
    combined_df.to_csv('sampled_pubmed_data.csv', index=False)
    print("Processed and combined data saved to 'sampled_pubmed_data.csv'")
else:
    print("No dataframes were successfully processed.")