In [1]:
import json
import re
import pandas as pd

# Load and Prepare Data

In [2]:
# Load the JSON file containing the search results
# with open("../data/02_document_search_results.json", "r", encoding="utf-8") as file:
#     data = json.load(file)
data = pd.read_json("../data/02_document_search_results.json")

# Print count for verification
print(f"Total articles processed: {len(data)}")

Total articles processed: 973


# Define Patterns 
Define patterns to differentiate from the introduction and contribution part of the abstract.

In [3]:
# Define expanded regex patterns to match descriptive sentences
patterns = [
    r"\b[Tt]his (article|work|study|paper|research|review|survey|chapter|viewpoint)\b",
    r"\b[Ii]n this (work|study|paper|research|review|survey|chapter|viewpoint)\b",
    r"\b[Ww]e (propose|introduce|present|develop|describe|demonstrate|report|discuss|analyze|examine|investigate|explore|evaluate|address|outline)\b",
    r"\b[Ii]n this (manuscript|article|contribution|viewpoint|approach|framework|investigation|analysis|implementation)\b",
    r"\b[Tt]he (article|paper|study|work|research|review|survey|manuscript|current study|present study|present work|current work)\b",
    r"\b[Oo]ur (work|study|paper|chapter|viewpoint|research|approach|framework|method|system|contribution|focus|aim|objective|goal)\b",
    r"\b[Tt]his (manuscript|contribution|investigation|viewpoint|analysis|implementation|approach|framework|method|system)\b",
    r"\b[Tt]he (purpose|aim|goal|objective) of this (paper|work|study|research|article|chapter|manuscript)\b",
    r"\b[Hh]ere(,)? we\b",
]

# Extract Introduction and Contribution

In [4]:
def extract_introduction_text(text, patterns):
    """
    Extracts text from the beginning of the abstract until it reaches a sentence
    that matches one of the specified patterns (typically where authors start
    describing their specific work).

    If no pattern is found, returns the entire text.

    Args:
        text (str): The text to process
        patterns (list): List of regex patterns to match

    Returns:
        str: Text until the first pattern match, or the entire text if no pattern is found
    """
    # Handle empty text
    if not text or not text.strip():
        return text
    
    # Split text into sentences
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)

    # Collect sentences until a pattern match is found
    result_sentences = []
    pattern_found = False

    for sentence in sentences:
        # Check if sentence matches any pattern
        if any(re.search(pattern, sentence) for pattern in patterns):
            # Only consider it a match if we're not on the first sentence
            if i > 0:
                pattern_found = True
                break

        result_sentences.append(sentence)

    # If no pattern was found, return the original text
    if not pattern_found or len(result_sentences) < 1 or len(" ".join(result_sentences)) < 50:
        return text

    # Join the collected sentences back together with spaces
    result_text = " ".join(result_sentences)
    return result_text

In [5]:
def extract_contribution(text, patterns):
    """
    Extracts text from the first sentence matching any of the specified patterns
    until the end of the text (typically where authors start describing their specific work).

    If no pattern is found, returns an empty string.

    Args:
        text (str): The text to process
        patterns (list): List of regex patterns to match

    Returns:
        str: Text from the first pattern match to the end of the text, or empty string if no pattern is found
    """
    # Split text into sentences
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)

    # Find the index of the first sentence matching any pattern
    start_idx = -1
    for i, sentence in enumerate(sentences):
        if any(re.search(pattern, sentence) for pattern in patterns):
            start_idx = i
            break

    # If no pattern was found, return the input text
    if start_idx == -1:
        return text

    # Join the sentences from the matching sentence until the end
    result_text = " ".join(sentences[start_idx:])
    return result_text



In [6]:
# Add introduction text to the cleaned_data dictionary
for i, item in enumerate(data.to_dict('records')):
    # Check if abstract exists before processing and is not NaN
    if "abstract" in item and item["abstract"] and not pd.isna(item["abstract"]):
        item["introduction"] = extract_introduction_text(item["abstract"], patterns)
    else:
        # Handle missing or NaN abstracts
        item["introduction"] = ""
    
    # Update the DataFrame with the modified record
    data.loc[i, "introduction"] = item["introduction"]

# Count abstracts with non-empty introductions
abstracts_with_intros = [item for item in data.to_dict('records') if isinstance(item.get("introduction"), str) and item.get("introduction", "").strip()]

# Count how many abstracts are missing
abstracts_missing = len(data[data["abstract"].isna() | (data["abstract"] == "")])

# Count abstracts where no pattern was found (full text was returned as introduction)
full_text_abstracts = data[
    data["abstract"].notna() & 
    (data["abstract"] != "") & 
    (data["introduction"] == data["abstract"])
].to_dict('records')

# Display statistics
print(f"Number of abstracts processed: {len(data)}")
print(f"Number of abstracts missing or empty: {abstracts_missing}")
print(f"Number of abstracts with introductions extracted: {len(abstracts_with_intros)}")

# Statistics about abstracts where no pattern was found
print(f"\nNumber of abstracts where no pattern was found: {len(full_text_abstracts)}")
if len(data) > 0:
    print(
        f"Percentage with no pattern found: {len(full_text_abstracts) / (len(data) - len(full_text_abstracts)) * 100:.2f}%"
    )
    print(
        f"Percentage with pattern found: {(len(data) - len(full_text_abstracts)) / len(data) * 100:.2f}%"
    )
else:
    print("Percentage with no pattern found: N/A (no valid abstracts)")

# Display the first few introductory texts (up to 500 chars) along with their DOIs
# for i, item in enumerate(abstracts_with_intros[:3]):
#     print(f"\nIntroduction {i+1} (DOI: {item.get('doi', 'No DOI')}):")
#     intro = item["introduction"]
#     print(f"{intro[:500]}..." if len(intro) > 500 else intro)
#     print("-" * 80)

Number of abstracts processed: 973
Number of abstracts missing or empty: 2
Number of abstracts with introductions extracted: 971

Number of abstracts where no pattern was found: 204
Percentage with no pattern found: 26.53%
Percentage with pattern found: 79.03%


In [8]:
# Add contribution text to the DataFrame
for i, item in enumerate(data.to_dict('records')):
    # Check if abstract exists before processing and is not NaN
    if "abstract" in item and item["abstract"] and not pd.isna(item["abstract"]):
        item["contribution"] = extract_contribution(item["abstract"], patterns)
    else:
        # Handle missing or NaN abstracts
        item["contribution"] = ""
    
    # Update the DataFrame with the modified record
    data.loc[i, "contribution"] = item["contribution"]

# Count abstracts with non-empty contributions
abstracts_with_contributions = [
    item for item in data.to_dict('records') if item.get("contribution", "").strip()
]

# Display statistics about extracted text segments
print(f"Number of abstracts processed: {len(data)}")
print(
    f"Number of abstracts with contributions extracted: {len(abstracts_with_contributions)}"
)
print(
    f"Percentage with contributions extracted: {len(abstracts_with_contributions) / (len(data) - abstracts_missing) * 100:.2f}%"
)

# Calculate average length only if there are contributions
if abstracts_with_contributions:
    average_length = sum(len(item['contribution']) for item in abstracts_with_contributions) / len(abstracts_with_contributions)
    print(f"Average length of contributions: {average_length:.2f} characters")
else:
    print("Average length of contributions: N/A (no contributions found)")

# Display the first few contribution texts (up to 500 chars) along with their DOIs
# for i, item in enumerate(abstracts_with_contributions[:3]):
#     print(f"\nArticle {i+1} (DOI: {item.get('doi', 'No DOI')}):")
#     contribution = item["contribution"]
#     print(f"{contribution[:500]}..." if len(contribution) > 500 else contribution)
#     print("-" * 80)

Number of abstracts processed: 973
Number of abstracts with contributions extracted: 971
Percentage with contributions extracted: 100.00%
Average length of contributions: 1096.36 characters


# Clean records without introduction or contribution

In [9]:
# Clean records without introductions or contributions
cleaned_data = data[(data["introduction"].str.strip() != "") | (data["contribution"].str.strip() != "")]
print(f"Number of records with introductions or contributions: {len(cleaned_data)}")

# For backward compatibility, create a list of dictionaries if needed
cleaned_data_records = cleaned_data.to_dict('records')

Number of records with introductions or contributions: 971


## Save the Results

In [10]:
# Save the cleaned data to a JSON file using pandas
cleaned_data.to_json("../data/04_document_search_results_with_intros_and_contributions.json", orient="records", indent=4)

In [None]:
# Print all the contributions in a json format
with open("../data/processed/04_document_search_results_contributions.json", "w") as file:
    json.dump(contribution_texts, file, indent=4)