# Open Access Entrepreneurship Research Paper Scrape & LLM Project

## Installation of all dependencies and libraries

In [None]:
!python -m spacy download en_core_web_trf

In [4]:
import os
import requests
from google.colab import drive
import re
import json

import fitz
from collections import Counter
from tqdm.auto import tqdm

import spacy
from spacy.matcher import Matcher
from transformers import pipeline
import en_core_web_sm
import en_core_web_trf

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Attach Google Drive to the Colab filesystem at /content/drive/ so that the
# later cells can read and write files under it.
drive.mount(mountpoint='/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Load the Springer API key from the Colab secrets store.
# The value is bound to a name rather than left as the cell's last bare
# expression: a trailing expression is echoed into the notebook output, which
# would leak the key into any saved or shared copy of the notebook.
from google.colab import userdata
springer_api_key = userdata.get('springer_api_key')

## Querying the Springer API

In [None]:
# Query the Springer Nature metadata API for open-access, English-language,
# online-first journal articles tagged with entrepreneurship-related keywords.
# The resulting list of records is left in `papers` for the cells below.
from google.colab import userdata  # re-imported so this cell runs on its own

# Define the API query URL
url = "https://api.springernature.com/meta/v2/json"

# Define the query parameters
params = {
    "q": "language:en openaccess:true journalonlinefirst:true (keyword:\"entrepreneurship\" OR keyword:\"accelerator\" OR keyword:\"startup\" OR keyword:\"incubator\" OR keyword:\"university entrepreneurship\")",
    "p": 100,
    # Send the key's value from Colab secrets, not the literal secret name.
    "api_key": userdata.get('springer_api_key'),
}

# Start from an empty list so `papers` is always defined for later cells,
# even when the request below fails.
papers = []

# Send the request to the API; the timeout keeps the cell from hanging
# forever on a stalled connection.
response = requests.get(url, params=params, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Extract the list of paper records from the response
    papers = data.get('records', [])

    # Print the abstract of each paper. Records without an abstract get a
    # placeholder instead of raising TypeError on `"\n" + None`.
    for paper in papers:
        abstract = paper.get("abstract") or "(no abstract available)"
        print(f"\n{abstract}")
else:
    print("Error:", response.status_code)


## Research Paper PDF Downloader

In [None]:
# Characters that are invalid or dangerous in file names on common filesystems
# (path separators, Windows-reserved punctuation, control characters).
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]+')


def _safe_file_stem(title, max_length=150):
    """Return ``title`` reduced to a string that is safe to use as a file name."""
    stem = _UNSAFE_FILENAME_CHARS.sub("_", str(title))
    # Collapse runs of whitespace (including newlines) into single spaces.
    stem = " ".join(stem.split())
    # Keep the name well below typical 255-byte file-name limits and drop
    # leading/trailing dots and spaces, which some filesystems reject.
    stem = stem[:max_length].strip(" .")
    return stem or "untitled"


def download_papers(papers, folder_path, timeout=30):
    """Download the PDF of every paper record into ``folder_path``.

    Each record is a dict with an optional ``"title"`` and a ``"url"`` list
    whose entries are dicts carrying ``"format"`` and ``"value"`` keys. For
    each record the first entry whose format is ``"pdf"`` is fetched and
    written to ``<folder_path>/<sanitized title>.pdf``. Problems with a single
    paper are reported with ``print`` and do not stop the remaining downloads.

    Args:
        papers: Sequence of paper record dicts.
        folder_path: Destination directory; created (with parents) if missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None.
    """
    # exist_ok avoids the race between a separate existence check and creation.
    os.makedirs(folder_path, exist_ok=True)

    total = len(papers)
    for i, paper in enumerate(papers, 1):
        # Fall back to a numbered name when the title is missing or empty.
        title = paper.get("title") or f"paper_{i}"
        # Titles routinely contain "/", ":" or "?" which would otherwise make
        # open() fail or write outside folder_path.
        file_name = f"{_safe_file_stem(title)}.pdf"
        file_path = os.path.join(folder_path, file_name)

        # Value of the first URL entry advertised as a PDF, if any.
        pdf_url = next(
            (
                url_info.get("value")
                for url_info in paper.get("url") or []
                if url_info.get("format") == "pdf"
            ),
            None,
        )
        if not pdf_url:
            print(f"No PDF URL found for {title}")
            continue

        try:
            response = requests.get(pdf_url, timeout=timeout)
            response.raise_for_status()  # Raise an error for 4xx or 5xx status codes

            # Save the paper into the destination folder
            with open(file_path, 'wb') as file:
                file.write(response.content)
        except (requests.RequestException, OSError) as e:
            # Best effort: report the failure and move on to the next paper.
            print(f"Error downloading {file_name}: {e}")
            continue

        print(f"Downloaded {file_name} ({i}/{total})")

# Run the downloader over the queried records, saving the PDFs under the
# project folder on the mounted drive.
folder_path = '/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers'
download_papers(papers=papers, folder_path=folder_path)

## Text Extraction from PDF Script

In [None]:
# Open the test PDF and pull out its text, page by page.
pdf_path = '/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers/test.pdf'
doc = fitz.open(pdf_path)

# Read each page exactly once. The per-page strings are kept in
# `text_segments` (previously appended to without ever being created) so they
# can be classified individually further down.
text_segments = [page.get_text() for page in tqdm(doc, desc="Extracting pages")]

# Whole-document text, joined once instead of grown with += per page.
full_text = "".join(text_segments)

# Load a zero-shot classification model
classifier = pipeline("zero-shot-classification", model="typeform/distilbert-base-uncased-mnli")

# Label every page-level segment with its most likely paper section and
# collect the ones classified as "introduction".
section_labels = ["introduction", "methods", "results", "discussion"]
introduction_parts = []

for segment in text_segments:
    # Pages with no extractable text carry no signal for the classifier;
    # skipping them avoids a wasted model call and an arbitrary label.
    if not segment.strip():
        continue

    result = classifier(segment, candidate_labels=section_labels, hypothesis_template="This text is about {}.")
    print(result)
    # "labels" is indexed at 0 on the assumption that the pipeline returns
    # labels sorted by descending score — TODO confirm against the docs.
    if result["labels"][0] == "introduction":
        introduction_parts.append(segment + "\n")

introduction_text = "".join(introduction_parts)

print("Extracted text sample:", full_text[:500])  # Print a sample to verify extraction

# Export the introduction text. PDF text routinely contains non-ASCII
# characters, so the encoding is fixed rather than left to the platform default.
with open("introduction_extracted.txt", "w", encoding="utf-8") as f:
    f.write(introduction_text)