In [1]:
from pathlib import Path
import scrapy
from scrapy.crawler import CrawlerProcess
from bs4 import BeautifulSoup 
import json
import pprint


In [2]:
class OgsSpider(scrapy.Spider):
    name = "ogs_spider"
    allowed_domains = [", for "]
    start_urls = ["https://international.northeastern.edu/ogs/"]

    def parse(self, response):
        # parse the response
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # page content
        title = soup.title.string
        body_orig = soup.find('div', id='content') if soup.find('div', id='content') else "No content found"
        body = body_orig.get_text(separator=" ", strip=True)

        sections_list = []
        sections = body_orig.find_all('section')
        for section in sections:
            header = section.find('div', class_='heading').get_text(separator=" ", strip=True) if section.find('div', class_='heading') else "No Header found"
            
            sections_list.append({
                'section_header': header,
                'section_content': section.get_text(separator=" ", strip=True)
            })

        # log
        self.log(f"Scraped: {response.url}")
        
        yield {
            'url': response.url,
            'title': title,
            'content': body,
            'sections': sections_list,
        }

        # follow links to other pages
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.startswith('http') or href.startswith('/'):
                yield response.follow(href, self.parse)

In [3]:

class JsonWriterPipeline:
    def open_spider(self, spider):
        self.items = []
    
    def close_spider(self, spider):
        # output JSON
        with open("output.json", "w", encoding="utf-8") as file:
            json.dump(self.items, file, ensure_ascii=False, indent=4)
    
    def process_item(self, item, spider):
        self.items.append(item) 
        return item

# crawler configuration
process = CrawlerProcess(settings={
    # "FEEDS": {"output.json": {"format": "json"}},  # one way to output JSON, close_spider function appears to work better
    "ITEM_PIPELINES": {
        '__main__.JsonWriterPipeline': 1,  # activate custom pipeline
    },
    "LOG_LEVEL": "INFO",  # reduce log noise
})

# start
process.crawl(OgsSpider)
process.start()


2025-02-03 20:14:34 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2025-02-03 20:14:34 [scrapy.utils.log] INFO: Versions: lxml 5.2.1.0, libxml2 2.10.4, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:03:56) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 24.0.0 (OpenSSL 3.0.14 4 Jun 2024), cryptography 42.0.5, Platform Windows-10-10.0.19045-SP0
2025-02-03 20:14:34 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2025-02-03 20:14:34 [scrapy.extensions.telnet] INFO: Telnet Password: 15a9ecfe9fc76a82
2025-02-03 20:14:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2025-02-03 20:14:34 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEV

In [5]:
# read the file line by line
data = []
with open('output.json', 'r', encoding="utf8") as file:
    data = json.load(file) 
pprint.pprint(data)

[{'content': 'Welcome to the Office of Global Services The Office of Global '
             'Services (OGS) is an active resource to over 20,000 '
             'international students and scholars from 147 different nations '
             'across the world. We provide the professional expertise and '
             'support you need to maintain compliance through immigration, '
             'academic, and your employment experiences—helping you remain a '
             'valuable member of the Northeastern community. Learn More '
             "Traveling to Northeastern's Campuses There are many different "
             'rules and regulations to be aware of when traveling to your '
             'campus destination, and it’s your responsibility to make sure '
             'you comply with these regulations. Please select your campus '
             'destination to learn more about what you need to do and when to '
             'arrive for your first day of class. Traveling to the U.S. '
      

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [14]:
import tiktoken
enc = tiktoken.get_encoding("o200k_base")

def num_tokens_from_string(string, encoding_name):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens
    
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("gpt-4o")



In [18]:
# reorg / clean
document_list = []
p_size = 0
t_size = 0
s_size = 0
w_size = 0
for page in data:
    # top level page
    doc = {
        'title': page['title'],
        'url': page['url'],
        'content': page['content']
    }
    p_size += 1
    w_size += len(page['content'])
    t_size += num_tokens_from_string(page['content'], "o200k_base")
    document_list.append(doc)
    for section in page['sections']:
        # sections
        doc = {
            'title': section['section_header'],
            'url': page['url'],
            'content': section['section_content']
        }
        s_size +=1
        document_list.append(doc)

print(t_size)
print(p_size)
print(s_size)
print(w_size)

232864
306
963
1152057


In [10]:
print(page['content'])

How to Apply Applying to Global Pathways Step One: Submit an application The application to apply for Fall 2024 is now available. To apply, please visit enroll.northeastern.edu/apply . Please note that as we continually look for ways to improve and enhance the student experience, Northeastern University has recently moved to Slate, an admissions application platform designed to provide each student with a unique and personalized admissions experience.  Thank you for your interest in Northeastern University! Step Two: Upload your documents Please have a copy of the following documents ready to successfully complete your application: Proof of English Language Proficiency TOEFL®, IELTS, Pearson PTE, Duolingo, or other university-approved equivalent language tests. If you are submitting a TOEFL score , you must also send the official score report to Northeastern University through your TOEFL iBT account (code 4999) before your application can be reviewed. If you are submitting an IELTS sco

In [None]:

with open("output.json", "w", encoding="utf-8") as file:
    json.dump(self.items, file, ensure_ascii=False, indent=4)
pprint.pprint(document_list[1])