In [21]:
!pip3 install --upgrade scpscraper
import scpscraper as scraper
!pip3 install --upgrade html2text
import html2text as h2t
import json
import re
from tqdm import tqdm



In [22]:
# Method to obtain the object/containment class of the SCP entities from the tags of their wiki entries
def get_obj_classes(scp_list):
    """Determine the object (containment) class of each SCP from its tags.

    Args:
        scp_list: Iterable of SCP identifiers; each one is passed unchanged
            to ``scraper.get_scp``.

    Returns:
        dict mapping every identifier to one of:
        ``'Safe'``, ``'Euclid'`` or ``'Keter'`` (checked in that order, so
        the first matching tag wins), ``'Outside of scope'`` when none of
        those tags is present, or ``'Unavailable'`` when fetching or reading
        the entry raised an exception.
    """
    # (tag, label) pairs in priority order.
    known_classes = (('safe', 'Safe'), ('euclid', 'Euclid'), ('keter', 'Keter'))
    obj_classes = {}
    for scp in scp_list:
        print("Getting object class of SCP#" + str(scp))
        try:
            scp_tags = list(scraper.get_scp(scp)['tags'])
            obj_classes[scp] = next(
                (label for tag, label in known_classes if tag in scp_tags),
                'Outside of scope',
            )
        except Exception:
            # Best effort: one broken entry must not abort the whole run.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate so a long scrape can still be stopped.
            obj_classes[scp] = 'Unavailable'
    return obj_classes

In [23]:
# Method to obtain the descriptions of the SCP entities directly from their wiki entries' source code
def get_descriptions(scp_list):
    """Extract the "Description" section text of each SCP wiki entry.

    For every identifier the page is fetched with ``scraper.get_single_scp``,
    each ``<p>`` element is converted with ``h2t.html2text``, and the combined
    text is split on ``"**"``. The chunk that follows the first chunk equal to
    ``'Description:'`` or ``'Description'`` is taken as the description, with
    its newlines replaced by spaces.

    Args:
        scp_list: Iterable of SCP identifiers; each one is passed unchanged
            to ``scraper.get_single_scp``.

    Returns:
        dict mapping every identifier to its description text, or to
        ``'Unavailable'`` when the page could not be fetched or no
        description heading was found.
    """
    headings = ('Description:', 'Description')
    descriptions = {}
    for scp in scp_list:
        print("Getting description of SCP#" + str(scp))
        try:
            scrape = scraper.get_single_scp(scp)
            body = ' '.join(
                h2t.html2text(str(p)) for p in scrape.find_all('p')
            ).split("**")
            # Index of the heading chunk; StopIteration (no heading) is
            # handled below like any other per-entry failure.
            descloc = next(i for i, chunk in enumerate(body) if chunk in headings)
            descriptions[scp] = ' '.join(body[descloc + 1].split('\n'))
        except Exception:
            # Best effort per entry. ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still stop the run.
            descriptions[scp] = 'Unavailable'

    return descriptions


In [24]:
# Method to obtain the special containment procedures of the SCP entities directly from their wiki entries' source code
def get_spconprocs(scp_list):
    """Extract the "Special Containment Procedures" text of each SCP entry.

    For every identifier the page is fetched with ``scraper.get_single_scp``,
    each ``<p>`` element is converted with ``h2t.html2text``, and the combined
    text is split on ``"**"``. The chunk that follows the first chunk equal to
    ``'Special Containment Procedures:'`` or
    ``'Special Containment Procedures'`` is taken as the section text, with
    its newlines replaced by spaces.

    Args:
        scp_list: Iterable of SCP identifiers; each one is passed unchanged
            to ``scraper.get_single_scp``.

    Returns:
        dict mapping every identifier to its containment-procedures text, or
        to ``'Unavailable'`` when the page could not be fetched or no such
        heading was found.
    """
    headings = ('Special Containment Procedures:', 'Special Containment Procedures')
    spconprocs = {}
    for scp in scp_list:
        print("Getting special containment procedures of SCP#" + str(scp))
        try:
            scrape = scraper.get_single_scp(scp)
            body = ' '.join(
                h2t.html2text(str(p)) for p in scrape.find_all('p')
            ).split("**")
            # Index of the heading chunk; StopIteration (no heading) is
            # handled below like any other per-entry failure.
            procloc = next(i for i, chunk in enumerate(body) if chunk in headings)
            spconprocs[scp] = ' '.join(body[procloc + 1].split('\n'))
        except Exception:
            # Best effort per entry. ``Exception`` rather than a bare
            # ``except`` so KeyboardInterrupt/SystemExit still stop the run.
            spconprocs[scp] = 'Unavailable'

    return spconprocs

In [25]:
# Method to build the json object with the id, the object class, the description and the ...
# ... containment procedures of each SCP entity, and export it to a file.
def build_json(scp_list):
    """Scrape the given SCPs and export the collected fields to ``scps.json``.

    Calls ``get_obj_classes``, ``get_descriptions`` and ``get_spconprocs`` on
    ``scp_list`` (three separate passes), removes from every value each
    character that is not an ASCII letter, a digit, whitespace or ``:``, and
    writes the result as one JSON object to ``scps.json`` in the current
    working directory, overwriting any existing file.

    Args:
        scp_list: Iterable of SCP identifiers. It is iterated more than once,
            so it must be re-iterable (e.g. a list, not a generator).

    Returns:
        None. The output file maps each identifier to an object with the keys
        ``"obj_class"``, ``"desc"`` and ``"spconproc"``.
    """
    # Raw string: "\d" and "\s" are invalid escapes in a normal string literal
    # and trigger a warning on current Python versions. Compiled once because
    # the same pattern is applied to every field.
    disallowed = re.compile(r"[^a-zA-Z\d\s:]")

    def clean(value):
        # Drop every character outside the allowed set.
        return disallowed.sub("", str(value))

    obj_classes = get_obj_classes(scp_list)
    descs = get_descriptions(scp_list)
    spconprocs = get_spconprocs(scp_list)
    print("Building json...")
    json_dict = {
        scp: {
            "obj_class": clean(obj_classes[scp]),
            "desc": clean(descs[scp]),
            "spconproc": clean(spconprocs[scp]),
        }
        for scp in scp_list
    }
    # Serialise before opening the file so a serialisation error cannot leave
    # a truncated scps.json behind.
    json_string = json.dumps(json_dict)
    with open("scps.json", "w") as json_output:
        json_output.write(json_string)
    print("Done!")

In [26]:
# Formatting the SCP IDs to use them for SCPScraper
def add_zeroes(number_string):
    """Left-pad a one- or two-character ID string with zeroes to three characters.

    Inputs of any other length (including the empty string) are returned
    unchanged.
    """
    # Padding to prepend, keyed by the length of the input.
    prefix = {1: '00', 2: '0'}.get(len(number_string))
    return number_string if prefix is None else prefix + number_string

# Here the SCPs to be included in the scraping can be indicated
# IDs 1 through 6999 (the upper bound of range is exclusive)
int_array = range(1, 7000)
# Same IDs as strings, then zero-padded by add_zeroes
string_array = list(map(str, int_array))
formatted_array = list(map(add_zeroes, string_array))

In [None]:
# Scrape every ID in formatted_array and write the result to scps.json.
# build_json makes three separate scraping passes over the list.
build_json(formatted_array)

Getting object class of SCP#001
Getting object class of SCP#002
Getting object class of SCP#003
Getting object class of SCP#004
Getting object class of SCP#005
Getting object class of SCP#006
Getting object class of SCP#007
Getting object class of SCP#008
Getting object class of SCP#009
Getting object class of SCP#010
Getting object class of SCP#011
Getting object class of SCP#012
Getting object class of SCP#013
Getting object class of SCP#014
Getting object class of SCP#015
Getting object class of SCP#016
Getting object class of SCP#017
Getting object class of SCP#018
Getting object class of SCP#019
Getting object class of SCP#020
Getting object class of SCP#021
Getting object class of SCP#022
Getting object class of SCP#023
Getting object class of SCP#024
Getting object class of SCP#025
Getting object class of SCP#026
Getting object class of SCP#027
Getting object class of SCP#028
Getting object class of SCP#029
Getting object class of SCP#030
Getting object class of SCP#031
Getting 

Getting object class of SCP#258
Getting object class of SCP#259
Getting object class of SCP#260
Getting object class of SCP#261
Getting object class of SCP#262
Getting object class of SCP#263
Getting object class of SCP#264
Getting object class of SCP#265
Getting object class of SCP#266
Getting object class of SCP#267
Getting object class of SCP#268
Getting object class of SCP#269
Getting object class of SCP#270
Getting object class of SCP#271
Getting object class of SCP#272
Getting object class of SCP#273
Getting object class of SCP#274
Getting object class of SCP#275
Getting object class of SCP#276
Getting object class of SCP#277
Getting object class of SCP#278
Getting object class of SCP#279
Getting object class of SCP#280
Getting object class of SCP#281
Getting object class of SCP#282
Getting object class of SCP#283
Getting object class of SCP#284
Getting object class of SCP#285
Getting object class of SCP#286
Getting object class of SCP#287
Getting object class of SCP#288
Getting 

Getting object class of SCP#515
Getting object class of SCP#516
Getting object class of SCP#517
Getting object class of SCP#518
Getting object class of SCP#519
Getting object class of SCP#520
Getting object class of SCP#521
Getting object class of SCP#522
Getting object class of SCP#523
Getting object class of SCP#524
Getting object class of SCP#525
Getting object class of SCP#526
Getting object class of SCP#527
Getting object class of SCP#528
Getting object class of SCP#529
Getting object class of SCP#530
Getting object class of SCP#531
Getting object class of SCP#532
Getting object class of SCP#533
Getting object class of SCP#534
Getting object class of SCP#535
Getting object class of SCP#536
Getting object class of SCP#537
Getting object class of SCP#538
Getting object class of SCP#539
Getting object class of SCP#540
Getting object class of SCP#541
Getting object class of SCP#542
Getting object class of SCP#543
Getting object class of SCP#544
Getting object class of SCP#545
Getting 

Getting object class of SCP#772
Getting object class of SCP#773
Getting object class of SCP#774
Getting object class of SCP#775
Getting object class of SCP#776
Getting object class of SCP#777
Getting object class of SCP#778
Getting object class of SCP#779
Getting object class of SCP#780
Getting object class of SCP#781
Getting object class of SCP#782
Getting object class of SCP#783
Getting object class of SCP#784
Getting object class of SCP#785
Getting object class of SCP#786


In [20]:
# Output formatting stuff, please ignore
from IPython.display import HTML, display

def set_css(info=None):
  """Inject CSS that makes ``<pre>`` blocks wrap long lines.

  Args:
    info: Ignored. Present because IPython passes an execution-info object
      to ``pre_run_cell`` callbacks; a callback that takes no arguments
      raises ``TypeError`` on IPython versions that do not adapt callback
      signatures. Defaults to ``None`` so direct ``set_css()`` calls still
      work.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Call set_css on every 'pre_run_cell' event
get_ipython().events.register('pre_run_cell', set_css)