In [1]:
import datetime
import json
import os
import requests
import rdflib
import random
import zipfile
start = datetime.datetime.utcnow()
LDP = rdflib.Namespace("http://www.w3.org/ns/ldp#")

In [2]:
HEADERS = { "Prefer": "return=representation; include=\"http://www.trellisldp.org/ns/trellis#PreferAudit\""}

def save_rts(path, resource_templates):
    current = datetime.datetime.utcnow()
    print(f"Started save of {len(resource_templates)} Resource Templates")
    for i, row in enumerate(resource_templates):
        if not i%25:
            print(f".", end="")
        file_path = f"{path}/{current.year}/{current.month:02d}/{current.day:02d}/resource_templates/{i:05d}.ttl"
        with open(file_path, "w+") as fo:
            json.dump(row, fo, sort_keys=True, indent=2)
    end = datetime.datetime.utcnow()
    print(f"Finished at {end}, total time {(end-current).seconds}")
        

def save_ttl(path, type_of, ttls):
    current = datetime.datetime.utcnow()
    for i, row in enumerate(ttls):
        file_path = f"{path}/{current.year}/{current.month:02d}/{current.day:02d}/{type_of}/{i:05d}.ttl"
        with open(file_path, "w+") as ttl:
            ttl.write(row)
            
def harvest_trellis_rts(ld4p_url='https://trellis.stage.sinopia.io/repository/ld4p'):
    start = datetime.datetime.utcnow()
    print(f"Harvesting Resource Templates for {ld4p_url} at {start}")
    ld4p_result = requests.get(ld4p_url)
    ld4p_graph = rdflib.ConjunctiveGraph()
    ld4p_graph.parse(data=ld4p_result.text, format='turtle')
    resource_templates = []
    for o in ld4p_graph.objects(predicate=LDP.contains):
        if not len(resource_templates)%1000:
            print(f"{len(resource_templates):,}", end="")
        else:
            print(".", end="")
        template_request = requests.get(str(o))
        resource_templates.append(template_request.text)
    end = datetime.datetime.utcnow()
    print(f"Finished harvesting {len(resource_templates):,} from {ld4p_url} at {end}]\nTotal time {((end-start).seconds / 60):,}")
    return resource_templates
    
def harvest_trellis(trellis_url = 'https://trellis.stage.sinopia.io/repository'):
    start = datetime.datetime.utcnow()
    print(f"Starting Harvest of {trellis_url} at {start}")
    home = requests.get(trellis_url)
    home_graph = rdflib.ConjunctiveGraph()
    home_graph.parse(data=home.text, format='turtle')
    test, training = [], []
    for o in home_graph.objects(predicate=LDP.contains):
        group_result = requests.get(str(o))
        if str(o).find("ld4p") > 0:
            continue
        print(f"Institution URI {o}")
        group_graph = rdflib.ConjunctiveGraph()
        group_graph.parse(data=group_result.text, format='turtle')
        for i, resource_uri in enumerate(
            group_graph.objects(predicate=LDP.contains)):
            resource_result = requests.get(str(resource_uri), headers=HEADERS)
            print(".", end="")
            output = resource_result.text
            output = output.replace("<>", f"<{resource_uri}>")
            choice = random.random()
            if choice <= .80:
                training.append(output)
            else:
                test.append(output)
        print(f"Finished with {i}")
    end = datetime.datetime.utcnow()
    print(f"""Finished Harvesting at {end}, total time: {(end-start).seconds} seconds, 
    Test size {len(test)}
    Training {len(training)}""")
    
    return test, training, home_graph

In [3]:
def save_data(zipped_file, data):
    for i, row in enumerate(data):
        zipped_file.writestr(f"{i:05d}.ttl", row)
        if not i%25:
            print(f"{i:,}", end="")
        else:
            print(".", end="")
    
def persist(path, testing, training, resource_templates):
    training_zip = zipfile.ZipFile(os.path.join(path, 'training.zip'), 'w')
    testing_zip = zipfile.ZipFile(os.path.join(path, 'testing.zip'), 'w')
    rt_zip = zipfile.ZipFile(os.path.join(path, 'resource-templates.zip'), 'w')
    save_data(training_zip, training)
    save_data(testing_zip, testing)
    save_data(rt_zip, resource_templates)
    

In [4]:
print(start)

2019-11-22 17:33:56.596959


# RDF Sinopia Harvest on Stage
## 2019-11-22 17:33:56.596959


In [5]:
test, training, home = harvest_trellis()
stage_rts = harvest_trellis_rts()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-11-22 17:39:20.614261
Institution URI https://trellis.stage.sinopia.io/repository/tamu
......Finished with 5
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/yale
.......................................................................Finished with 70
Institution URI https://trellis.stage.sinopia.io/repository/minnesota
.........Finished with 8
Institution URI https://trellis.stage.sinopia.io/repository/nlm
....Finished with 3
Institution URI https://trellis.stage.sinopia.io/repository/boulder
.................Finished with 16
Institution URI https://trellis.stage.sinopia.io/repository/ucdavis
.......Finished with 6
Institution URI https://trellis.stage.sinopia.io/repository/chicago
..........Finished with 9
Institution URI https://trellis.stage.sinopia.io/repository/ucsd
...............................................F

In [9]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)
save_rts("/Users/jpnelson/2019/sinopia-data", stage_rts)

Started save of 844 Resource Templates
..................................Finished at 2019-11-22 17:54:44.979376, total time 0


In [10]:
stage_graph = rdflib.Graph()
print(f"Start size {len(stage_graph)}")
for row in test:
    stage_graph.parse(data=row, format='turtle')
print(f"After test size is {len(stage_graph)}")
for row in training:
    stage_graph.parse(data=row, format='turtle')
print(f"After training total size is {len(stage_graph)}")

Start size 0
After test size is 7137
After training total size is 34003


In [12]:
sinopia_rt = rdflib.URIRef("http://sinopia.io/vocabulary/hasResourceTemplate")
delete_rt = rdflib.Literal("UCSD:RT:bf2:Monograph:Item_OpenAccess")

In [14]:
for s,o in stage_graph.subject_objects(predicate=sinopia_rt):
    if o == delete_rt:
        print(s)
    else:
        print(".", end="")

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

# RDF Sinopia Harvest on Stage
## 2019-11-08 17:37:53.440412

### Staging

In [20]:
test, training, home = harvest_trellis()
stage_rts = harvest_trellis_rts()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-11-08 17:49:34.461056
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/boulder
.................Finished with 16
Institution URI https://trellis.stage.sinopia.io/repository/frick
.....................Finished with 20
Institution URI https://trellis.stage.sinopia.io/repository/penn
..............................Finished with 29
Institution URI https://trellis.stage.sinopia.io/repository/duke
Finished with 29
Institution URI https://trellis.stage.sinopia.io/repository/nlm
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/northwestern
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/pcc
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/hrc
........Finished with 7
Institution URI https://trellis.stage.sinopia.io/repository/alberta
.....

In [21]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)
save_rts("/Users/jpnelson/2019/sinopia-data", stage_rts)

Started save of 818 Resource Templates
.................................Finished at 2019-11-08 18:25:46.508654, total time 0


### Production

In [25]:
prod_test, prod_train, home = harvest_trellis('https://trellis.sinopia.io/repository/ld4p')
prod_rts = harvest_trellis_rts('https://trellis.sinopia.io/repository/ld4p')

Starting Harvest of https://trellis.sinopia.io/repository/ld4p at 2019-11-08 19:07:47.206413
Finished Harvesting at 2019-11-08 19:10:01.042557, total time: 133 seconds, 
    Test size 0
    Training 0
Harvesting Resource Templates for https://trellis.sinopia.io/repository/ld4p at 2019-11-08 19:10:01.042806
0............................................................................................................................................................................................................................................................................................................................................................Finished harvesting 349 from https://trellis.sinopia.io/repository/ld4p at 2019-11-08 19:12:19.615141]
Total time 2.3


In [27]:
save_ttl("/Users/jpnelson/2019/sinopia-data/prod", "train", prod_train)
save_ttl("/Users/jpnelson/2019/sinopia-data/prod", "test", prod_test)
save_rts("/Users/jpnelson/2019/sinopia-data/prod", prod_rts)

Started save of 349 Resource Templates
..............Finished at 2019-11-08 19:16:37.870985, total time 0


# RDF Sinopia Harvest on Stage
## 2019-10-29 15:32:51.002907


In [5]:
test, training, home = harvest_trellis()
stage_rts = harvest_trellis_rts()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-10-29 15:34:08.597105
Institution URI https://trellis.stage.sinopia.io/repository/penn
.............Finished with 12
Institution URI https://trellis.stage.sinopia.io/repository/frick
......Finished with 5
Institution URI https://trellis.stage.sinopia.io/repository/alberta
.............Finished with 12
Institution URI https://trellis.stage.sinopia.io/repository/ucsd
........Finished with 7
Institution URI https://trellis.stage.sinopia.io/repository/dlc
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/ucdavis
......Finished with 5
Institution URI https://trellis.stage.sinopia.io/repository/princeton
........Finished with 7
Institution URI https://trellis.stage.sinopia.io/repository/tamu
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/hrc
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/yale
...................................Fini

In [6]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)

In [7]:
save_rts("/Users/jpnelson/2019/sinopia-data", stage_rts)

Started save of 767 Resource Templates
...............................Finished at 2019-10-29 16:21:02.410966, total time 0


# RDF Sinopia Harvest on Stage
## 2019-10-24 21:25:23.248357

In [4]:
test, training, home = harvest_trellis)

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-10-24 21:26:18.730170
Institution URI https://trellis.stage.sinopia.io/repository/chicago
..........Finished with 9
Institution URI https://trellis.stage.sinopia.io/repository/alberta
..........Finished with 9
Institution URI https://trellis.stage.sinopia.io/repository/harvard
....................................................Finished with 51
Institution URI https://trellis.stage.sinopia.io/repository/pcc
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/ucsd
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/dlc
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/penn
............Finished with 11
Institution URI https://trellis.stage.sinopia.io/repository/cornell
..................Finished with 17
Institution URI https://trellis.stage.sinopia.io

In [None]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)

In [None]:
stage_rts = harvest_trellis_rts()
save_rts("/Users/jpnelson/2019/sinopia-data", stage_rts)

Harvesting Resource Templates for https://trellis.stage.sinopia.io/repository/ld4p at 2019-10-24 21:30:19.252080
0.......................................................................................................................................

# RDF Sinopia Harvest on Stage
## 2019-10-14 18:07:12.823309

In [4]:
test, training, home = harvest_trellis()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-10-14 18:08:20.995250
Institution URI https://trellis.stage.sinopia.io/repository/pcc
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/yale
.................Finished with 16
Institution URI https://trellis.stage.sinopia.io/repository/dlc
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/penn
....Finished with 3
Institution URI https://trellis.stage.sinopia.io/repository/alberta
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/frick
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/princeton
.......Finished with 6
Institution URI https://trellis.stage.sinopia.io/repository/tamu
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/ucdavis
.....Finished with 4
Institution URI https://trellis.stage.sinopia.io/repository/cornell
............Finished with 11
Institution URI https://t

In [11]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)

In [15]:
stage_rts = harvest_trellis_rts()

Harvesting Resource Templates for https://trellis.stage.sinopia.io/repository/ld4p at 2019-10-14 18:26:11.483249
0.........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................Finished harvesting 714 from https://trellis.stage.sinopia.io/repository/ld4p at 2019-10-14 18:30:38.171775]
Total time 4.433333333333334


In [17]:
save_rts("/Users/jpnelson/2019/sinopia-data", stage_rts)

Started save of 714 Resource Templates
.............................Finished at 2019-10-14 18:31:27.414453, total time 0


# RDF Sinopia Harvest on Stage
## 2019-10-03 18:20:34.077127

In [15]:
test, training, home = harvest_trellis()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-10-03 20:17:55.992259
Institution URI https://trellis.stage.sinopia.io/repository/princeton
.......Finished with 6
Institution URI https://trellis.stage.sinopia.io/repository/dlc
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/duke
Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/washington
.....Finished with 4
Institution URI https://trellis.stage.sinopia.io/repository/nlm
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/stanford
...............Finished with 14
Institution URI https://trellis.stage.sinopia.io/repository/yale
.................Finished with 16
Institution URI https://trellis.stage.sinopia.io/repository/pcc
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/chicago
.Finished with 0
Institution URI https:

In [16]:
stage_rts = harvest_trellis_rts()

Harvesting Resource Templates for https://trellis.stage.sinopia.io/repository/ld4p at 2019-10-03 20:18:25.631349
0....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................Finished harvesting 661 from https://trellis.stage.sinopia.io/repository/ld4p at 2019-10-03 20:20:12.703037]
Total time 1.7833333333333334


In [9]:
print(len(test),len(training), len(stage_rts))

30 111 661


## Production

In [14]:
test, training, home = harvest_trellis('https://trellis.sinopia.io/repository/ld4p')

Starting Harvest of https://trellis.sinopia.io/repository/ld4p at 2019-10-03 20:15:48.898723
Finished Harvesting at 2019-10-03 20:16:23.951683, total time: 35 seconds, 
    Test size 0
    Training 0


In [12]:
'https://trellis.sinopia.io/repository/ld4p'.find("more")

-1

# RDF Sinopia Harvest on Stage
## 2019-09-24 15:27:55.652209

In [9]:
test, training, home = harvest_trellis()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-09-24 15:32:49.770525
Institution URI https://trellis.stage.sinopia.io/repository/tamu
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/ucdavis
.....Finished with 4
Institution URI https://trellis.stage.sinopia.io/repository/duke
Finished with 4
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/pcc
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/stanford
...............Finished with 14
Institution URI https://trellis.stage.sinopia.io/repository/cornell
...........Finished with 10
Institution URI https://trellis.stage.sinopia.io/repository/frick
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/hrc
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/nlm
.Finished with 0
Institution URI https://trellis.stage.sino

In [10]:
stage_rts = harvest_trellis_rts()

Harvesting Resource Templates for https://trellis.stage.sinopia.io/repository/ld4p at 2019-09-24 15:38:27.940804
0.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................Finished harvesting 614 from https://trellis.stage.sinopia.io/repository/ld4p at 2019-09-24 15:42:10.759376]
Total time 3.7


In [11]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)

In [16]:
save_rts("/Users/jpnelson/2019/sinopia-data", stage_rts)

Started save of 614 Resource Templates
.........................Finished at 2019-09-24 17:28:21.058932, total time 6938


# RDF Sinopia Harvest on Stage
## 2019-09-09 23:44:53.286982


In [20]:
test, training, home = harvest_trellis()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-09-10 00:02:00.095830
Institution URI https://trellis.stage.sinopia.io/repository/yale
.................Finished with 16
Institution URI https://trellis.stage.sinopia.io/repository/cornell
...........Finished with 10
Institution URI https://trellis.stage.sinopia.io/repository/washington
....Finished with 3
Institution URI https://trellis.stage.sinopia.io/repository/ucdavis
....Finished with 3
Institution URI https://trellis.stage.sinopia.io/repository/nlm
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/northwestern
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/frick
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/hrc
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/boulder
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/duke
Finished with 0
Institution URI https://trelli

In [22]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)

In [11]:
stage_rts = harvest_trellis_rts()

Harvesting Resource Templates for https://trellis.stage.sinopia.io/repository/ld4p at 2019-09-09 23:49:19.349898
0......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [14]:
for i,row in enumerate(stage_rts):
    try:
        rt = json.loads(row)
    except:
        print(f"Error with {i:,}")
        continue
    with open(f"/Users/jpnelson/2019/sinopia-data/2019/09/09/resource_templates/rt-{i:05d}.json", "w+") as fo:
        json.dump(rt, fo, indent=2, sort_keys=True)
    if not i%25:
        print(f"{i:,}", end="")
    else:
        print(".", end="")

0........................25........................50........................75........................100........................125........................150........................175........................200........................225........................250........................275........................300........................325........................350........................375........................400........................425........................450........................475........................500........................525........................550........................575........................600............Error with 613
...........625........................650........................675........................700........................725........................750........................775........................800........................825........................850Error with 851
.......................875........................900..

# RDF Sinopia Harvest
## 2019-08-29 18:37:13.564139

In [84]:
test, training, home = harvest_trellis()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-08-29 19:28:48.428878
Institution URI https://trellis.stage.sinopia.io/repository/ucdavis
....Finished with 3
Institution URI https://trellis.stage.sinopia.io/repository/harvard
...............Finished with 14
Institution URI https://trellis.stage.sinopia.io/repository/ucsd
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/washington
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/dlc
Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/princeton
.......Finished with 6
Institution URI https://trellis.stage.sinopia.io/repository/yale
...Finished with 2
Institution URI https://trellis.stage.sinopia.io/repository/pcc
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/northwestern
..Finished with 1
Institution URI https://trel

In [77]:
rts = harvest_trellis_rts()

Harvesting Resource Templates for https://trellis.stage.sinopia.io/repository/ld4p
.....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [85]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)

In [96]:
for i,row in enumerate(rts):
    try:
        rt = json.loads(row)
    except:
        print(f"Error with {i:,}")
        continue
    with open(f"/Users/jpnelson/2019/sinopia-data/2019/08/29/resource_templates/rt-{i:05d}.json", "w+") as fo:
        json.dump(rt, fo, indent=2, sort_keys=True)
    if not i%25:
        print(f"{i:,}", end="")
    else:
        print(".", end="")
 

0........................25........................50........................75........................100........................125........................150.....Error with 156
..................175........................200........................225........................250........................275........................300........................325........................350........................375........................400........................425........................450........................475........................500........................525........................550........................575........................600........................625........................650........................675........................700........................725........................750........................775........................800........................825........................850........................875........................900................

In [97]:
rts[156]

''

# RDF Sinopia Harvest
## 2019-08-28 16:18:53.266392

In [33]:
test, training, resource_templates, home = harvest_trellis()

Starting Harvest of https://trellis.stage.sinopia.io/repository at 2019-08-28 18:50:24.315447
Institution URI https://trellis.stage.sinopia.io/repository/northwestern
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/duke
Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/princeton
.......Finished with 6
Institution URI https://trellis.stage.sinopia.io/repository/dlc
Finished with 6
Institution URI https://trellis.stage.sinopia.io/repository/frick
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/nlm
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/michigan
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/alberta
..Finished with 1
Institution URI https://trellis.stage.sinopia.io/repository/boulder
.Finished with 0
Institution URI https://trellis.stage.sinopia.io/repository/cornell
......Finished with 5
Institution URI https://trellis.stage.sinopia.io/

NameError: name 'resource_templates' is not defined

In [21]:
len(training)

53

In [24]:
save_ttl("/Users/jpnelson/2019/sinopia-data", "train", training)
save_ttl("/Users/jpnelson/2019/sinopia-data", "test", test)


# RDF Sinopia Harvest
## 2019-08-19 16:46:33.21902

In [48]:
home_stage = requests.get('https://trellis.stage.sinopia.io/repository')
home_stage_graph = rdflib.ConjunctiveGraph()
home_stage_graph.parse(data=home_stage.text, format='turtle')
rdf_resources = []
test, training = [], []
for o in home_stage_graph.objects(predicate=LDP.contains):
    if str(o).endswith("ld4p"):
        continue
    print(o)
    group_result = requests.get(str(o))
    group_graph = rdflib.ConjunctiveGraph()
    group_graph.parse(data=group_result.text, format='turtle')
    for resource_uri in group_graph.objects(predicate=rdflib.URIRef('http://www.w3.org/ns/ldp#contains')):
        resource_result = requests.get(str(resource_uri))
        output = resource_result.text
        output = output.replace("<>", f"<{resource_uri}>")
        choice = random.random()
        if choice <= .80:
            test.append(output)
        else:
            training.append(output)
        


https://trellis.stage.sinopia.io/repository/hrc
https://trellis.stage.sinopia.io/repository/minnesota
https://trellis.stage.sinopia.io/repository/penn
https://trellis.stage.sinopia.io/repository/stanford
https://trellis.stage.sinopia.io/repository/washington
https://trellis.stage.sinopia.io/repository/frick
https://trellis.stage.sinopia.io/repository/michigan
https://trellis.stage.sinopia.io/repository/pcc
https://trellis.stage.sinopia.io/repository/dlc
https://trellis.stage.sinopia.io/repository/nlm
https://trellis.stage.sinopia.io/repository/tamu
https://trellis.stage.sinopia.io/repository/alberta
https://trellis.stage.sinopia.io/repository/cornell
https://trellis.stage.sinopia.io/repository/northwestern
https://trellis.stage.sinopia.io/repository/chicago
https://trellis.stage.sinopia.io/repository/ucsd
https://trellis.stage.sinopia.io/repository/yale
https://trellis.stage.sinopia.io/repository/princeton
https://trellis.stage.sinopia.io/repository/ucdavis
https://trellis.stage.sinopi

In [44]:
len(training)

9

In [45]:
len(test)

33

In [49]:
print(training[1])

@prefix schema:  <http://schema.org/> .
@prefix owl:  <http://www.w3.org/2002/07/owl#> .
@prefix xsd:  <http://www.w3.org/2001/XMLSchema#> .
@prefix memento:  <http://mementoweb.org/ns#> .
@prefix skos:  <http://www.w3.org/2004/02/skos/core#> .
@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
@prefix acl:  <http://www.w3.org/ns/auth/acl#> .
@prefix geo:  <http://www.w3.org/2003/01/geo/wgs84_pos#> .
@prefix dc11:  <http://purl.org/dc/elements/1.1/> .
@prefix as:  <https://www.w3.org/ns/activitystreams#> .
@prefix rdf:  <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix ldp:  <http://www.w3.org/ns/ldp#> .
@prefix time:  <http://www.w3.org/2006/time#> .
@prefix prov:  <http://www.w3.org/ns/prov#> .
@prefix dc:  <http://purl.org/dc/terms/> .

_:b0    rdf:value  "310724444444444"@en ;
        rdf:type   <http://id.loc.gov/ontologies/bibframe/Barcode> ;
        <http://sinopia.io/vocabulary/hasResourceTemplate>  "sinopia:resourceTemplate:bf2:Identifiers:Barcode" .

_:b1    rdf:v

In [None]:
for i,row in enumerate(test):
    with open(f"/Users/jpnelson/2019/sinopia-data/2019/08/testing/{i:05d}.ttl", "w+") as ttl:
        ttl.write(row)