What we have to do for each trialregistry and number match we find in one PRREG field:
- create separate prereg nodes for each trialregistry and number match!
- we still need to add any urls and dois!
- but what if the url/doi are for the same trialnumber? We need to match them up!

In [19]:
## find trial numbers in PRREG and match to registry

prregs = (
    "|u https://drks.de/search/de/trial/DRKS00007824",
    "|u https://osf.io/pwjhx |d 10.17605/OSF.IO/PWJHX",
    "|u https://drks.de/search/de/trial/DRKS00015308",
    "|u https://drks.de/search/en/trial/DRKS00013206 |d  |i DRKS-ID: DRKS00013206",
    "|u https://drks.de/search/de/trial/DRKS00007687 |i DRKS-ID DRKS00007687",
    "|u https://drks.de/search/en/trial/DRKS00022819 |d  |i DRKS00022819",
    "|u https://aspredicted.org/mw3aq",
    "|u https://osf.io/2gpn9  |d  |i ",
    "!|u https://www.crd.york.ac.uk/PROSPERO/display_record.php?RecordID=221753  |d  |i ",
    "|u https://osf.io/tz7hy?view only=b42d75e6b88d4996a5cd1637220e42ef |d  |i Study 2",
    "|u https://osf.io/5r3yp |d https://doi.org/10.17605/OSF.IO/5R3YP |i first preregistration",
    "|u https://osf.io/98w3h |d https://doi.org/10.17605/OSF.IO/98W3H |i amended preregistration",
    "|u https://clinicaltrials.gov/ct2/show/NCT03418142",
    "|u https://www.drks.de/drks web/navigate.do?navigationId=trial.HTML&amp;TRIAL ID=DRKS00022867", 
    "|u https://drks.de/search/de/trial/DRKS00013309 |d  |i PROSPERO (https://www.crd.york.ac.uk/prospero; CRD42018084057; 2018/02/01), German Clinical Trials Register (www.drks.de; DRKS00013309; 2018/01/23).", 
    "|u https://drks.de/search/en/trial/DRKS00013206 |d  |i DRKS-ID: DRKS00013206",
    "|u https://drks.de/search/de/trial/DRKS00007824", 
    "https://drks.de/search/en/trial/DRKS00020564",
    "|u https://clinicaltrials.gov/ct2/show/NCT02375308?term=NCT02375308&amp;draw=2&amp;rank=1"
    )

import re

# a set of trial number regexes and the corresponding registry uri in 
# https://w3id.org/zpid/vocabs/trialregs/
trial_number_regexes = [
    ("DRKS\d+", "drks"),
    ("CRD\d+", "prospero"),
    ("ISRCTN\d+", "srctn"),
    ("NCT\d+", "clinical-trials-gov"),
    ("actrn\d+", "anzctr"),
    ("(?i)chictr[-a-z]*\d+", "chictr"),
    ("kct\d+", "cris"),
    ("ctri[\d/]+", "clinical-trial-registry-india"),
    ("\d{4}-\d+-\d+", "euctr"),
    ("irct[0-9a-z]+", "irct"),
    ("isrctn\d+", "isrctn"),
    # ("", "jma"),
    # ("", "jprn"),
    ("(?i)(nl|ntr)[-0-9]+", "dutch-trial-register"),
    ("rbr\d+", "rebec"),
    ("rpcec\d+", "rpec"),
    ("slctr[\d/]+", "slctr"),
    ("tctr\d+", "tctr"),
    ("umin\d+", "umin-japan"),
    ("u[\d-]+", "utn")
]

def add_trials_as_preregs(prereg_string, trial_number_regexes):
    """Checks the PRREG field for trial numbers and adds them as separate preregistrations per number, adding the recognzed registry, too.
    TODO: also checks any existing Preregistration nodes to see if a trial is already listed via its url, and adding the trialnumber and registry to that node, otherwise creating a new Preregistration node.
    """
    # a string may contain several trial numbers from different registries. 
    # match all of them!
    trailnumber_matches = []
    for trial_number_regex, trialreg in trial_number_regexes:
        # match = trial_number_regex.search(prereg_string)
        # change to use a string for the regex, adding re.compile() here only once:
        match = re.compile(trial_number_regex).search(prereg_string)
        if match:
            trailnumber_matches.append((trialreg, match.group()))
            # print(match.group() + " matches registry: " + trialreg)
    print(trailnumber_matches)
    return trailnumber_matches


    # for trial_number_regex, trialreg in trial_number_regexes:
    #     match = trial_number_regex.search(prereg_string)
    #     if match:
    #         print(match.group() + " matches registry: " + trialreg)
    #         #return trialreg, match.group()
    

for prreg in prregs:
    # print("this string: " + prreg + " matches: ")
    add_trials_as_preregs(prreg, trial_number_regexes)




[('drks', 'DRKS00007824')]
[]
[('drks', 'DRKS00015308')]
[('drks', 'DRKS00013206')]
[('drks', 'DRKS00007687')]
[('drks', 'DRKS00022819')]
[]
[]
[]
[]
[]
[]
[('clinical-trials-gov', 'NCT03418142')]
[('drks', 'DRKS00022867')]
[('drks', 'DRKS00013309'), ('prospero', 'CRD42018084057')]
[('drks', 'DRKS00013206')]
[('drks', 'DRKS00007824')]
[('drks', 'DRKS00020564')]
[('clinical-trials-gov', 'NCT02375308')]


In [377]:
# generate a title key for deduplication (Dublettencheck und Werksextraktion)
import json
import re

records_before = [
    # this first record is NOT a definite duplicate of the second one, because the last name of the second author is different: it should appear in the list of possible duplicates:
    {"DFK":"0360687", "mainTitle": "Interkulturelle Kompetenz der Möglichkeiten: Kritische Betrachtung eines Konstrukts und mehr - mit Aenderungen am Maß", "subtitle": "", "PY":"2018", "authors": [{"familyname":"Genkova", "givenname":"Petia"},{"familyname":"Maler", "givenname":"Pedro"}]},
    # the following two are exact duplicates of each other - they should end up in a "definite duplicates" list:
    {"DFK":"0368936", "mainTitle": "Interkulturelle Kompetenz der Möglichkeiten", "subtitle": "Kritische Betrachtung eines Konstrukts and mehr, mit Änderungen am Mass", "PY":"2020", "authors": [{"familyname":"Genkova", "givenname":"Petia"},{"familyname":"Müller", "givenname":"H."}]},
    {"DFK":"0368935", "mainTitle": "Interkulturelle Kompetenz der Möglichkeiten", "subtitle": "Kritische Betrachtung eines Konstrukts and mehr, mit Änderungen am Mass", "PY":"2020", "authors": [{"familyname":"Genkova", "givenname":"P. A."},{"familyname":"Mueller", "givenname":"H."}]},
    # the following three are also near duplicates of each other - the first should end up in a "possible duplicates" list, the other two in a "definite duplicates" list:
    {"DFK":"000001","mainTitle": "Homogenität.", "subtitle": "Ein Maß für Ärger", "PY":"2022", "authors": [{"familyname":"Genkova", "givenname":"P."},{"familyname":"Mueller", "givenname":"Pedro"}]},
    # these two are exact duplicates - they have the same title key, first author key and all authors key:
    {"DFK":"000002","mainTitle": "Homogenitaet: Ein Maß für Ärger", "subtitle": "", "PY":"2022", "authors": [{"familyname":"Genkova", "givenname":"Petia"},{"familyname":"Mueller", "givenname":"Heinz"}]},
    {"DFK":"000003","mainTitle": "Homogenitaet: Ein Mass für AErger", "subtitle": "", "PY":"2022", "authors": [{"familyname":"Genkova", "givenname":"P. A."},{"familyname":"Müller", "givenname":"H."}]}
]

umlaut_map = {"ö":"oe", "ä":"ae", "ü":"ue", "ß":"ss"}
special_char_removal = re.compile("[^a-z&0-0]")
and_replace_map = {"and": "&", "und": "&"}

def generate_title_key(title):
    key = title.casefold().translate(str.maketrans(umlaut_map))
    # replace the ands_and_unds:
    for word, initial in and_replace_map.items():
        key = re.sub(r"\b{}\b".format(word), initial, key)
    # remove special characters:
    key = special_char_removal.sub("", key)
    return key

def generate_single_author_key(givenname, familyname):
    # we want only the first letter of the given name:
    givenname = givenname[0]
    # concatenate the two:
    key = familyname + givenname
    # make it lowercase and replace umlauts:
    key = key.casefold().translate(str.maketrans(umlaut_map))
    # todo: sometimes journals mangle author names with umlauts by simply removing the dots, such that Müller becomes Muller, not Mueller. What can we do about that?
    return key

def generate_all_authors_key(authors):
    # make an empty string:
    key = ""
    # go through all authors and add their keys to the string:
    for author in authors:
        # get the keys:
        givenname = author["givenname"]
        familyname = author["familyname"]
        # generate the key for this author:
        author_key = generate_single_author_key(givenname, familyname)
        # add it to the string:
        key += author_key
    return key


def generate_keys_for_all(records):
    # go through all records and generate a key for each:
    for record in records:
        # generate the key for the current record and add it to the list (dfk_list) of all records with their keys:
        title_key = generate_title_key(record["mainTitle"] + " " + record["subtitle"])
        # add the new key to the existing record:
        record["title_key"] = title_key
        first_author_key = generate_single_author_key(record["authors"][0]["givenname"], record["authors"][0]["familyname"])
        record["first_author_key"] = first_author_key
        all_authors_key = generate_all_authors_key(record["authors"])
        # add to the record:
        record["all_authors_key"] = all_authors_key
        #print(record)
        # save the record in a file named records_with_keys.json
        with open("records_with_keys.json", "w", encoding="utf-8") as f:
           json.dump(records, f, indent=2)

    return records
####
####

records = [
  {
    "DFK": "0360687",
    "mainTitle": "Interkulturelle Kompetenz der M\u00f6glichkeiten: Kritische Betrachtung eines Konstrukts und mehr - mit Aenderungen am Ma\u00df",
    "subtitle": "",
    "PY": "2018",
    "authors": [
      {
        "familyname": "Genkova",
        "givenname": "Petia"
      },
      {
        "familyname": "Maler",
        "givenname": "Pedro"
      }
    ],
    "title_key": "interkulturellekompetenzdermoeglichkeitenkritischebetrachtungeineskonstrukts&mehrmitaenderungenammass",
    "first_author_key": "genkovap",
    "all_authors_key": "genkovapmalerp"
  },
  {
    "DFK": "0368936",
    "mainTitle": "Interkulturelle Kompetenz der M\u00f6glichkeiten",
    "subtitle": "Kritische Betrachtung eines Konstrukts and mehr, mit \u00c4nderungen am Mass",
    "PY": "2020",
    "authors": [
      {
        "familyname": "Genkova",
        "givenname": "Petia"
      },
      {
        "familyname": "M\u00fcller",
        "givenname": "H."
      }
    ],
    "title_key": "interkulturellekompetenzdermoeglichkeitenkritischebetrachtungeineskonstrukts&mehrmitaenderungenammass",
    "first_author_key": "genkovap",
    "all_authors_key": "genkovapmuellerh"
  },
  {
    "DFK": "0368935",
    "mainTitle": "Interkulturelle Kompetenz der M\u00f6glichkeiten",
    "subtitle": "Kritische Betrachtung eines Konstrukts and mehr, mit \u00c4nderungen am Mass",
    "PY": "2020",
    "authors": [
      {
        "familyname": "Genkova",
        "givenname": "P. A."
      },
      {
        "familyname": "Mueller",
        "givenname": "H."
      }
    ],
    "title_key": "interkulturellekompetenzdermoeglichkeitenkritischebetrachtungeineskonstrukts&mehrmitaenderungenammass",
    "first_author_key": "genkovap",
    "all_authors_key": "genkovapmuellerh"
  },
  {
    "DFK": "000001",
    "mainTitle": "Homogenit\u00e4t.",
    "subtitle": "Ein Ma\u00df f\u00fcr \u00c4rger",
    "PY": "2022",
    "authors": [
      {
        "familyname": "Genkova",
        "givenname": "P."
      },
      {
        "familyname": "Mueller",
        "givenname": "Pedro"
      }
    ],
    "title_key": "homogenitaeteinmassfueraerger",
    "first_author_key": "genkovap",
    "all_authors_key": "genkovapmuellerp"
  },
  {
    "DFK": "000002",
    "mainTitle": "Homogenitaet: Ein Ma\u00df f\u00fcr \u00c4rger",
    "subtitle": "",
    "PY": "2022",
    "authors": [
      {
        "familyname": "Genkova",
        "givenname": "Petia"
      },
      {
        "familyname": "Mueller",
        "givenname": "Heinz"
      }
    ],
    "title_key": "homogenitaeteinmassfueraerger",
    "first_author_key": "genkovap",
    "all_authors_key": "genkovapmuellerh"
  },
  {
    "DFK": "000003",
    "mainTitle": "Homogenitaet: Ein Mass f\u00fcr AErger",
    "subtitle": "",
    "PY": "2022",
    "authors": [
      {
        "familyname": "Genkova",
        "givenname": "P. A."
      },
      {
        "familyname": "M\u00fcller",
        "givenname": "H."
      }
    ],
    "title_key": "homogenitaeteinmassfueraerger",
    "first_author_key": "genkovap",
    "all_authors_key": "genkovapmuellerh"
  }
]

def find_duplicate_dfks(records):
    # Dictionary to store perfect matches based on title_key, first_author_key, and all_authors_key
    # it will be structured as follows:
    # { 
    #   (title_key, first_author_key, all_authors_key): [dfk1, dfk2, ...],
    #   ...
    # }
    # where dfk1, dfk2, ... are the DFKs of the records that match the key
    # and (title_key, first_author_key, all_authors_key) is a list of the keys that are the same for the records (dfks) listed as values
    perfect_matches = {}

    for record in records:
        dfk = record["DFK"]
        title_key = record["title_key"]
        first_author_key = record["first_author_key"]
        all_authors_key = record["all_authors_key"]

        if (title_key, first_author_key, all_authors_key) in perfect_matches:
            perfect_matches[(title_key, first_author_key, all_authors_key)].append(dfk)
        else:
            perfect_matches[(title_key, first_author_key, all_authors_key)] = [dfk]

    # Dictionary to store less perfect matches based on title_key and first_author_key
    possible_matches = {}

    for (title_key, first_author_key, all_authors_key), dfk_list in perfect_matches.items():
        for dfk in dfk_list:
            if (title_key, first_author_key) in possible_matches:
                possible_matches[(title_key, first_author_key)].append(dfk)
            else:
                possible_matches[(title_key, first_author_key)] = [dfk]

    perfect_matches = {k: v for k, v in perfect_matches.items() if len(v) > 1}
    possible_matches = {k: v for k, v in possible_matches.items() if len(v) > 1}

    return perfect_matches, possible_matches


perfect_matches, possible_matches = find_duplicate_dfks(records)

# print(perfect_matches)
# print(possible_matches)

print("==Exact Matches==")
# print in a structured way by going through the dictionary:
for item in perfect_matches.items():
    print("\nThese records are definitely identical:")
    for dfk in item[1]:
        # print all in the same line, comma separated:
        print(dfk, end=", ")
    print()

print("\n==Possible Matches==")
print("For these, only the list of authors is different, but the title and first author are the same:")
for item in possible_matches.items():
    print("\ncheck these records, they may be identical:")
    for dfk in item[1]:
        print(dfk, end=", ")
    print()


# print("Perfect Matches:")
# for (title_key, first_author_key, all_authors_key), dfk_list in perfect_matches.items():
#     print(f"Title Key: {title_key}, First Author Key: {first_author_key}, All Authors Key: {all_authors_key}")
#     for dfk in dfk_list:
#         print(f"- DFK: {dfk}")
#     print()

# print("\nLess Perfect Matches:")
# for (title_key, first_author_key), dfk_list in less_perfect_matches.items():
#     print(f"Title Key: {title_key}, First Author Key: {first_author_key}")
#     for dfk in dfk_list:
#         print(f"- DFK: {dfk}")
#     print()


==Exact Matches==

These records are definitely identical:
0368936, 0368935, 

These records are definitely identical:
000002, 000003, 

==Possible Matches==
For these, only the list of authors is different, but the title and first author are the same:

check these records, they may be identical:
0360687, 0368936, 0368935, 

check these records, they may be identical:
000001, 000002, 000003, 


Model:

```r
<Work> a bf:Work ;
    bf:hasInstance [

        # the partOf relationship is between article instance and journal instance:
        a bf:Instance ;
        bf:issuance issuanceType:JournalArticle ;
        # Relationship bnode with all the info about the journal:
        bflc:relationship [
            a bflc:Relationship ;
            bflc:relation relations:isArticleInJournal ;
            # biblio info: issue, vol, pages, articleno:
            pxp:inVolume "91"; # taken from field JBD
            pxp:inIssue "1"; # taken from field JHFT
            pxp:pageStart "1"; # taken from combined record (split field PAGE)
            pxp:pageEnd "26";
            # or pxp:articleNumber "No. 1234455"; # taken from field PAGE
            bf:hasSeries [
                a bf:Instance ;
                bf:title [a bf:Title ; rdfs:label "Journal of the American Chemical Society" ] ;
                bf:instanceOf [a bf:Work, bf:Serial;
                    bf:title [a bf:Title ; rdfs:label "Journal of the American Chemical Society" ] ;
                    bf:issuance issuanceType:Periodical ;
                    bf:identifiedBy [
                        a bf:Issn ;
                        rdf:value "0002-7863" ;
                    ]
                ]
            ]
        ]
    ]
```

Note: In PSYNDEXER, we link an article's "Instance bundle" - actually several instances with different "media types" connected by bf:otherPhysicalFormat - to the journal "hub". 
But how can we do this in the model?



In [None]:
# linking journal and location in it:
records = [
    {'JT': "Translational Neuroscience", 'ISSN': "2081-3856", 'EISSN': "2081-6936",
     'JBD':"8", 'JHFT':"1", 'PAGE':"182-190", 'MT': 'Print', 'MT2': 'Online Medium'},
     # Articleno in PAGE:
    {'JT': "Philosophical Transactions of the Royal Society - Series B", 'ISSN': "0962-8436", 'EISSN': "1471-2970",
        'JBD':"373", 'PAGE':"No. 20170151"},
    # no JHFT, Articleno in PAGE:
    {'JT': "Frontiers in Psychiatry", 'ISSN': "1664-0640",
        'JBD':"9", 'PAGE':"No. 114"},
    # no EISSN:
    {'JT': "Wissenschaftliche Zeitschrift der Humboldt-Universität zu Berlin - Gesellschaftswissenschaftliche Reihe", 'ISSN': "0522-9855",
        'JBD':"36", 'JHFT':"10", 'PAGE':"952-955", 'MT': 'Print'},
]

# Language guessing

In [17]:
import langid
langid.set_languages(["de", "en"])

def guess_language(string_in_language):
    return (langid.classify(string_in_language)[0])

# print(langid.classify("Zur transgenerationalen Traumatisierung"))
# print(langid.classify("Ätiologie und Ansätze für die Therapie"))

# print(langid.classify("Zur transgenerationalen Traumatisierung")[0])
print(guess_language("\"A 'true' artist may draw mountainous seas!\" - Eine Würdigung von Paul Watzlawick zu seinem 100. Geburtstag"))
# print(guess_language("What does it mean for children's development whether and to what extent both parents are employed and they accordingly spend part of the day outside the family? And how should care for young children be structured? Research findings from Developmental Psychology provide some answers. (translated by DeepL)"))

de


# Checking strings for non-letters

In [18]:

language = "EnglishX$X$"
# check if string contains any non-letter character:
if not language.isalpha():
    print("yes")
else:
    print("no")

yes


# Reconciling affiliation with ror

In [19]:
import requests_cache
from datetime import timedelta
# from mappings import geonames_countries
# from mappings import abstract_origins
# from mappings import dd_codes

# for reconciling affiliation strings with ror api:
ROR_API_URL = "https://api.ror.org/organizations?affiliation="

# for getting data about a known id:
ROR_API_LOOKUP_URL = "https://api.ror.org/organizations/"

from modules.mappings import dd_codes

def replace_encodings(text):
    for case in dd_codes:
        text = text.replace(case[0], case[1]) 
    return text

urls_expire_after = {
    # Custom cache duration per url, 0 means "don't cache"
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/09183&lang=de': 0,
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/': 0,
}

session = requests_cache.CachedSession(
    ".cache/requests",
    allowable_codes=[200, 404],
    expire_after=timedelta(days=30),
    urls_expire_after=urls_expire_after,
)

def get_ror_id_from_api(affiliation_string):
    # this function takes a string with an affiliation name and returns the ror id for that affiliation from the ror api
    # clean the string to make sure things like "^DDS" are replaced:
    affiliation_string = replace_encodings(affiliation_string)
    #replace_encodings(affiliation_string)
    ror_api_url = ROR_API_URL + affiliation_string
    # make a request to the ror api:
    # ror_api_request = requests.get(ror_api_url)
    # make request to api with caching:
    ror_api_request = session.get(
            ror_api_url, timeout=20
    )
    # if the request was successful, get the json response:
    if ror_api_request.status_code == 200:
        ror_api_response = ror_api_request.json()
        # check if the response has any hits:
        if len(ror_api_response["items"]) > 0:
            # if so, get the item with a key value pair of "chosen" and "true" and return its id:
            for item in ror_api_response["items"]:
                if item["chosen"] == True:
                    return item["organization"]["id"]
        else:
            return None
    else:
        return None
    

    # here is a list of affiliation strings to go through:
affiliation_strings = [
    "Klinik für Frauenheilkunde und Geburtshilfe, Universitätsklinikum Ulm",
    "Klinik für Psychososmatische Medizin und Psychotherapie, Universitätsklinikum Ulm",
    "Sektion Medizinische Psychologie, Universitätsklinikum Ulm",
    "Klinik für Psychososmatische Medizin und Psychotherapie, Universitätsklinikum Ulm",
    "Psychology School, Hochschule Fresenius ^DDS University of Applied Sciences, Düsseldorf",
    "Fakultät Medizin, MSH Medical School Hamburg ^DDS University of Applied Sciences and Medical University, Hamburg",
    "Fakultät Medizin, MSH Medical School Hamburg ^DDS University of Applied Sciences and Medical University, Hamburg",
    "Department of Child and Adolescent Psychiatry, Psychosomatics and Psychotherapy; LVR Klinikum Essen; University Hospital Essen; University of Duisburg-Essen; Essen",
    "Child and Adolescent Psychiatry/Psychology, Erasmus Medical Center Rotterdam"
]

# use the function to get the ror id for each affiliation string:

# for affiliation_string in affiliation_strings:
#     print(replace_encodings(affiliation_string) + ": " + str(get_ror_id_from_api(affiliation_string)))

# print(replace_encodings("Stimulus ^DDS non psychological interference ^DDL analogy"))
print(replace_encodings("Zagreb Children^D&gt;'s Hospital, Pediatric Clinic"))
print("Test " + replace_encodings("Zagreb Children^D&gt;'s Hospital, Pediatric Clinic") + " more")
print(replace_encodings("p &lt; .05"))
print(replace_encodings(' &lt; '))
print(replace_encodings('haha ^DIF ^DTM'))
print(replace_encodings('Geo^Dffrey Je^Dfferson'))
# print(get_ror_id_from_api("Klinik für Frauenheilkunde und Geburtshilfe, Universitätsklinikum Ulm"))

Zagreb Childrenğt;'s Hospital, Pediatric Clinic
Test Zagreb Childrenğt;'s Hospital, Pediatric Clinic more
p &lt; .05
 &lt; 
haha ∞ ™
Geoffrey Jefferson


In [20]:
def utf8len(s):
    return len(s.encode('utf-8'))

print(utf8len("Test string"))

11


# Deprecated: Reconcile with Wikidata

The "reconciler" package can use other reconciliation APIs, too. Wikidata is the default. 
To change the API endpoint, call the reconcile() function with the parameter `reconciliation_endpoint="https..."`

In [21]:
from reconciler import reconcile
import pandas as pd

# A DataFrame with a column you want to reconcile.
test_df = pd.DataFrame(
    {
        "City": ["Rio de Janeiro", "São Paulo", "São Paulo", "Natal"],
        "Country": ["Q155", "Q155", "Q155", "Q155"],
        "Land": ["XD-BR", "XD-BR", "XD-BR", "XD-BR"]
    }
)

funder_names = pd.DataFrame(
    {
        "funder_name": [ "Bundesministerium für Bildung und Forschung (BMBF)",
                 "Federal Ministry of Education and Research (BMBF)",
                 "DFG", "Robert Bosch Foundation, Stuttgart, Germany","Robert Bosch Foundation","Robert Bosch Stiftung",
                 "German Research Foundation, Clinical Research Unit 256",
                 "German Research Society (DFG)",
                 "German Research Society (Deutsche Forschungsgemeinschaft)",
                 "DFG (German Research Foundation)", "German Research Society (Deutsche Forschungsgemeinschaft, DFG)",
                "German Research Council", 
               # "Berlin University Alliance",
                "Jacobs Foundation",
               # "Typhaine Foundation",
               # "European Commission",
                # "JSPS Overseas Research Fellowship",
                "German Research Society (DFG)",
              #  "Villigst e.V.",
                "Canada Research Chairs",
                "Projekt DEAL",
                "Natural Sciences and Engineering Research Council of Canada (NSERC)",
               # "Templeton Religion Trust",
               # "Austrian Science Fund (FWF)",
                "Netherlands Organisation for Scientific Research",
                "Advanced ERC grant",
               # "Vertretungsnetz",
               # "AOP Orphan","Angelini",
              #  "Science Foundation Ireland (SFI)",
              #  "Interdisciplinary Center for Clinical Research (IZKF) of the medical faculty of Münster"
              ]
    }
)

# Reconcile against type city (Q515), getting the best match for each item.
reconciled = reconcile(funder_names["funder_name"], reconciliation_endpoint="http://recon.labs.crossref.org/reconcile")
# reconciled = reconcile(funder_names["funder_name"], type_id="TerritorialCorporateBodyOrAdministrativeUnit", property_mapping={"geographicAreaCode": test_df["Land"]}, reconciliation_endpoint="https://lobid.org/gnd/reconcile/")
# reconciled = reconcile(test_df["City"], type_id="Q515")

# save the results to a csv file:
test_df.to_csv("test.csv")
reconciled.to_csv("reconciled.csv")


  0%|          | 0/2 [00:00<?, ?it/s]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# testing with some of our terms and GND Subject Headings:
test_df = pd.DataFrame(
     {
         "TermE": ["Inductive Deductive Reasoning", "Spatial Imagery", "Verbal Comprehension"],
         "TermD": ["Induktiv-deduktives logisches Denken", "Räumliche Bildvorstellung", "Verbales Verständnis"],
     }
    )

reconciled = reconcile(test_df["TermE"], type_id="SubjectHeading", reconciliation_endpoint="https://lobid.org/gnd/reconcile/")

# save the results to a csv file:
test_df.to_csv("test_terms.csv")
reconciled.to_csv("reconciled_terms.csv")

# This is not going to work, our cts just have too different wording compared to their gnd subject headings!




# Use our authority records for insitutes to reconcile affiliations

At first, I thought it was a good idea to do this by exposing our csv files as reconc apis using this JAVA tool: https://okfnlabs.org/reconcile-csv/

It gives us a reconciliation API endpoint (on http://localhost:8000/reconcile) that one could use with the "reconciler" package, like this:


In [None]:
test_df = pd.DataFrame(
     {
         "Obstname": ["Äpfel", "Birne", "Himbeere"],
     }
    )

reconciled = reconcile(test_df["Obstname"], reconciliation_endpoint="http://localhost:8000/reconcile")
reconciled.to_csv("reconciled_fruit.csv")  


The above works, but we need to select which part to add to the record. It returns the label that was matched, a score, a match True/False (only True if score 1.0! Maybe we can lower the cutoff to 0.75?), and a type 

Also, this only works with dataframes, so whole tables, whereas we want to reconcile individual strings.

Idea: 
Instead of reconciling with the API, we can just import the CSV of the authority institutes (as a list of dicts) and use fuzzywuzzy to match a given affiliation string. 
[Fuzzywuzzy](https://pypi.org/project/fuzzywuzzy/) ("Fuzzy string matching in python") compares two strings and returns a score. We can use a cutoff of 75% to decide if the string matches the label.



# Import our authority institutes as CSV, use fuzzywuzzy to match a given affiliation string to them

Fuzzywuzzy, if passed a list with sublists for synonyms, also automatically looks in there. 

In [None]:

# from rapidfuzz import fuzz
# from rapidfuzz import process
import Levenshtein
from rapidfuzz.process import extractOne
from rapidfuzz.fuzz import ratio
from rapidfuzz.fuzz import token_set_ratio
import csv

# import csv file with dachlux institutes:
with open('institute_lux.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    # save it in a list:
    dachlux_institutes = list(reader)
    # split string "known_names" into a list of strings on "##":
    for institute in dachlux_institutes:
        institute["known_names"] = institute["known_names"].split(" ## ")
# print("Und die ganze Tabelle:")
# print(dachlux_institutes)


# affiliation_string = "Abteilung für Psychologie, University of Luxembourg, Esch-sur-Alzette"
# expected match: uuid: bfe28ac0-4901-4125-aaa6-c1fb6c644b7a, Centre de Prévention des Toxicomanies (CePT)

def match_local_authority_institutes(string, list):
    # this function takes a string and returns the best match from a list of strings
    # first, get the list of strings:
    # then, get the best match (token_set_ratio seems to be the best scorer for our purposes, so we use that):)):
    # It yields 100% for exact matches. It is also insensitive to word order differences.
    # best_match = process.extractOne(string, list, scorer=fuzz.token_set_ratio)
    # best_match = extractOne(string, list, scorer=token_set_ratio)
    return extractOne(string, list, scorer=token_set_ratio)
    # return best_match
    # or if i just wanted the value in "uuid":
    # return best_match[0].get("uuid")

hundeliste = ("Hundefriseur", "Hundefrise", "Hundefrisur")

match_local_authority_institutes("Frosch", dachlux_institutes[list==["known_names"]])
# print("Am besten passt:",match_local_authority_institutes("Fakultät für Geisteswissenschaften, Erziehungswissenschaften und Sozialwissenschaften; Universität Luxemburg", dachlux_institutes))
# print("Am besten passt:",match_local_authority_institutes("Department of Behavioral and Cognitive Sciences, University of Luxembourg, Esch-sur-Alzette", dachlux_institutes))



To use this in the script, we need the following pseudo-code:

- check if the affiliation string contains a country name
- if yes, check if the country name is in the list of countries (D, A, CH, LUX)
- only then use fuzzywuzzy to match the affiliation string to the list of institutes for that country

If we have separate csv files for each country, we can use the country name to select the right file.


# Use a ror id that we have to look up more data

For authority records, we can use the ror id (that we already looked up using the api with the affiliation parameter) to look up more data about the institute, like the country, the name, the aliases, etc.

In [None]:
# https://api.ror.org/organizations/00tjv0s33

# for getting data about a known id:
import requests_cache
from datetime import timedelta

ROR_API_LOOKUP_URL = "https://api.ror.org/organizations/"

urls_expire_after = {
}

session_ror = requests_cache.CachedSession(
    ".cache/requests",
    allowable_codes=[200, 404],
    expire_after=timedelta(days=30),
    urls_expire_after=urls_expire_after,
)

# For research institutes/departments, it makes sense to use ror-id for things that are true for the parent 
# organization as well as for the department: country code, the country name, the city. 

# Trouble is: we have no way to know if a reconciled ror-id is for a department or for its parent organization.
# if sb used their university as the affiliation, we would get the ror-id for the university, 
# but if sb has a department, we still get the ror-id for the university, and we can't say: 
# this is the ror-id for the parent or for this exact suborg. Maybe! There is a way to find out: 
# if it's a full match (not partial) then it is a "sameAs" relationship,
# if it's only a partial match to the org name, it's a "related" or "Child" relationship?.

# The response is a JSON object containing a full ROR record. 
# See [ROR data structure](https://ror.readme.io/docs/ror-data-structure) for details about the fields and values in a ROR record.
# we are interested in:
# acryonyms (list)
# aliases (list)
# country.country_code (string), country.country_name (string)
# addresses[0].city (string), 
# addresses[0].country_geonames_id -> can link to our own geographica's
# addresses[0].geonames_city.id (string), maybe: addresses[0].geonames_city.name (string)
# Allowed external IDs: 
#  Funder ID (FundRef), ISNI, Wikidata. Other external IDs not actively curated include GRID, OrgRef, HESA, UCAS, UKPRN, CNRS.
# external_ids.ISNI.preferred (string), or if "null": .external_ids.ISNI.all (array, or just use the first [0])
# external_ids.Wikidata.preferred (string), or if "null": .external_ids.Wikidata.all (array, or just use the first [0])
# relationships (array) mit jeweils:
#    .id (ror-id mit http-Vorspann der related org)
#    .type (string, eins von "Related", "Successor","Predecessor", "Parent", "Child")
#    .label (string, z.B. "Leibniz-Association")
# we also want the German name or name in other languages, if available. 
# it should be in .labels : labels[0].iso639 = "de", labels[0].label = "Leibniz-Gemeinschaft"
# and we can construct a langstring from it like "Leibniz-Gemeinschaft"@de


def get_ror_authority_data(ror_id):
    ror_api_url = ROR_API_LOOKUP_URL + ror_id
    # make a request to the ror api:
    # ror_api_request = requests.get(ror_api_url)
    # make request to api with caching:

    # put in a try/except block to catch timeouts:
    try:
        ror_api_request = session_ror.get(
            ror_api_url, timeout=20
        )
    except TimeoutError:
        print("Timeout!")
        return None
    else: 
        # if the request was successful, get the json response:
        if ror_api_request.status_code == 200:
            try:
                ror_api_response = ror_api_request.json()
            except:
                print("Error getting json response!")
                return None
            # if we got the response (try ran successfully), 
            # do something with it:
            else:
                try:
                    name = ror_api_response["name"]
                except:
                    name = None
                    print("Error getting name!")
                for acronym in ror_api_response["acronyms"]:
                    if acronym is not None:
                        acronym = acronym
                    else:
                        acronym = None
                city = ror_api_response["addresses"][0]["city"]
                geonames_city = ror_api_response["addresses"][0]["geonames_city"]["id"]
                country_code = ror_api_response["country"]["country_code"]
                country_name = ror_api_response["country"]["country_name"]
                external_ids = ror_api_response["external_ids"]

            return name,acronym, city, geonames_city, country_code, country_name, external_ids
        else:
            return None
    

# print(get_ror_authority_data("0165gz615")) # ZPID
print(get_ror_authority_data("02qnsw591"))
print(get_ror_authority_data("random1")) 

('Centre for European Economic Research', 'ZEW', 'Mannheim', 2873891, 'DE', 'Germany', {'ISNI': {'preferred': None, 'all': ['0000 0004 0492 4665']}, 'Wikidata': {'preferred': None, 'all': ['Q191206']}, 'GRID': {'preferred': 'grid.13414.33', 'all': 'grid.13414.33'}})
None


# Feld GRANT migrieren

- Vor allem Unterfeld |n - auftrennen! |i aufheben, und wenn möglich überflüssiges, was keine Nummer ist, wegwerfen.


Ergebnis soll so aussehen (ähnlich wie bei Crossref:, aber mit grant_name ähnoich DataCite, dort heißt es aber grantTitle ) :

```
{
    'funder': 
    {
        'funder_name': 'Sächsische Aufbaubank ^DDS Förder bank ^DDS (SAB)', 'funder_id': None
    }, 
        'grants': 
        [
            {
                'grant_number': '100362999 an YG', 
                'grant_name': None
            }
        ], 
        'funding_note': None
},
{'funder': {'funder_name': 'Institute for Applied Research, Development and Further Education (IAF) at the Catholic University of Applied Sciences in Freiburg', 'funder_id': None}, 'grants': None, 'funding_note': None},
{
    'funder': 
    {
        'funder_name': 'JSPS KAKENHI', 
        'funder_id': None
    }, 
    'grants': [
        {
            'grant_number': '15K00871', 
            'grant_name': None
        }, 
        {
            'grant_number': '18KK0055', 
            'grant_name': None
        }
    ], 
    'funding_note': None}
```

Note: may remove the general "funding_note" field, anywhere else, or move its contents as grant_name to each grant (as OpenAlex does). May rename 

In [None]:
n_strings = ("KND1: 01GI0102, 01GI0420, 01GI0422, 01GI0423, 01GI0429, 01GI0431, 01GI0433, 01GI0434; KNDD: 01GI0710, 01GI0711, 01GI0712, 01GI0713, 01GI0714, 01GI0715, 01GI0716",
             "HO5852/1-1","392443797","01GI1008C","801210010-20",
             "TA 857/3-2", "2016YFC1306800", "81671329", "18ZDA293", "17411969900",
             "20144Y0053", "SHDC12014111", "13dz2260500", "ZH2018QNB19",
             "2018-FX-04, 2013-YJGJ-03", "IIR-1303", 
             "01GL1714A; 01GL1714B; 01GL1714C; 01GL1714D, 01GY1613",
             "15K00871 and 18KK0055", "366/14", "386/14", "100362999 an YG"
             )

import rdflib



def extract_grant_numbers(subfield_n_string):
    # this function takes a string and returns a list of award numbers
    # first, split the string on "," or ";" or "and": (first replacing all semicolons and "ands" with commas)")
    subfield_n_string = subfield_n_string.replace(" and ", ", ")
    subfield_n_string = subfield_n_string.replace(";", ",")
    subfield_n_string = subfield_n_string.split(", ")
    # in each of the returned list elements, remove any substrings that are shorter 
    # than 5 characters (to get rid of things like " for" or "KDL: " YG: " etc.)
    # for element in subfield_n_string:
    #     if len(element) < 5:
    #         subfield_n_string.remove(element)
    # go through all the list elements and replace each with a dict,
    # which has a key "grant_number" and a key "grant_name" (which is None for now):
    for i, element in enumerate(subfield_n_string):
        subfield_n_string[i] = {"grant_number": element, "grant_name": None}
    # return the list of dicts:
    return subfield_n_string

# extract_grant_numbers(n_strings[0])



def build_grant_from_starfield(grantfield):
    # this function takes a string and returns a funder, grant number, grant name, grant holder
    # first, use anything before the first "|" as the funder:
    funder = {"funder_name": grantfield.split("|")[0].strip(), "funder_id": None}
    # then check the rest for a grant number:
    if "|n " in grantfield:
        grants = grantfield.split("|n ")[1].split(" |")[0]
        grants = extract_grant_numbers(grants)
    else:
        grants = None
    # then check the rest for a grant name:
    if "|i " in grantfield:
        funding_info = grantfield.split("|i ")[1].split(" |")[0]
    else:
        funding_info = None
    if "|e " in grantfield:
        funding_recipients = grantfield.split("|e ")[1].split(" |")[0]
        if funding_info is not None:
            funding_info = funding_info + ". Recipient(s): " + funding_recipients
        else:
            funding_info = "Recipient(s): " + funding_recipients
    # return a dict of the variables:
    return {"funder": funder, "grants": grants, "funding_note": funding_info}

GRANTs = (
"Deutsche Forschungsgemeinschaft |e L.M. |i \"Pragmatic Functions and Effects of Register Variation and Switch: a Register approach to negation and polarity\" (SFB 1412 \"Register\"; project number: 416591334)"
)

# print(build_grant_from_starfield(GRANTs[3]))

#for grant in GRANTs:
 #    pass
    #print(build_grant_from_starfield(grant))

print(build_grant_from_starfield(GRANTs))


In [None]:
from urllib.parse import urlencode
import requests_cache
from datetime import timedelta
from modules.mappings import dd_codes
from modules.mappings import funder_names_replacelist
import html
# from mappings import geonames_countries
# from mappings import abstract_origins

skip_these_grants = (
    "projekt deal", "project deal", "open access funding"
)



# set up friendly session by adding mail in request:
CROSSREF_FRIENDLY_MAIL = "&mailto=ttr@leibniz-psychology.org"
# for getting a list of funders from api ():
CROSSREF_API_URL = "https://api.crossref.org/funders?query="


def replace_encodings(text):
    for case in dd_codes:
        text = text.replace(case[0], case[1]) 
    return text

def replace_common_fundernames(funder_name):
    """This will accept a funder name that crossref api may not recognize, at least not as the first hit,
    and replace it with a string that will supply the right funder as the first hit"""
    # if the funder_name is in the list of funder names to replace (in index 0), then replace it with what is in index 1:
    for funder in funder_names_replacelist:
        if funder_name == funder[0]:
            funder_name = funder[1]
    return funder_name
    
urls_expire_after = {
    # Custom cache duration per url, 0 means "don't cache"
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/09183&lang=de': 0,
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/': 0,
}

session = requests_cache.CachedSession(
    ".cache/requests",
    allowable_codes=[200, 404],
    expire_after=timedelta(days=30),
    urls_expire_after=urls_expire_after,
)


def get_crossref_funder_id(funder_name):
    # this function takes a funder name and returns the crossref funder id for that funder name
    # to do this, use the crossref api.
    if funder_name.lower() in skip_these_grants:
        print("Skipping " + funder_name)
    else:
        funder_name = replace_common_fundernames(funder_name)
        # encode for url parameters (that is, remove any html entities with an & in front of them):
        #funder_name = html.unescape(funder_name)
        # construct the api url:
        crossref_api_url = CROSSREF_API_URL + funder_name + CROSSREF_FRIENDLY_MAIL
        # + CROSSREF_FRIENDLY_MAIL
        # make a request to the crossref api:
        # crossref_api_request = requests.get(crossref_api_url)
        # make request to api:
        try:
            crossref_api_request = session.get(
                crossref_api_url, timeout=20
            )
        except TimeoutError:
            print("Timeout!")
            return None
        else:
            # if the request was successful, get the json response:
            crossref_api_response = crossref_api_request.json()
            if crossref_api_request.status_code == 200 and crossref_api_response["message"]["total-results"] >=1:
                first_hit = f'10.23456/{crossref_api_response["message"]["items"][0]["id"]}'
            else:
                first_hit = "Funder Not found"
            return funder_name, first_hit


funderstrings = ("something&amp;Else",
                 "Bundesministerium für Bildung und Forschung (BMBF)",
                 "Federal Ministry of Education and Research (BMBF)",
                 "DFG", "Robert Bosch Foundation, Stuttgart, Germany","Robert Bosch Foundation","Robert Bosch Stiftung",
                 "German Research Foundation, Clinical Research Unit 256",
                 "German Research Society (DFG)",
                 "German Research Society (Deutsche Forschungsgemeinschaft)",
                 "DFG (German Research Foundation)", "German Research Society (Deutsche Forschungsgemeinschaft, DFG)",
                "German Research Council", 
               # "Berlin University Alliance",
                "Jacobs Foundation",
               # "Typhaine Foundation",
               # "European Commission",
                # "JSPS Overseas Research Fellowship",
                "German Research Society (DFG)",
              #  "Villigst e.V.",
                "Canada Research Chairs",
                "Projekt DEAL",
                "Natural Sciences and Engineering Research Council of Canada (NSERC)",
               # "Templeton Religion Trust",
               # "Austrian Science Fund (FWF)",
                "Netherlands Organisation for Scientific Research",
                "Advanced ERC grant",
               # "Vertretungsnetz",
               # "AOP Orphan","Angelini",
              #  "Science Foundation Ireland (SFI)",
              #  "Interdisciplinary Center for Clinical Research (IZKF) of the medical faculty of Münster"
                )

# import rdflib
# from rdflib import Graph, Literal, RDF, URIRef, Namespace
# from rdflib.namespace import SKOS, DC, DCTERMS, FOAF, OWL, RDF, RDFS, XSD

# # new namespace skosxl:
# SKOSXL = Namespace("http://www.w3.org/2008/05/skos-xl#")

# fundref_registry = Graph()
# fundref_registry.parse("crossref_fundref_registry.rdf", format="xml")   

# def crossref_local_lookup(funder_name):
#     # if the name is a skosxl:prefLabel/skosxl:Label/skosxl:literalForm in the fundref registry, then return the fundref id:
#     # first, check if the funder_name is a skosxl:prefLabel/skosxl:Label/skosxl:literalForm in the fundref registry:
#     # if it is, return the fundref id:
#     for s, p, o in fundref_registry.triples((None, SKOSXL.prefLabel, Literal(funder_name))):
#         print("Found " + funder_name + " in fundref registry as skosxl:prefLabel")
#         # print("Fundref id: " + s)
#         # return s

# for funder in funderstrings:
#     crossref_local_lookup(funder)
#     #print(funder)

for funder in funderstrings:
    #print("Funder: " + funder)
    print(get_crossref_funder_id(funder))
    print("")

('something&amp;Else', 'Funder Not found')

('Bundesministerium für Bildung und Forschung (BMBF)', '10.23456/501100002347')

('Federal Ministry of Education and Research (BMBF)', '10.23456/501100002347')

('Deutsche Forschungsgemeinschaft (DFG)', '10.23456/501100001659')

('Robert Bosch Foundation, Stuttgart, Germany', 'Funder Not found')

('Robert Bosch Foundation', '10.23456/501100001646')

('Robert Bosch Stiftung', '10.23456/501100001646')

('German Research Foundation, Clinical Research Unit 256', 'Funder Not found')

('Deutsche Forschungsgemeinschaft (DFG)', '10.23456/501100001659')

('Deutsche Forschungsgemeinschaft (DFG)', '10.23456/501100001659')

('DFG (German Research Foundation)', '10.23456/501100001659')

('Deutsche Forschungsgemeinschaft (DFG)', '10.23456/501100001659')

('Deutsche Forschungsgemeinschaft (DFG)', '10.23456/501100001659')

('Jacobs Foundation', '10.23456/501100006301')

('Deutsche Forschungsgemeinschaft (DFG)', '10.23456/501100001659')

('Canada Research Cha

# Getting conference info and transforming into a contribution

Currently in field CF. However, we have over 300 publications with CF that can be dropped. We only want to keep and transform the field for BE types SS and SM (books, either edited or authored).

We transform the content of the CF field and its subfields into a bf:Contribution, where the agent is a BF:Meeting, which conforms to RDA and Bibframe.
- Required: one conference name 
- Optional: one year (extracted from heterogeneous date strings in subfield |d)
- Optional: one place as a Literal/string (extracted from subfield |o). There will be no matching with geonames or our own place/cities authority, because there is no real use case for it, and because the data is too heterogeneous (sometimes a building in a city, sometimes a hosting institution, such as a university).
- Optional: one info field, extracted from subfield |b. This will also be used to hold any leftover complex dates from subfield |d.

Bibframe example:

```r
<ProceedingsWork> a bf:Work ;
    bf:contribution [a pxc:ConferenceReference ; # a bf:Contribution ;
        bf:agent [a bf:Meeting ;
            rdfs:label "Tagung der Arbeitsgemeinschaft Psychodynamischer Professorinnen und Professoren" ;
            bf:identifiedBy [a pxc:ConferenceDoi ;
                rdf:value "10.12344" ; 
                bf:source "TIB"; # oder so
            ] ;
            bflc:simplePlace "Berlin" ;
            bflc:simpleDate "2019"^^xsd:gYear ; 
        ] ;
        bf:role <http://id.loc.gov/vocabulary/relators/ctb> ;
        bf:note [a bf:Note ;
            rdfs:label "Date: 03.-04.10.2019, International Psychoanalytic University (IPU) Berlin" ;
        ] ;
    ]
    .
```

In [None]:
# sample strings from CF field:
cf_strings = {
    'Tagung der Group Analytic Society International (GASi) |d 2017 |o Berlin',
    'DVG-Fachtagung |o Frankfurt a. M. |b Vortrag "Krise! Welche Krise? Oder: Lernen ist immer eine Möglichkeit"',
    'Tagung der Arbeitsgemeinschaft Psychodynamischer Professorinnen und Professoren |d 03.-04.10.2019 |b International Psychoanalytic University (IPU) Berlin',
}

import re

def make_conference_node(string):
    # this function takes a string and returns a conference node
    # initialize the variables with None:
    conference_name = None
    conference_pid = None
    contribution_role = "conference"
    date = None
    year = None
    location = None
    conference_note = None
    # first, use the first part before the first "|" as the conference name:
    conference_name = string.split("|")[0].strip()
    # then check the rest for a date:
    try:
        date = string.split("|d ")[1].split(" |")[0]
    except:
        date = None
    else:
        # copy the date into conference_note:
        conference_note = "Date(s): " + date
        # check date for a year: anything with 4 digits anywhere in the date string is a year:
        # use a regex for finding YYYY pattern in any string:
        year_pattern = re.compile(r"\d{4}")
        # if there is a year in the date string, use that as the date:
        if year_pattern.search(date):
            year = year_pattern.search(date).group()
        else:
            year = None
    # then check the rest for a location:
    try:
        location = string.split("|o ")[1].split(" |")[0]
    except:
        location = None
    # then check the rest for a conference note:
    try:
        conference_note = conference_note + ". " + string.split("|b ")[1]
    except:
        conference_note = conference_note
    # return a dict of the variables:
    return {"conference_name": conference_name, "conference_pid": conference_pid, "year": year, "location": location, "conference_note": conference_note}

for cf_string in cf_strings:
    print(make_conference_node(cf_string))

{'conference_name': 'Tagung der Group Analytic Society International (GASi)', 'conference_pid': None, 'year': '2017', 'location': 'Berlin', 'conference_note': 'Date(s): 2017'}
{'conference_name': 'DVG-Fachtagung', 'conference_pid': None, 'year': None, 'location': 'Frankfurt a. M.', 'conference_note': None}
{'conference_name': 'Tagung der Arbeitsgemeinschaft Psychodynamischer Professorinnen und Professoren', 'conference_pid': None, 'year': '2019', 'location': None, 'conference_note': 'Date(s): 03.-04.10.2019. International Psychoanalytic University (IPU) Berlin'}


# Dealing with Bibliographic notes in field BN

## Splitting BN: "Original: " entries

We can use what is in BN of a translation to create relationships to the original publication - by splitting it into two fields, the title with edition and the provision activity statement string (place, publisher, year). This is useful for books.

So we can make do with two string fields for the new "related work" stuff - for originals of translations, at least!

Bibframe example:

```r
<TranslatedBookWork> a bf:Work ;
    # bf:title [a bf:Title ; rdfs:label "Qualitative Forschung." ] ; # actually, we should export the original title here, too, because it is the "real"/preferred title of the work. But do we have to? 
    bflc:relationship [a bflc:Relationship ; 
        bflc:relation relations:hasTranslation ;
        bf:translationOf [
            a bf:Work ; 
            bf:hasInstance [
                a bf:Instance ;
	                bf:title [a bf:Title ; rdfs:label "Qualitative Forschung. 3. überarb. Aufl." ] ; # we could try to find the language? 
	                bf:provisionActivityStatement "Reinbek: Rowohlt Taschenbuch Verlag, 1995" .
            ]
        ]
    ]
<Instance> 
```



In [None]:
original_strings = {
    "Original: 2008. Psychische stoornissen, gedragsproblemen en verstandelijke handicap. Een integratieve benadering voor kinderen en volwassenen. 3., bearb. Aufl. Assen: Van Gorcum",
    "Original: 2008. Feeling good together. The secret to making troubled relationships work. New York: Broadway Books",
    "Original: 1976. L'hystérique, le sexe et le médecin. Paris: Masson" ,
    "Original: 198O. Mindstorms. Children, Computer, and powerful ideas. New York: Basic Books" , # note: year has a letter!
    "Original: 1978. The child and his symptoms. Third edition. Oxford: Blackwell" ,
    "Original: 1975. Vys#s27aja nervnaja dejatel'nost' #c27eloveka motivacionno-emocional'nye aspekty. Moskau: Izdatel'stvo Nauka" ,
    # outliers:
    "Original: 1981. Crescere. Roma: Astrolabio-Ubaldini." , # dot at the end
    "Original. 2012. Doing Dialectical Behavior Therapy: A practical guide. New York: Guilford Press" , # dot instead of : after Original
    "Original. 1975. Psychology. Boston: Little, Brown and Company" , # same,
    "Deutschsprachiges Original: 2006. Motivation und Handeln. - 3., überarb. u. aktualis. Aufl. Berlin: Springer" , # has "Deutschsprachiges" at the start
    "Englische Übersetzung des Originals. 1995. Qualitative Forschung. Reinbek: Rowohlt Taschenbuch Verlag" , # has "Englische Übersetzung des Originals" at the start, dot instead of : after Original
    "Englische Übersetzung des Originals: 1989. Beziehungen und Probleme verstehen. Eine Einführung in die psychotherapeutische Plananalyse. Bern: Huber" , # same, bit has a colon after Original
    "Englische Übersetzung des deutschsprachigen Originals: 1998. Namenlos. Geistig Behinderte verstehen. - 3., überarb. Aufl.- Neuwied: Luchterhand", # same, but has "Englische Übersetzung des deutschsprachigen Originals" at the start and "-" instead of "." before the place
}

# - [ ] recognize language?!

import langid
langid.set_languages(["de", "en", "nl", "fr", "it", "ru"])

def guess_language(string_in_language):
    return (langid.classify(string_in_language)[0])

def split_bn_original(string):
    # this function takes a string and returns a dict with the original string and the bn string
    # first, remove the "Original: " from the string, if it exists - it is always at the start:
    # these are its variations:
    prefix_variations = ["Original: ", "Original. ", "Deutschsprachiges Original: ", 
                         "Englische Übersetzung des Originals. ", "Englische Übersetzung des Originals: ", 
                         "Englische Übersetzung des deutschsprachigen Originals: " ]
    # if string.startswith("Original: ") or string.startswith("Original. "):
    # check if the string starts with any of the variations:
        # redefine the string as the part after "Original: " (which is as long as the "original" prefix):
        # get the variation that is at the start of the string:
    for variation in prefix_variations:
        if string.startswith(variation):
        # redefine the string as the part after the variation:
            string = string[len(variation):]
            # then split out the year into a separate variable. It always comes first, has four characters, and is followed by a dot:
            year = string[:4]
            # if there is a O in the year, replace it with a 0:
            if "O" in year:
                year = year.replace("O", "0")
            # then split out the provision activity (place and publisher) into a separate variable. It always comes last, is preceded by a dot and has a colon between place and publisher:
            provision_activity = string.split(". ")[-1]
            # if it ends in a dot, remove it:
            if provision_activity.endswith("."):
                provision_activity = provision_activity[:-1]
            # then split the title and edition info into another variable. It is everything between the year and the provision activity. Keep any dots in the title, as they are part of the title, but strip any whitespace at the start or end:
            title = string[5:-len(provision_activity)-1].strip()
            # guess the language of the title:
            title_language = guess_language(title)
            # add content of year to the provision activity in a new variable provision_activity_statement:
            provision_activity_statement = provision_activity + ", " + year
            # print as a bibframe Instance with title and provision activity, but use f-strings:
            print(f"<Instance> a bf:Instance ;\n\tbf:title [a bf:Title ;\n\trdfs:label \"{title}\" {title_language} ] ;\n\tbf:provisionActivityStatement \"{provision_activity_statement}\" .")
            # return {"title": title, "provision_activity_statement": provision_activity_statement}

for counter, element in enumerate(original_strings):
    # print(split_bn_original(element))
    print(counter)
    split_bn_original(element)

## BN with other things:

Simple statements that are looking for their own field or vocabulary:
- "Offsetdruck" - 2241x - Hoschschulschrift gedruckt, also eher mehrere Exemplare. (wohin damit?) 
- "Schreibmaschinenfasung" (2858x!) - bedeutet: ist eine Hochschulschrift, die eher ein Manuskript ist, also nur wenie Exemplare vorliegen. bf:Manuscript zuordnen???
- Microfichefassung (52x)
- Loseblattsammlung (8x), "Loseblattausgabe", "Loseblattausgabe im Ordner", ""Loseblattausgabe im Ringbuchordner", "Loseblattausgabe in Ordner", 
- "Kumulative Dissertation" (3000x), 
    - "Kumulative Dissertation, bestehend aus mehreren Buch- und Zeitschriftenbeiträgen", 
    - "Kumulative Dissertation: (1) Pavel, F.-G. 1978. Die klientenzentrierte Psychotherapie. München: Pfeiffer; (2) Beitrag in: Spiel, W. (Ed.) 1980. Die Psychologie des 20. Jahrhunderts. Bd. 12. Zürich: Kindler. S. 844-864; (3) GwG-Info 1982, 47, 37-48; (4) GwG-Info 1983, 52, 7-27; (5) Zeitschrift f. Personenzentr. Psychol. u. Psychotherapie 1984, 3, 277-300"
    - "Kumulative Dissertation. _WEITERE ANGABEN_" (12x)

- "Buchausgabe" (1x)
- "Dissertation" (1x, BE: SR!)

Complex:
- ca 200x eine URL - oft DOI-Typ, Stichproben: das sind externe Supplements
- Gesprächsführung: 
    danach 1 Name: Vorname Nachname (als Contribution mit Rolle interviewer exportieren?)
    danach 2 Namen: Vorname Nachname, Vorname Nachname (als 2 Contributions mit Rolle interviewer exportieren?)
- als **Buchausgabe** - mit oder ohne Titel:
    - "Original als Buchausgabe erschienen: YYYY"
    - 'Original als Buchausgabe unter dem Titel "Psychologie und Neurophysiotherapie Vojtas. Ein Gruppenvergleich zwischen frühbehandelten und bisher unauffälligen Vorschulkindern" erschienen: 1982'
    - Original als Buchausgabe _unter dem Titel "Kinderpsychotherapien. Schulenbildung, Schulenstreit, Integration"_ erschienen: 1984
- als **Band** einer **Report-Reihe**:
    - Original als Band einer Report-Reihe erschienen: 1990 (Bad Tönissteiner Blätter. Beiträge zur Suchtforschung und -therapie. Schriftenreihe der Fachklinik Bad Tönisstein, Band 2, Heft 1)
    - Original als Band einer Report-Reihe erschienen: 2000. (Beiträge zur Arbeitsmarkt- und Berufsforschung, BeitrAB 235)
    - Original als Band einer Report-Reihe erschienen: 2000. (Forschungsbericht, Nr. 2000-27)
- als **Teil** einer **Report-Reihe**:
    - Original als Teil einer Report-Reihe erschienen: 1977
- als **Beitrag** in einem **Sammelwerk**:
    - Original als Beitrag in einem Sammelwerk erschienen: Holzkamp, Klaus (Ed.) 1979. Forum Kritische Psychologie 5

- Original als Zeitschriftenaufsatz erschienen: 2001. Should courts order PAS-children to visit/reside with the alienated parent? In: American Journal of Forensic Psychlogy, 19 (3), p. 61-106
- Original als on-line Version erschienen auf der Homepage der Zeitschrift Supervision: www.fpi-publikationen.de/supervision

- Anlage: (17x)
- Anhang (6x) ...
- Auszug aus: (158x)
- "Auszug aus " (5x) zb: 'dem 4. Kapitel von "Conceptual foundations of occupational therapy"'
- Auszüge aus einem Briefwechsel ... mit Andreas Wilhelm, Auckland/Neuseeland

- Auszüge aus dem Original: (gefolgt von YYYY. Titel. Ort: Verlag) (2x)
- Auswahl aus dem Original: (gefolgt von YYYY. Titel. Ort: Verlag)

- Ausgabe in X Bänden/Heften/Ringordner (54x)

- Abdruck aus: (2x)


- Published in: (12x)
- Also published (in:, under the title, in German language)
    - in:
        - Also published in: (5x)
        - Already published in: (2x)
        - Also published in German language in: (1x)
        - Auch erschienen in: (7x)
        - Außerdem erschienen in: (1x)
    - as:
        - Also published in German language under the title (1x)
        - Also published as: (1x)
        - Also published under the title (18x)

    - Bereits:
        - Bereits erschienen in: (646x)
        - Bereits in anderer Fassung erschienen in:  
        - Bereits in ausführlicherer Form erschienen in:
        - Bereits ausführlicher erschienen in:
        - Bereits einer kürzeren Fassung erschienen in:
        - Bereits leicht gekürzt in englischer Sprache erschienen in: 
        - Bereits in englischer Sprache erschienen in: 
        - Bereits in französischer Sprache erschienen in:
        - Bereits in holländischer Sprache erschienen in:
        - Bereits in niederländischer Sprache erschienen in:
        - Bereits in spanischer Sprache erschienen in:
        - Bereits in deutscher Sprache erschienen in:

    - Auch als Buchausgabe erschienen: 
    - Auch als Buchausgabe erschienen unter dem Titel: XXXX. YYYY. Ort: Verlag. ISBN
    - Auch als Buchausgabe unter dem Titel XXX erschienen:
    - Auch als Buchfassung unter dem Titel: "XXX" erschienen: YYYY. Ort: Verlag 
    - Als Buchausgabe unter dem Titel ...

- a correct (ion) (is published, was published, to this article was published) (48x)
- Aus dem Englischen übersetzt von (6x)
- Aus den ersten drei Kapiteln bestehende Übersetzung des Originals: 1955. The psychology of personal constructs. New York: Norton</BN>
- <BN>Aktualisierte Fassung des Originals: 1985. AIDS - vår framtid? Stockholm: Svenska Carnegie Institutet</BN>
- <BN>Als Autorenname sind die Initialen R. S. angegeben</BN>
- <BN>Als Erstveröffentlichung erschienen in: Der Gynäkologe, (1) 1986 unter dem Titel "Die weibliche Sexualität aus psychoanalytischer Sicht"</BN>
 


# Get Research Data from URLAI and DATAC:

We'll just put any contents of subfields |u and |d into one processing field, check what it actually is, and then sort it accordingly. 

In [None]:
from distutils.command import build
import html 
import re



def build_doi_identifier_node(doi):
    print(f"bf:identifiedBy > bf:Doi > rdf:value: {doi}.")


def build_electronic_locator_node(url):
    print(f"bf:electronicLocator > Res > rdf:value: {url}^^anyURI.")

def check_for_url_or_doi(string):
    """checks if the content of the string is a doi or url or something else.
       Returns the a string and a string_type (doi, url, unknown). The given string 
       is sanitized, eg. missing http protocol is added for urls; dois are stripped
       of web protocols and domain/subdomains like dx, doi.org)."""
    # first, # replace spaces with underscores:
    string = re.sub(' {2,}', ' ', string)
    string = re.sub(" ", "_", string)
    doi_pattern = re.compile(r"^(https?:)?(\/\/)?(dx\.)?doi\.org\/?(.*)$")
    if doi_pattern.search(string):
        # remove the matching part:
        string = doi_pattern.search(string).group(4)
        string_type = "doi"
        # print("DOI: " + doi)
    elif string.startswith("10."):
        # if the string starts with "10." the whole thing is a DOI:
        string_type = "doi"
        # print("DOI: " + doi)
        # proceed to generate an identifier node for the doi:
    else:
        # doi = None
        # check for validity of url using a regex:
        url_pattern = re.compile(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", re.IGNORECASE)
        if url_pattern.search(string):
            # if it's a nonstandard url starting with "//", add a "http:" protocol to the start:
            if string.startswith("//"):
                string = "http:" + string
            string_type = "url"
            # print("URL: " + datac_url)
        else:
            # url = None
            string_type = "unknown"
            # print("Das ist weder eine DOI noch eine URL: " + string)
    return string, string_type

def get_subfield(subfield_full_string, subfield_name):
        # split out the subfield:
        subfield = subfield_full_string.split(f"|{subfield_name} ")[1].split(" |")[0]
         # strip out any double spaces and replace them with single spaces:
        subfield = re.sub(' {2,}', ' ', subfield)
        return subfield 

def get_datac(datac_list):
    """Gets research data from field DATAC. 
Note: We define all data from this field as type "research data only, no code", and "open/unrestricted access"
Newer data from PSYNDEXER may be something else, but for first migration, we assume all data is research data only.
"""
    # for datac in record.findall("DATAC"):
    # go through the list of datac fields and get the doi, if there is one:
    for count, data in enumerate(datac_list):
        print(f"\n-- Datac field {count}:--")
        print("genre:researchData.")
        print("bf:usageAndAccessPolicy > bf:AccessPolicy")
        print("> rdfs:label: open access@en. > rdf:value 'http://purl.org/coar/access_right/c_abf2'^^xsd:anyURI")
        # first of all, get the text in that field, cleaning it from html entities in the process.
        # datac_field = datac.text.strip()
        datac_field = data.strip()
        # grab subfields u and d as strings and check if they are a url or a doi:
        for subfield_name in ("u", "d"):
            try: 
                subfield = get_subfield(datac_field, subfield_name)
            except:
                subfield = None
            else:
                # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
                # as a doi or url:
                # if it is a doi, run a function to generate a doi identifier node
                if check_for_url_or_doi(subfield)[1] == "doi":
                    build_doi_identifier_node(check_for_url_or_doi(subfield)[0])
                elif check_for_url_or_doi(subfield)[1] == "url":
                    build_electronic_locator_node(check_for_url_or_doi(subfield)[0])
                # if the returned typ is something else "unknown", do nothing with it:
                else:
                    print("bf:note > bf:Note > rdfs:label: " + subfield)
                    
datac = (
    "|u https://zenodo.org/record/160530 |d 10.5281/zenodo.160530",
    "|u https://dx.doi.org/10.5281/zenodo.160530 |d 10.5281/zenodo.160530",
    "|u http://dx.doi.org/10.1016/j.psyneuen.2015.11.018",
    "|u 10.3389/fpsyg.2020.01623",
    "|d 10.1016/j.jenvp.2020.101428",
    "|u http://webapps.ccns.sbg.ac.at/OpenData |d ",
    "|u //osf.io/nj6zt/?view  only=b78ad5411b4b4e^Dffa15dc2c5fee17d6e",
    "|u 123456789",
    "|u https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-018-25953-0/MediaObjects/41598 2018 25953 MOESM1 ESM.pdf",

)
        
def get_urlai(urlai_list):
    """Gets research data from field URLAI. This is always in PsychData, so it will be restricted access by default.
    We will also assume it to always be just research data, not code.
    """
    for count, data in enumerate(urlai_list):
        print(f"\n-- URLAI field {count}:--")
        print("genre:researchData.")
        print("bf:usageAndAccessPolicy > bf:AccessPolicy")
        print("> rdfs:label: restricted access@en. > rdf:value 'http://purl.org/coar/access_right/c_16ec'^^xsd:anyURI")
        urlai_field = data.strip()
        # there are no subfields in urlai, so let's just grab the whole thing and pass it on to the url or doi checker:
        # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
        # as a doi or url:
        # if it is a doi, run a function to generate a doi identifier node
        if check_for_url_or_doi(urlai_field)[1] == "doi":
            build_doi_identifier_node(check_for_url_or_doi(urlai_field)[0])
        elif check_for_url_or_doi(urlai_field)[1] == "url":
            build_electronic_locator_node(check_for_url_or_doi(urlai_field)[0])
        # if the returned typ is something else "unknown", do nothing with it:
        else:
            print("bf:note > bf:Note > rdfs:label: " + urlai_field)



urlais = ("http://dx.doi.org/10.5160/psychdata.stuh96ko20", 
          "https://doi.org/10.5160/psychdata.wfcn13ma18", 
          "https://osf.io/hafsx  view only", "12 23  56"
)

#get_datac(datac)

get_urlai(urlais)





-- URLAI field 0:--
genre:researchData.
bf:usageAndAccessPolicy > bf:AccessPolicy
> rdfs:label: restricted access@en. > rdf:value 'http://purl.org/coar/access_right/c_16ec'^^xsd:anyURI
bf:identifiedBy > bf:Doi > rdf:value: 10.5160/psychdata.stuh96ko20.

-- URLAI field 1:--
genre:researchData.
bf:usageAndAccessPolicy > bf:AccessPolicy
> rdfs:label: restricted access@en. > rdf:value 'http://purl.org/coar/access_right/c_16ec'^^xsd:anyURI
bf:identifiedBy > bf:Doi > rdf:value: 10.5160/psychdata.wfcn13ma18.

-- URLAI field 2:--
genre:researchData.
bf:usageAndAccessPolicy > bf:AccessPolicy
> rdfs:label: restricted access@en. > rdf:value 'http://purl.org/coar/access_right/c_16ec'^^xsd:anyURI
bf:electronicLocator > Res > rdf:value: https://osf.io/hafsx_view_only^^anyURI.

-- URLAI field 3:--
genre:researchData.
bf:usageAndAccessPolicy > bf:AccessPolicy
> rdfs:label: restricted access@en. > rdf:value 'http://purl.org/coar/access_right/c_16ec'^^xsd:anyURI
bf:note > bf:Note > rdfs:label: 12 23  5

It's best if we had a generic function that builds a research data node.

If passed a URLAI field, we build one with restricted access,
if passed a DATAC field, we build one with open access.

Otherwise, they should be treated the same! How can this be done? With a parameter?

When calling the function (we call it twice, one for all URLAIs, once for all DATACs?)?

Or we could call it once, and it will go through all URLAIs and DATACs of the record?

What should be in the function, anyway:

- build a bnode for the relationship, add relation, 
- build a bnode for the supplement work
- build a bnode for the supplement instance
- add the doi identifier node or electroniclocator, depending on url or doi, or a note if it's something else
- add the usage and access policy, which will be different depending on the source (URLAI or DATAC)
- URLAIs don't have subfields, but DATACs do. 

maybe we should make a function that generates relationships, and then reuse it for all kinds of fields?
Depending on the field type, we can make some changes (the relation, the genre of the related work)

In [None]:
from rdflib import Graph, Literal
from rdflib.namespace import RDF, RDFS, XSD, Namespace
from rdflib import BNode
from rdflib import URIRef

BF = Namespace("http://id.loc.gov/ontologies/bibframe/")
BFLC = Namespace("http://id.loc.gov/ontologies/bflc/")
MADS = Namespace("http://www.loc.gov/mads/rdf/v1#")
SCHEMA = Namespace("https://schema.org/")
WORKS = Namespace("https://w3id.org/zpid/resources/works/")
INSTANCES = Namespace("https://w3id.org/zpid/resources/instances/")
PXC = Namespace("https://w3id.org/zpid/ontology/classes/")
PXP = Namespace("https://w3id.org/zpid/ontology/properties/")
LANG = Namespace ("http://id.loc.gov/vocabulary/iso639-2/")
LOCID = Namespace("http://id.loc.gov/vocabulary/identifiers/")
ROLES = Namespace("https://w3id.org/zpid/vocabs/roles/")
RELATIONS = Namespace("https://w3id.org/zpid/vocabs/relations/")

records_bf = Graph()

records_bf.bind("bf", BF) 
records_bf.bind("bflc", BFLC) 
records_bf.bind("works", WORKS)  
records_bf.bind("instances", INSTANCES) 
records_bf.bind("pxc", PXC) 
records_bf.bind("pxp", PXP) 
records_bf.bind("lang", LANG) 
records_bf.bind("schema", SCHEMA) 
records_bf.bind("locid", LOCID) 
records_bf.bind("mads", MADS) 
records_bf.bind("roles", ROLES) 
records_bf.bind("relations", RELATIONS)

relation_types = {
    "rd_open_access": {
        "relation": "hasResearchData",
        "relatedTo_subprop": "supplement",
        "work_subclass": "Dataset",
        "content_type": "dataset",
        "genre": "researchData",
        "access_policy_label": "open access",
        "access_policy_value": "http://purl.org/coar/access_right/c_abf2"
    },
    "rd_restricted_access": {
        "relation": "hasResearchData",
        "relatedTo_subprop": "supplement",
        "work_subclass": "Dataset",
        "content_type": "dataset",
        "genre": "researchData",
        "access_policy_label": "restricted access",
        "access_policy_value": "http://purl.org/coar/access_right/c_16ec"
    },
}

import re



def build_doi_identifier_node(instance, doi):
    # print(f"bf:identifiedBy > bf:Doi > rdf:value: {doi}.")
    # make bnode for the identifier:
    identifier_node = BNode()
    # give it class bf:Doi:
    records_bf.add((identifier_node, RDF.type, BF.Doi))
    # give it the doi as a literal value:
    records_bf.add((identifier_node, RDF.value, Literal(doi)))
    # attach it to the instance with bf:identifiedBy:
    records_bf.add((instance, BF.identifiedBy, identifier_node))


def build_electronic_locator_node(instance, url):
    locator_node = BNode()
    # add it to the instance_node of relationship_node via bf:electronicLocator:
    # no specific class!
    # give it the url as a literal value:
    records_bf.set((locator_node, RDF.value, Literal(url, datatype=XSD.anyURI)))
    # attach it to the instance with bf:electronicLocator:
    records_bf.set((instance, BF.electronicLocator, locator_node))

def build_note_node(instance, note):
    note_node = BNode()
    records_bf.set((note_node, RDF.type, BF.Note))
    records_bf.set((note_node, RDFS.label, Literal(note)))
    records_bf.set((instance, BF.note, note_node))

def check_for_url_or_doi(string):
    """checks if the content of the string is a doi or url or something else.
       Returns the a string and a string_type (doi, url, unknown). The given string 
       is sanitized, eg. missing http protocol is added for urls; dois are stripped
       of web protocols and domain/subdomains like dx, doi.org)."""
    # first, # replace spaces with underscores:
    string = re.sub(' {2,}', ' ', string)
    string = re.sub(" ", "_", string)
    doi_pattern = re.compile(r"^(https?:)?(\/\/)?(dx\.)?doi\.org\/?(.*)$")
    if doi_pattern.search(string):
        # remove the matching part:
        string = doi_pattern.search(string).group(4)
        string_type = "doi"
        # print("DOI: " + doi)
    elif string.startswith("10."):
        # if the string starts with "10." the whole thing is a DOI:
        string_type = "doi"
        # print("DOI: " + doi)
        # proceed to generate an identifier node for the doi:
    else:
        # doi = None
        # check for validity of url using a regex:
        url_pattern = re.compile(r"[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", re.IGNORECASE)
        if url_pattern.search(string):
            # if it's a nonstandard url starting with "//", add a "http:" protocol to the start:
            if string.startswith("//"):
                string = "http:" + string
            string_type = "url"
            # print("URL: " + datac_url)
        else:
            # url = None
            string_type = "unknown"
            # print("Das ist weder eine DOI noch eine URL: " + string)
    return string, string_type




        
def get_urlai(work_uri, urlai_list):
    """Gets research data from field URLAI. This is always in PsychData, so it will be restricted access by default.
    We will also assume it to always be just research data, not code.
    """
    for data in urlai_list:
        # print(f"\n-- URLAI field {count}:--")
        # print("genre:researchData.")
        # print("bf:usageAndAccessPolicy > bf:AccessPolicy")
        # print("> rdfs:label: restricted access@en. > rdf:value 'http://purl.org/coar/access_right/c_16ec'^^xsd:anyURI")
        urlai_field = data.strip()
        doi_set = set()
        #build the relationship node:
        relationship_node, instance = build_work_relationship_node(work_uri, relation_type="rd_restricted_access") 
        # there are no subfields in urlai, so let's just grab the whole thing and pass it on to the url or doi checker:
        # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
        # as a doi or url:
        # if it is a doi, run a function to generate a doi identifier node
        if check_for_url_or_doi(urlai_field)[1] == "doi":
            # build_doi_identifier_node(instance,check_for_url_or_doi(urlai_field)[0])
            doi_set.add(check_for_url_or_doi(urlai_field)[0])
        elif check_for_url_or_doi(urlai_field)[1] == "url":
            build_electronic_locator_node(instance, check_for_url_or_doi(urlai_field)[0])
        # if the returned typ is something else "unknown", do nothing with it:
        else:
            # print("bf:note > bf:Note > rdfs:label: " + urlai_field)
            build_note_node(instance, check_for_url_or_doi(urlai_field)[0])

        # loop through the set to build doi nodes, so we won't have duplicates:
        for doi in doi_set:
            build_doi_identifier_node(instance, doi)
        # now attach the finished node for the relationship to the work:
        records_bf.add((work_uri, BFLC.relationship, relationship_node))



urlais = ("http://dx.doi.org/10.5160/psychdata.stuh96ko20", 
          "https://doi.org/10.5160/psychdata.wfcn13ma18", 
          "https://osf.io/hafsx  view only", "12 23  56"
)

def build_work_relationship_node(work_uri, relation_type):
    # check the relation_type against the relation_types dict:
    if relation_type in relation_types:
        # if it is, get the values for the relation_type:
        relation = relation_types[relation_type]["relation"]
        relatedTo_subprop = relation_types[relation_type]["relatedTo_subprop"]
        work_subclass = relation_types[relation_type]["work_subclass"]
        content_type = relation_types[relation_type]["content_type"]
        genre = relation_types[relation_type]["genre"]
        access_policy_label = relation_types[relation_type]["access_policy_label"]
        access_policy_value = relation_types[relation_type]["access_policy_value"]
    # make a bnode for this relationship:
    relationship_bnode = BNode()
    # make it class bflc:Relationship:
    records_bf.set((relationship_bnode, RDF.type, BFLC.Relationship))
    # add a bflc:Relation (with a label and value) via bflc:relation to the relationship bnode 
    # (label and value could be given as a parameter):
    # print("\tbflc:relation [a bflc:Relation ; rdfs:label 'has research data', rdf:value 'relation:hasResearchData'^^xsd:anyURI] ;")
    # relation_bnode = BNode()
    # records_bf.set((relation_bnode, RDF.type, BFLC.Relation))
    # records_bf.add((relation_bnode, RDFS.label, Literal("has research data")))
    # records_bf.add((relation_bnode, RDF.value, Literal(RELATIONS.hasResearchData)))
    records_bf.set((relationship_bnode, BFLC.relation, URIRef(RELATIONS[relation])))
    # make a bnode for the work:
    related_work_bnode = BNode()
    records_bf.add((related_work_bnode, RDF.type, BF.Work))
    records_bf.add((related_work_bnode, RDF.type, URIRef(BF[work_subclass])))
    # give work a content type:
    records_bf.add((related_work_bnode, BF.content, Literal(content_type)))
    # and a genre:
    records_bf.add((related_work_bnode, BF.genre, Literal(genre)))
    # attach the work bnode to the relationship bnode with bf:relatedTo 
    # (or a subproperty as given as a parameter)):
    # print("\tbf:relatedTo [a bf:Work ;")
    records_bf.add((relationship_bnode, BF[relatedTo_subprop], related_work_bnode))
    # make a bnode for the instance:
    related_instance_bnode = BNode()
    records_bf.set((related_instance_bnode, RDF.type, BF.Instance))
    records_bf.add((related_instance_bnode, RDF.type, BF.Electronic))
    # attach the instance to the work bnode via bf:hasInstance:
    #print("\t\tbf:hasInstance [a bf:Instance ;")
    records_bf.add((related_work_bnode, BF.hasInstance, related_instance_bnode))
    # add accesspolicy to instance:
    access_policy_node = BNode()
    records_bf.add((access_policy_node, RDF.type, BF.AccessPolicy))
    records_bf.add((access_policy_node, RDFS.label, Literal(access_policy_label, lang="en")))
    records_bf.add((access_policy_node, RDF.value, Literal(access_policy_value, datatype=XSD.anyURI)))
    records_bf.add((related_instance_bnode, BF.usageAndAccessPolicy, access_policy_node))
    # insert dois and/or urls:
    #
    # print("\t\t\t]") # end instance
    # in the end, return the relationship bnode so it can be attached to the work
    # records_bf.add((work_uri, BFLC.relationship, relationship_bnode))
    return relationship_bnode, related_instance_bnode

def get_datac(work_uri, datac_list):
    """Gets research data from field DATAC. 
Note: We define all data from this field as type "research data only, no code", and "open/unrestricted access"
Newer data from PSYNDEXER may be something else, but for first migration, we assume all data is research data only.
"""
    # for datac in record.findall("DATAC"):
    # go through the list of datac fields and get the doi, if there is one:
    for data in datac_list:
        datac_field = data.strip()
        # add an item "hello" to the set:
        #build the relationship node:
        relationship_node, instance = build_work_relationship_node(work_uri, relation_type="rd_open_access") 

        # we want to drop any duplicate dois that can occur if datac has a doi and doi url (same doi, but protocol etc prefixed) 
        # for the same data that,
        # after conversion, ends up being identical. So we make a set of dois,
        # which we will add dois to, and then later loop through the set (sets are by defintion list with only unique items!):
        doi_set = set()
        # grab subfields u and d as strings and check if they are a url or a doi:
        for subfield_name in ("u", "d"):
            try: 
                subfield = get_subfield(datac_field, subfield_name)
            except:
                subfield = None
            else:
                # if the string_type returned [1] is doi or url, treat them accordingly, using the returned string [0]
                # as a doi or url:
                # if it is a doi, run a function to generate a doi identifier node
                if check_for_url_or_doi(subfield)[1] == "doi":
                    # add the doi to a list:
                    doi_set.add(check_for_url_or_doi(subfield)[0])
                    #build_doi_identifier_node(instance, check_for_url_or_doi(subfield)[0])
                elif check_for_url_or_doi(subfield)[1] == "url":
                    build_electronic_locator_node(instance, check_for_url_or_doi(subfield)[0])
                    # if the returned typ is something else "unknown", do nothing with it:
                else:
                    # print("bf:note > bf:Note > rdfs:label: " + subfield)
                    build_note_node(instance, check_for_url_or_doi(subfield)[0])
        # doi_set = set(doi_list)
        # print(doi_list)
        # print(doi_set)
        for doi in doi_set:
            build_doi_identifier_node(instance, doi)
                
                
        # now attach the finished node for the relationship to the work:
        records_bf.add((work_uri, BFLC.relationship, relationship_node))
                    
datac = (
    "|u https://zenodo.org/record/160530 |d 10.5281/zenodo.160530",
    "|u https://dx.doi.org/10.5281/zenodo.160531 |d 10.5281/zenodo.160531",
    "|u http://dx.doi.org/10.1016/j.psyneuen.2015.11.018",
    "|u 10.3389/fpsyg.2020.01623",
    "|d 10.1016/j.jenvp.2020.101428",
    "|u http://webapps.ccns.sbg.ac.at/OpenData |d ",
    "|u //osf.io/nj6zt/?view  only=b78ad5411b4b4e^Dffa15dc2c5fee17d6e",
    "|u 123456789",
    "|u https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-018-25953-0/MediaObjects/41598 2018 25953 MOESM1 ESM.pdf",

)

## now for the main program:
# make a Work node:
work_uri = URIRef(WORKS["123456789"])
records_bf.add((work_uri, RDF.type, BF.Work))
# call the function to build a relationship node, which should attach the relationship node to the work node:
# build_work_relationship_node(work_uri, relation_type="rd_open_access") 

# it's better to call the build_relationshjp_node function from within the urlai and datac functions. i supppose.
# get_urlai(work_uri, urlais)
get_datac(work_uri, datac)

# serialize the graph:
print(records_bf.serialize("dois.ttl",format="turtle"))

[a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'Memory']].


In [None]:
url = "https://osf.io/atc48/"
doi = "10.17605/OSF.IO/ATC48"
#doi = "10.17605/ui/ATC48"
if "osf.io" in url and "OSF.IO/" in doi and doi.split("/")[2].lower() in url:
    print(f"duplicate doi in url {url}: {doi.split('/')[2]} from {doi}. Removing url in favor of doi.")
else:
    print("no")
    

yes, https://osf.io/atc48/ contains ATC48


In [None]:
from operator import ne
import re

def orcid_checker(orcid):
    """Checks if an orcid is valid. Returns True if valid, False if not."""
    # first, check if other stuff is at the beginning of the string - if it starts with either "/" or "https://orcid.org/" or "orcid.org/" - then strip that out, using regex:
    # orcid = re.sub(r"^(\/|https?:\/\/(orcid\.)?org\/)?", "", orcid)

    # then remove any spaces:
    orcid = orcid.replace(" ", "")

    # then check if it is a valid orcid by using a regex, which also checks if it starts with "http(s)://orcid.org/", orcid/org/", or a "/" and removes these":
    orcid_pattern = re.compile(r"^(https?:\/\/(orcid\.)?org\/)?(orcid\.org\/)?(\/)?([0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X])$")
    if orcid_pattern.search(orcid):
        # if it is, remove the matching part:
        orcid = orcid_pattern.search(orcid).group(5)
        print(f"after corrections, {orcid} is valid.")
    else:
        print(f"{orcid} is not a valid orcid.")

        

orcid_list = (
    "0000-0002-818X-844X", # incorrect, because X in third group
    "0000-0001-6000- 0967", # a correct orcid, but with a space inside: catch and correct, maybe look if it exists
    "0000-0002- 0089-7618", # as above
    "/0000-0003-4757-1460", # as above, but with a slash (c&p error)
    "/0000-0001-8112-0837", # as above
    "Velten, Julia", # unacceptable, drop
    "10.1007/978-3-658-10947-9 21-1", # unacceptable, drop
    "10.1026/0033-3042/a000591",
    "10.1026/0049-8637/a000177",
    "orcid.org/0000-0002-8181-844X", # remove orcid.org/ and check if valid orcid
    "https://orcid.org/0000-0002-1397-0060", # remove https://orcid.org/ and check if valid orcid
    "https://orcid.org/0000-0003-1342-7006",
    "https://orcid.org/0000-00 01-9885- 3252",
)

for orcid in orcid_list:
    orcid_checker(orcid)


orcid_field = (
    "0000-0003-3359-6157 |u Zinke, Alexander", # switch name and orcid
    "0000-0002-0350-1359 |u Soloviev, Andrey G.",
    "0000-0001-8311-1184 |u Peseschkian, Hamid"
    "Knaevelsrud, Christine |u Knaevelsrud, Christine", # drop orcid
    "Heinzel, Carlotta V. |u 0000-0002-2619-913X" # correct, leave as is
)




0000-0002-818X-844X is not a valid orcid.
after corrections, 0000-0001-6000-0967 is valid.
after corrections, 0000-0002-0089-7618 is valid.
after corrections, 0000-0003-4757-1460 is valid.
after corrections, 0000-0001-8112-0837 is valid.
Velten,Julia is not a valid orcid.
10.1007/978-3-658-10947-921-1 is not a valid orcid.
10.1026/0033-3042/a000591 is not a valid orcid.
10.1026/0049-8637/a000177 is not a valid orcid.
after corrections, 0000-0002-8181-844X is valid.
after corrections, 0000-0002-1397-0060 is valid.
after corrections, 0000-0003-1342-7006 is valid.
after corrections, 0000-0001-9885-3252 is valid.


In [17]:
values = ({"tp": 0, "fp": 1},
          {"tp": 1, "fp": 1},
            {"tp": 0, "fp": 18},
            {"tp": 1, "fp": 18},
          )

modifier = 1

def modified_precision(tp, fp, modifier):
    """Calculates precision from true positives and false positives."""
    return (tp+modifier / (tp + modifier + fp))

for value in values:
    print(f'tp: {value["tp"]}, fp: {value["fp"]}, mod: {modifier} - prec: {modified_precision(value["tp"], value["fp"], modifier)}')

tp: 0, fp: 1, mod: 1 - prec: 0.5
tp: 1, fp: 1, mod: 1 - prec: 1.3333333333333333
tp: 0, fp: 18, mod: 1 - prec: 0.05263157894736842
tp: 1, fp: 18, mod: 1 - prec: 1.05


# Getting and modelling links to replicated studies in field RPLIC

Es gibt keine Subfelder. Im Hauptfeld steht meist ein Zitationsstring. Er kann manchmal auch eine DOI enthalten, entwerder ausschlielßich oder am Ende der Zitation (oder irgendwo dazwischen).
Oder auch eine URL, zB zu OSF.

Am einfachsten könnte sein, die Zitation an Crossref weiterzugeben und sich eine DOI dafür zurückliefern zu lassen.


In [1]:
# Looking up a citation string from RPLIC and getting the corresponding DOI from crossref:
import re
import requests
import requests_cache
import json
from datetime import timedelta
import modules.mappings as mappings
from fuzzywuzzy import fuzz


CROSSREF_FRIENDLY_MAIL = "&mailto=ttr@leibniz-psychology.org"
# for getting a list of funders from api ():
CROSSREF_API_URL = "https://api.crossref.org/works?query="
    
urls_expire_after = {
    # Custom cache duration per url, 0 means "don't cache"
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/09183&lang=de': 0,
    # f'{SKOSMOS_URL}/rest/v1/label?uri=https%3A//w3id.org/zpid/vocabs/terms/': 0,
}

session = requests_cache.CachedSession(
    ".cache/requests",
    allowable_codes=[200, 404],
    expire_after=timedelta(days=30),
    urls_expire_after=urls_expire_after,
)

def get_subfield(subfield_full_string, subfield_name):
    """Given a string that contains star subfields (|name ) and the name of the subfield,
    e.g. i for |i, return the content of only that subfield as a string."""
    # first, make sure that the extracted substring is not None, not empty or completely comprised of spaces:
    if subfield_full_string is not None and subfield_full_string != "":
        # strip out any double spaces and replace with single space, also strip spaces around:
        subfield_full_string = re.sub(" {2,}", " ", subfield_full_string.strip())
        # split out the content of the field - from the first |name to either the next | or the end of subfield_full_string:
        subfield = None
        # check if the subfield is in the string:
        if f"|{subfield_name}" in subfield_full_string:
            # if it is, split the string on the subfield name:
            subfield = subfield_full_string.split(f"|{subfield_name}")[1].strip()
            # end the string at the next | or the end of the string:
            subfield = subfield.split("|")[0].strip()
        # subfield = subfield_full_string.split(f"|{subfield_name}")[1].strip().split("|")[0].strip()
        # print(subfield)
        if subfield != "" and subfield is not None:
            return html.unescape(mappings.replace_encodings(subfield))
        else:
            return None


def get_mainfield(field_fullstring):
    """Given a string extracted from a star field that may have substrings or not, return the content of
    the main field as a string - either to the first |subfield or the end of the field, if no subfields.
    """
    # first, make sure that the extracted substring is not None, not empty or completely comprised of spaces:
    if field_fullstring is not None and field_fullstring != "":
        # strip out any double spaces and replace with single space, also strip spaces around:
        field_fullstring = re.sub(" {2,}", " ", field_fullstring.strip())
        # split out the content of the field - to the first | or the end of subfield_full_string:
        field = None
        # check if a subfield is in the string:
        if f"|" in field_fullstring:
            # if it is, return the part before it:
            field = field_fullstring.split("|")[0].strip()
        else:
            # if not, return the whole string:
            field = field_fullstring.strip()
        if field != "" and field is not None:
            return html.unescape(mappings.replace_encodings(field))
        else:
            return None

rplics = (
    "Testeintrag, wieder loeschen",
    "https://osf.io/kv65n/",
    "Timmer, K., Calabria, M., Branzi, F. M., Baus, C., &amp; Costa, A. (2018). On the reliability of switching costs  across time and domains. Frontiers in Psychology, 9, 1032.",
    "Stumbrys, T., &amp; Erlacher, D. (2014). The science of lucid dream induction. In R. Hurd, &amp; K. Bulkeley (Eds.). Lucid dreaming: New perspectives on consciousness in sleep (pp. 77^DDS96). Westport: Praeger.",
    "Mattler, U., &amp; Fendrich, R. (2010). Consciousness mediated by neural transition states: How invisiblyrapid motions can become visible.Consciousness and Cognition,19, 172^DDS185. https://doi.org/10.1016/j.concog.2009.12.015",
    "Dreßing H, Kuehner C, Gass P: Lifetime prevalence and impact of stalking in a European population: epidemiological data from a middlesized German city. Br J Psychiat 2005; 187: 168^DDS72",
    "Daw ND, O^D&gt;'Doherty JP, Dayan P, Seymour B, Dolan RJ. 2006. Cortical substrates for exploratory decisions inhumans.Nature441:876^DDS879.DOI: https://doi.org/10.1038/nature04766,PMID: 16778890",
    "Kahneman, D., &amp; Tversky. A. (1979). Prospect theory: An analysis of decision under risk. Econometrica, 47(2), 263-292.",
    "Saalbach, H., Eckstein, D., Andri, N., Hobi, R., &amp; Grabner, R. H. (2013). When language of instruction and language of application differ: Cognitive costs of bilingual mathematics learning. Learning and Instruction, 26, 36-44.",
    "Grabner, R. H., Saalbach, H., &amp; Eckstein, D. (2012). Language‐Switching Costs in Bilingual Mathematics Learning. Mind, Brain, and Education, 6(3), 147-155.",
    "Eskine, K. J., Kacinik, N. A., &amp; Prinz, J. J. (2011). A bad taste in the mouth: Gustatory disgustinfluences moral judgment.Psychological Science,22(3), 295^DDS299.",
    "Head, D., &amp; Isom, M. (2010). Age effects on wayfinding and route learn-ing skills.Behavioural Brain Research,209,49^DDS58.",
    "Rütsche, B., Hauser, T. U., Jäncke, L., and Grabner, R. H. (2015). Whenproblem size matters: differential effects of brain stimulation on arithmeticproblem  solving  and  neural  oscillations. PLoS ONE10:e0120665.doi: 10.1371/journal.pone.0120665",
    "https://doi.org/10.1101/470435",
    "Armstrong, T., Bilsky, S. A., Zhao, M., &amp; Olatunji, B. O. (2013). Dwellingon potential threat cues: An eye movement marker for combat-relatedPTSD.Depression and Anxiety, 30,497^DDS502.",
    "Kugler, K. G., Reif, J. A. M., Kaschner, T., &amp; Brodbeck, F. C. (2018). Gender differences in the initiation of negoti-ations: A meta-analysis.Psychological Bulletin,144, 198^DDS222.",
    "Tietze, W., Becker-Stoll, F., Bensel, J., Eckhardt, A. G., Haug-Schnabel, G., Kalicki, B.,. . . &amp; Leyendecker, B. (2013). Nationale Untersuchung zur Bildung, Betreuung undErziehung in der frühen Kindheit (NUBBEK). Weimar: verlag das netz.",
    "Korb, F. M., Jiang, J., King, J. A., &amp; Egner, T. (2017). Hierarchicallyorganized medial frontal cortex-basal ganglia loops selectively controltask- and response-selection.Journal of Neuroscience: The Official Journal of the Society for Neuroscience, 37,7893^DDS7905.http://dx.doi.org/10.1523/JNEUROSCI.3289-16.2017",
    "Langguth,  B. et al. Tinnitus severity, depression, and the big five personality traits. In Langguth, B., Hajak, G., Kleinjung, T., Cacace, A. &amp; Møller, A. R. (eds.) Progress in Brain Research, vol. 166 of Tinnitus: Pathophysiology and Treatment, 221^DDS225.",
    "Schmitz, J., Krämer, M., Blechert, J., &amp; Tuschen-Caffier, B. (2010). Post-event processing in children with social phobia.Journal ofAbnormal Child Psychology, 38(7), 911^DDS919.",
    "Gamble, T., &amp; Walker, I. (2016). Wearing a bicycle helmet can increase risk taking and sensation seeking in adults. Psychological Science, 27(2),  289^DDS294.  https ://doi.org/10.1177/09567 97615 620784",
    'Sprenger,  L.,  Becker,  K.,  Heinzel-Gutenbrunner,  M.,  Mingebach,  T.,  Otterbach,  S.,  Peters,  M.,  &amp;  Kamp-Becker,  I.  (2015).  Is  the  ^D&lt;,Stepping  Stones/Triple  P^D&gt;"-parenting  program  a  reason-able, additional intervention in the treatment of children with an autism spectrum disorder? Kindheit und Entwicklung, 24, 28^DDS36',
    "Hagan  CC,  Graham  JM,  Tait  R,  et  al.  Adolescents  with  current  major  depressive  disorder  show  dissimilar  patterns  of  age-related  differences in ACC and thalamus. Neuroimage Clin 2015;7:391-9.",
    "Jaworska  N,  Yucel  K,  Courtright  A,  et  al.  Subgenual  anterior  cin-gulate  cortex  and  hippocampal  volumes  in  depressed  youth:  the  role of comorbidity and age. J Affect Disord 2016;190:726-32.",
    "Tran, U. S., Glück, T. M., &amp; Nader, I. W. (2013). Investigating the FiveFacet Mindfulness Questionnaire (FFMQ): construction of a shortform and evidence of a two-factor higher order structure of mind-fulness.Journal of Clinical Psychology, 69,951^DDS965.https://doi.org/10.1002/jclp.21996.",
    " Rand, D. G., Greene, J. D., &amp; Nowak, M. A. (2012). Spontaneous giving and calculated greed. Nature, 489, 427^DDS430",
    )

def build_replicated_study_relationship_node(rplic):
        # extract subfields:
    try:
        replicated_doi = get_subfield(rplic, "d")
    except:
        replicated_doi = None
    try:
        replicated_url = get_subfield(rplic, "u")
    except:
        replicated_url = None
    try:
        replicated_citation = get_mainfield(rplic)
    except:
        replicated_citation = None
    # if there is a url, add it as bf:electronicLocator:
    if replicated_url:
        print(f"URL: {replicated_url}")
    # if there is a doi, add it as bf:identifier:
    if replicated_doi:
        print(f"DOI: {replicated_doi}")
    # if there is a citation, look it up in crossref:
    if replicated_citation:
        print(f"Citation: {replicated_citation}")
        fetch_doi_from_citation_string(replicated_citation)
        print(f"DOI: {doi}")
        print(f"URL: https://doi.org/{doi}")
        
        

def fetch_doi_from_citation_string(citation):
    """Given a citation string, look up the DOI in crossref. Returns the citation string and the DOI."""
    # clean up ^DD codes with mappings.replace_encodings:
    citation = mappings.replace_encodings(citation)
    # the citation must not include any &amp or & - because they would  be interpreted params in the url:
    citation = citation.replace("&amp;", "&")
    # citation = citation.replace("&", "and")
    
    # strip out any double spaces and replace them with single spaces:
    citation = re.sub(' {2,}', ' ', citation)
    # strip out any spaces at the beginning or end:
    clean_citation = citation.strip()
    # query api, but put citation string in quotes:
    # citation = f'"{citation}"'
    #encode the string so it works as a url query:
    citation = requests.utils.quote(clean_citation)
    crossref_api_url = CROSSREF_API_URL + citation + CROSSREF_FRIENDLY_MAIL

    # make request to api:
    try:
        crossref_api_request = session.get(
            crossref_api_url, timeout=20
        )
    except TimeoutError:
        print("Timeout!")
        return None
    else:
        # if the request was successful, get the json response:
        crossref_api_response = crossref_api_request.json()
        # print(crossref_api_response)
        # check if there is a doi in the response:
        try:
            doi = crossref_api_response["message"]["items"][0]["DOI"]
        except:
            return f"DOI not found for {citation}"
        else:
            # check if the title of the response matches the citation string (fuzzy match):
            try:
                title = crossref_api_response["message"]["items"][0]["title"][0]
            except:
                return f"Title not found for {citation}"
                title = None
            else:
                # use fuzzywuzzy:
                # check if the title matches the citation string:
                if fuzz.partial_ratio((title.lower()), clean_citation.lower()) < 50:
                    return f"Title mismatch: {title} != {clean_citation}", None
                else:
                    return clean_citation, doi
            return clean_citation, doi
    
for count ,citation in enumerate(rplics):
    print(f"{count+1}: {citation}")
    citation, doi = fetch_doi_from_citation_string(citation)
    print(f"{count+1}: {citation}: \nhttps://doi.org/{doi}")


# so, this works, except for cases we need to handle:
# - main field is not a citation string (or doi or url-doi), but a url (eg at osf.io)
# - "Testeintrag"
# - "Stumbrys, T., & Erlacher, D. (2014). The science of lucid dream induction. In R. Hurd, & K. Bulkeley (Eds.). Lucid dreaming: New perspectives on consciousness in sleep (pp. 77–96). Westport: Praeger.:"
#   why that is: this is a chapter that doesn't have a doi. Just a record in APA PsycNet.
# Conclusion: Catch any URLs and DOis (and the "Testeintrag") in the main field beforehand, save them as what they are.
# Only look up "real" citations in crossref. 

1: Testeintrag, wieder loeschen: 
https://doi.org/10.1002/chin.197104180
2: Title mismatch: OSF io  - Materi Jeff Webinar EIYRA 2 != https://osf.io/kv65n/: 
https://doi.org/None
3: Timmer, K., Calabria, M., Branzi, F. M., Baus, C., & Costa, A. (2018). On the reliability of switching costs across time and domains. Frontiers in Psychology, 9, 1032.: 
https://doi.org/10.3389/fpsyg.2018.01032
4: Title mismatch: Dream characters and the dream ego: An exploratory online study in lucid dreams. != Stumbrys, T., & Erlacher, D. (2014). The science of lucid dream induction. In R. Hurd, & K. Bulkeley (Eds.). Lucid dreaming: New perspectives on consciousness in sleep (pp. 77–96). Westport: Praeger.: 
https://doi.org/None
5: Mattler, U., & Fendrich, R. (2010). Consciousness mediated by neural transition states: How invisiblyrapid motions can become visible.Consciousness and Cognition,19, 172–185. https://doi.org/10.1016/j.concog.2009.12.015: 
https://doi.org/10.1016/j.concog.2009.12.015
6: Dreßing H

In [37]:
from rdflib import Graph, URIRef, Literal, Namespace, RDF, RDFS, BNode
from rdflib.namespace import SKOS, OWL, DCTERMS, XSD

# load the vocab as a graph:
vocab = Graph()
vocab.parse("terms_conversion/skosified_apa_thes_2023_02.ttl", format="turtle")

list_of_excluded_concepts = [ "https://w3id.org/zpid/vocabs/terms/00010", "https://w3id.org/zpid/vocabs/terms/00020" ]
# add a new triple:
for block_concept in list_of_excluded_concepts:
    for s,p,o in vocab.triples((URIRef(block_concept), RDF.type, SKOS.Concept)):
        vocab.set((s, OWL.deprecated, Literal("true", datatype=XSD.boolean)))
        vocab.set((s, SKOS.editorialNote, Literal("annif-blocklisted", datatype=XSD.string)))
vocab.serialize("reduced_testvoc.ttl", format="turtle")

<Graph identifier=N3727fc0dfdb8409ba168a777e2a90ea2 (<class 'rdflib.graph.Graph'>)>

In [3]:
import datetime
import dateparser

date = "20.10.2021"
date2 = "7 June 2021"

print(dateparser.parse(date).strftime("%Y-%m-%d"))
print(dateparser.parse(date2).strftime("%Y-%m-%d"))
print(dateparser.parse("2021").strftime("%Y"))  
print(dateparser.parse(": 23. September 2021").strftime("%Y-%m-%d"))

2021-10-20
2021-06-07
2021


AttributeError: 'NoneType' object has no attribute 'strftime'

In [9]:
abstracts= 'Healthy mental functions are essentially based on undisturbed synaptic processes in the brain. Drugs that can potentially affect synapse formation, neurotransmitter metabolism, or action potentials carry a risk of leading to psychiatric disorders. In particular, ligands of neurotransmitter receptors as well as hormones have a (b) high psychiatric side effect potential. (c) Thieme. All rights reserved.'

import re

def add_abstract_licensing_note(abstracttext):
    """Adds a licensing note to the abstract if it contains a copyright string and/or a "translated by DeepL" notice."""
    abstract_copyright_string = None
    # 1. first check if there is a "(translated by DeepL)" at the end of the abstract, remove it and add it to the licensing note.
    # 2 then check for a copyright string at the (new) end of the abstract. Remove it and copy it into the licensinf note - 
    # but only if there isn't already something in there (the translated by deepl note) - because if there is, the translation note takes precedence
    # and the copyright note will not be retained. 
    deepl_match = re.search(r"^(.*)\s\((translated by DeepL)\)$", abstracttext, re.IGNORECASE)
    if deepl_match:
        # replace the abstract with the content before the "(translated by DeepL)":
        abstracttext = deepl_match.group(1)
        # add it to the licensing note, but only if empty:
        abstract_copyright_string = deepl_match.group(2)
    else:
        abstract_copyright_string = None

    # also, after that, check the new abstract for a copyright string:
    license_match = re.search(r"(.*)(\(c\).*)$", abstracttext, re.IGNORECASE)
    # if that match is not None, check if it is in the last 100 characters of the abstract:
    if license_match and len(license_match.group(2)) < 100:
        # if so, check if there is a "(b)" anywhere in the abstract before the match (this is an exclusion criterion,
        # because if there is a "(b)" before the "(c)", it's just a lettered list item, not the copyright string):
        if re.search(r"(.*)(\(b\).*)", license_match.group(1), re.IGNORECASE):
            pass
            # if there is _no_ "(b)" before the "(c)", we have a copyright string; add it to the licensing note.
            # unless it already contains something - which will always be the translation note:
        else:
            if abstract_copyright_string is None or abstract_copyright_string == "":
                abstract_copyright_string = license_match.group(2)
                abstracttext = license_match.group(1)
            else:
                # don't write it into the note if there is already something in it, but do remove it from the abstract!
                abstracttext = license_match.group(1)
            # otherwise ignore the string, we have no copyright string
    
    print("neuer abstract: " + abstracttext)
    print("lizenzstring: " + str(abstract_copyright_string))

add_abstract_licensing_note(abstracts)


neuer abstract: Healthy mental functions are essentially based on undisturbed synaptic processes in the brain. Drugs that can potentially affect synapse formation, neurotransmitter metabolism, or action potentials carry a risk of leading to psychiatric disorders. In particular, ligands of neurotransmitter receptors as well as hormones have a (b) high psychiatric side effect potential. (c) Thieme. All rights reserved.
lizenzstring: None


# Deciding Genre

Fields to use:
- BE
- DT, DT2
- CM
- CF (for conference proceedings)?
- BN, BNDI
- DIDH

## Theses:

## cumulative thesis
- BN starts with "Kumulative" (14x in 556.xml)

### ThesisHabilitation
- BE = SM (oder SH?)
- DT = 01 (SM)
- und/oder DIDH = "Habil.Schr.
- und/oder BN = "Kumulative Habilitationsschrift"

### ThesisDoctoral: 
- BE = SH (15x) oder DT=61 (15x) oder DT2 = 61
- und/oder DIDH = "Diss."
- und/oder BN starts with = "Kumulative Dissertation"


Note for Roles and contributions:
<BN>Gesprächsführung: Vorname Nachname</BN> -> Contribution with role interviewer hinten anfügen, wenn es nicht schon eine gibt. Sonst: fehlende Rolle nachtragen bzw ändern von Autor zu IN



In [57]:
# to map any cm to a matching genre, do a search of the cm's code/notation in the genres vocid, and use the first hit. I used the same notations for genres that were taken from cm - so the api should find the genre that matches the cm.

# also, to map any empirical cm to "Research Paper", go by number of the code, maybe? Anything starting with 10 is empirical.
test_records = [
    { "dfk":"0227202", "be":"SM", "dt":"01", "dt2":"", "cm":"", "didh":"", "bn":"Kumulative Habilitationsschrift", "bndi":""},
    { "dfk":"0390734", "be":"SH", "dt":"61", "dt2":"", "cm":"","didh":"", "bn":"Kumulative Dissertation", "bndi":""},
    {"dfk": "0017943", "be":"SH","dt":"61","dt2":"01", "cm":"", "didh":"Diss.", "bn":"Kumulative Dissertation: (1) Pavel, F.-G. 1978. Die klientenzentrierte Psychotherapie. München: Pfeiffer; (2) Beitrag in: Spiel, W. (Ed.) 1980. Die Psychologie des 20. Jahrhunderts. Bd. 12. Zürich: Kindler. S. 844-864; (3) GwG-Info 1982, 47, 37-48; (4) GwG-Info 1983, 52, 7-27; (5) Zeitschrift f. Personenzentr. Psychol. u. Psychotherapie 1984, 3, 277-300", "bndi":""},
    {
        "dfk": "0017943","be":"SH", "cm":"", "didh":"Diss.", "dt":"61", "dt2":"01", "bn":"Schreibmaschinenfassung", "bndi":""
    },
      {
        "dfk": "0017943","be":"SH", "cm":"", "didh":"Diss.", "dt":"61", "dt2":"01", "bn":"Schreibmaschinenfassung", "bndi":""
    },
          {
        "dfk": "0017943","be":"SH", "cm":"", "didh":"Diss.", "dt":"61", "dt2":"01", "bn":"Schreibmaschinenfassung", "bndi":""
    }
    ]

def determine_genre(be, dt="", dt2="", cm="", didh="", bn="", bndi=""):
    
    genres = []
    # doctoral thesis:
    if be == "SH" or dt == "61" or dt2 == "61" or "Diss".casefold() in didh.casefold():
        genres.append("doctoral thesis")
    # habilitation thesis:
    if "habil".casefold() in didh.casefold() or "habilitationsschrift".casefold() in bn.casefold():
        genres.append("habilitation thesis")
    # cumulative thesis:
    if "kumulativ".casefold() in bn.casefold():
         genres.append("cumulative thesis")
            
    return genres

for record in test_records:
    print("Genres of the work: " 
      + str(determine_genre(record["be"], record["dt"], record["dt2"], record["cm"], record["didh"], record["bn"], record["bndi"])))


Genres of the work: ['habilitation thesis', 'cumulative thesis']
Genres of the work: ['doctoral thesis', 'cumulative thesis']
Genres of the work: ['doctoral thesis', 'cumulative thesis']
Genres of the work: ['doctoral thesis']
Genres of the work: ['doctoral thesis']


# Code to fix weird dates in field PY

https://docs.google.com/spreadsheets/d/1ESISF24A4QbXw8Vf7EAzRY1oaYYnkNVsYoz45J5Ri9Y/edit?gid=2136685722#gid=2136685722

The Goal is to always use the first year that is given, if there are several. That can be hard because the year formatting can differ wildly.

1993/94 -> 1993
1998/99
1999/2000 -> 1999
2000/2001
2003-2004
2005-2006
2001/2002
2005-2006 -> 2005
2014-2015 -> 2014
2005-06 -> 2005
2001-2002
1981-83 -> 1981
1981-83
1981-83
1981-83
1981-83
1981-83
1979/80
1982-83
1982-83
1982-83
1982-83
1982-83
1982-83
1982-83
1976/77
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1985/86
1985/86
1985/86
1985/86
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1986/87
1988/89
1988/89
1988/89
1988/89
1988/89
1988/89
1988/89
1988/89
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1989/90
1990/91
1990/91
1990/91


In [28]:
# list of PY years:
PY = [
    "1998/99",
    "1993/94",
    "1999/2000",
    "2000/2001",
    "2003-2004",
    "2005-2006",
    "2001/2002",
    "2005-2006",
    "2014-2015",
    "2005-06",
    "2001-2002",
    "1981-83",
    "1981-83",
    "1981-83",
    "1981-83",
    "1981-83",
    "1981-83",
    "1979/80",
    "1982-83",
    "1982-83",
    "1982-83",
    "1982-83",
    "1982-83",
    "1982-83",
    "1982-83",
    "1976/77",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1985/86",
    "1985/86",
    "1985/86",
    "1985/86",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1986/87",
    "1988/89",
    "1988/89",
    "1988/89",
    "1988/89",
    "1988/89",
    "1988/89",
    "1988/89",
    "1988/89",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1989/90",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1990/91",
    "1992/93",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1980-90",
    "1980-90",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1993/94",
    "1985/86",
    "1995/96",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1991/92",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1995/96",
    "1993/95",
    "1993/95",
    "1993/95",
    "1993/95",
    "1993/95",
    "1996/97",
    "1996/97",
    "1996/97",
    "1996/97",
    "1996/97",
    "1995/96",
    "1996/97",
    "1996/97",
    "1993/95",
    "1998/99",
    "1990/91",
    "1998/99",
    "1999/2000",
    "1999/2000",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2001/2002",
    "2001/2002",
    "2001/2002",
    "2001/2002",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "1949/50",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "2000/2001",
    "1998/99",
    "2007/2008",
    "1988-1989",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2009-2010",
    "2008-2009",
    "2008-2009",
    "2008-2009",
    "2008-2009",
    "2008-2009",
    "2008-2009",
    "2008-2009",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2005-2007",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2011-2012",
    "2002/03",
    "2005/2006",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2014-2015",
    "2015-2016",
    "2015-2016",
    "2015-2016",
    "2015-2016",
    "2015-2016",
    "2015-2016",
    "2015-2016",
    "2015-2016",
    "2005-2006",
    "2015-2016",
    "2016-2017",
    "2000/2001",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2020-2021",
    "2019-2020",
    "2019-2020",
    "2019-2020",
    "22","34", # maybe we can try to make a year out of things like "22"-> 2022? (so anything that is not four digits, but two digits? what makes 22 different from 34, which would be invalid?)
    "o.J.",
    "1671",
    "keine Angabe",
    "abcd","1111"
]
for year in PY:
    print(year + " becomes ", end="")
    try:
        # get the first 4 characters of the string and convert to int:
        # anything smaller than 1700 and larger than 2026 is not valid years:
        if int(year[:4]) < 1700 or int(year[:4]) > 2026:
                print("4 digits, but not a valid year")
        elif year[:4]:
            print(year[:4])
    except:
        print("error: can't interpret year, please correct")

1998/99 becomes 1998
1993/94 becomes 1993
1999/2000 becomes 1999
2000/2001 becomes 2000
2003-2004 becomes 2003
2005-2006 becomes 2005
2001/2002 becomes 2001
2005-2006 becomes 2005
2014-2015 becomes 2014
2005-06 becomes 2005
2001-2002 becomes 2001
1981-83 becomes 1981
1981-83 becomes 1981
1981-83 becomes 1981
1981-83 becomes 1981
1981-83 becomes 1981
1981-83 becomes 1981
1979/80 becomes 1979
1982-83 becomes 1982
1982-83 becomes 1982
1982-83 becomes 1982
1982-83 becomes 1982
1982-83 becomes 1982
1982-83 becomes 1982
1982-83 becomes 1982
1976/77 becomes 1976
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1985/86 becomes 1985
1985/86 becomes 1985
1985/86 becomes 1985
1985/86 becomes 1985
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 1986
1986/87 becomes 19

# code to convert finding aid for Adolf Würth archove collection urls to new ones
http://www.awz.uni-wuerzburg.de/fileadmin/42050000/user_upload/Findbuecher/Findbuch_HvB-final-Mai_2017.pdf -> 
https://www.uni-wuerzburg.de/fileadmin/42050000/Findbuecher/Findbuch_HvB-final-Mai_2017.pdf


In [54]:
import re
urls = [
    "http://www.awz.uni-wuerzburg.de/fileadmin/42050000/user_upload/Findbuecher/Findbuch_HvB-final-Mai_2017.pdf",
    "https://example.org",
    "http://www.awz.uni-wuerzburg.de/fileadmin/42050000/user_upload/Findbuecher/FA_Lothar_Spillmann_29042016.pdf", # here they also updated it, so the filename (since it is data-based) itself is broken and the new link will not work, either! But we can't catch that.
"http://www.awz.uni-wuerzburg.de/fileadmin/42050000/user_upload/Findbuecher/FA_Friedrich_Sander_expansion_2015-1.pdf"
    ]

wuerth_pattern= re.compile("http:\/\/\www\.awz.uni-wuerzburg.de\/fileadmin\/([0-9]{8})\/user_upload")
wuerth_replace = r"https://www.uni-wuerzburg.de/fileadmin/\1"

for url in urls:
    print(re.sub(wuerth_pattern, wuerth_replace, url, flags=0))
    

https://www.uni-wuerzburg.de/fileadmin/42050000/Findbuecher/Findbuch_HvB-final-Mai_2017.pdf
https://example.org
https://www.uni-wuerzburg.de/fileadmin/42050000/Findbuecher/FA_Lothar_Spillmann_29042016.pdf
https://www.uni-wuerzburg.de/fileadmin/42050000/Findbuecher/FA_Friedrich_Sander_expansion_2015-1.pdf
