## Create new manifestation measurements

The following code is used to create new statistics. They can be saved to a CSV file and be taken into account in the section above about existing measurements.

In [1]:
import pandas as pd
import utils
import utils_stats
from datetime import datetime

In [41]:
# Load the integrated manifestation export, indexed by the target identifier.
corpus = pd.read_csv(
    './2022-07-20_kbr/csv/integrated-data-enriched.csv',
    index_col='targetIdentifier',
)

# Split the corpus by translation direction.
corpusNLFR = corpus.loc[
    lambda df: df['sourceLanguage'].eq('Dutch') & df['targetLanguage'].eq('French')
]
corpusFRNL = corpus.loc[
    lambda df: df['sourceLanguage'].eq('French') & df['targetLanguage'].eq('Dutch')
]

# "Other" = rows whose source language OR target language is neither Dutch nor
# French (missing languages count as "neither").
# NOTE(review): rows with Dutch->Dutch or French->French match none of the three
# subsets, so the three counts need not add up to len(corpus) -- confirm this is intended.
corpusOther = corpus.loc[
    lambda df: ~(
        df['sourceLanguage'].isin(['Dutch', 'French'])
        & df['targetLanguage'].isin(['Dutch', 'French'])
    )
]

In [42]:
# Free-text note stored with every measurement row of this run.
comment = "Adaptations to contributor integration SPARQL queries"

# One measurement record per corpus subset, computed in the order
# FR-NL, NL-FR, OTHER and collected into a single table.
measurements = pd.DataFrame([
    utils_stats.createCorpusMeasurements(subset, label, comment)
    for subset, label in (
        (corpusFRNL, 'FR-NL'),
        (corpusNLFR, 'NL-FR'),
        (corpusOther, 'OTHER'),
    )
])

In [43]:
# Replace missing values with empty strings so that "has a value" can be
# checked with a plain comparison against '' (used for the sanity check below).
convertedFRNL = corpusFRNL.fillna('')

In [44]:
# Sanity check: number of FR-NL rows with a non-empty targetISBN13.
# This should equal the withTargetISBN13 value of the FR-NL row in `measurements`.
(convertedFRNL['targetISBN13'].values != '').sum()

7405

In [45]:
# Show the measurement table (one row per corpus subset) for a visual check before saving.
measurements

Unnamed: 0,date,corpus,numberTranslations,withTargetISBN10,withTargetISBN13,withKBRIdentifier,withBnFIdentifier,withKBIdentifier,withKBRBnFAndKBIdentifier,withKBRAndBnFIdentifier,withKBRAndKBIdentifier,withBnFAndKBIdentifier,withBBThesaurusID,withSourceKBRIdentifier,withKBRSourceTitle,withKBSourceTitle,withSourceISBN10,withSourceISBN13,comment
0,2022-07-20 18:27:09.050199,FR-NL,8254,7400,7405,7524,51,2082,4,13,1390,4,7332,149,2575,2082,99,117,Adaptations to contributor integration SPARQL ...
1,2022-07-20 18:27:09.393980,NL-FR,4009,3421,3445,3532,1209,735,210,933,497,247,2936,193,1197,735,171,173,Adaptations to contributor integration SPARQL ...
2,2022-07-20 18:27:09.499835,OTHER,315,234,234,309,77,11,2,71,10,3,273,4,12,11,2,2,Adaptations to contributor integration SPARQL ...


In [46]:
# Persist the measurements; index=False leaves the DataFrame's row index out of the file.
measurements.to_csv('./measurements/2022-07-20-translation-stats.csv', index=False)

## Create new contributor measurements

In [18]:
# Load the person contributors export, indexed by contributorID.
# NOTE(review): this reads from './2022-07-20_3_kbr/' while the manifestation corpus
# above is read from './2022-07-20_kbr/' -- confirm both folders are the intended ones.
personContributors = pd.read_csv('./2022-07-20_3_kbr/csv/contributors-persons.csv', index_col='contributorID')

In [19]:
# Free-text note stored with this contributor measurement.
personComment = "Interlink manually enriched Wikidata dump"

# Single-row table with the statistics of the person contributors.
# The hard-coded '2022-07-20 12:00' shows up in the `date` column of the result,
# whereas `measurementTime` holds the moment the statistics were computed;
# update the literal when measuring a different export.
personContributorsMeasurements = pd.DataFrame(
    [
        utils_stats.createContributorCorpusMeasurements(
            personContributors,
            '2022-07-20 12:00',
            personComment,
        ),
    ]
)

In [21]:
# Show the single-row contributor measurement table for a visual check before saving.
personContributorsMeasurements

Unnamed: 0,date,measurementTime,numberContributors,withKBRIdentifier,withBnFIdentifier,withKBIdentifier,withKBRBnFAndKBIdentifier,withKBRAndBnFIdentifier,withKBRAndKBIdentifier,withBnFAndKBIdentifier,...,withMultipleKBRIdentifiers,withMultipleBnFIdentifiers,withMultipleNTAIdentifiers,withMultipleISNIIdentifiers,withMultipleVIAFIdentifiers,withMultipleWikidataIdentifiers,withMultipleBirthDates,withMultipleDeathDates,withMultipleNationalities,comment
0,2022-07-20 12:00,2022-07-20 18:09:30.167726,6310,5734,1247,1124,251,900,863,283,...,36,10,4,90,195,2,50,11,126,Interlink manually enriched Wikidata dump


In [22]:
# Persist the contributor measurements; index=False leaves the DataFrame's row index out of the file.
personContributorsMeasurements.to_csv('./measurements/2022-07-20_3_person-contributor-stats.csv', index=False)

## Redo manifestation measurements

In [9]:
# Configuration for re-computing the manifestation measurements of past exports.
# Every entry maps a date string to [same date string, comment]; because the key
# is always repeated as the first list element, the mapping is generated from
# (date, comment) pairs. Insertion order is chronological.
# The comment texts are handed to utils_stats unchanged, so they are kept
# verbatim (including existing spelling).
manifestationFilesConfig = {
    date: [date, comment]
    for date, comment in (
        ('2022-05-03', 'Data integrated from KBR, BnF and KB based on ISBN10 and ISBN13 identifiers.'),
        ('2022-05-09', 'Data integrated from KBR, BnF and KB based on ISBN10 and ISBN13 identifiers. Additionally KBR original title from field 246.'),
        ('2022-05-23', 'No changes to previous version.'),
        ('2022-06-07', 'Added publishing directors and included a collumn for source titles from KB'),
        ('2022-06-16', 'Removed duplicate manifestations with the same ISBN10/ISBN13'),
        ('2022-06-20', 'New KBR data dump with plenty of additions and refinements and ISBN fix for FR-NL'),
        ('2022-06-23', 'No ISBN duplicate removal anymore to prevent data loss'),
        ('2022-06-24', 'New KBR dump with old Dutch, old French and middle French as well as new data in general'),
        ('2022-07-18', 'Dataprofile query filtering also for Belgian organizations, not just Belgian persons (KB data was also updated from SPARQL endpoint, but did not cause more translations)'),
        ('2022-07-20', 'Fixed error in ISNI-SRU integration SPARQL query.'),
        ('2022-07-21', 'Manually curated Wikidata overlap used as bridge identifeir with adapted integration SPARQL queries.'),
    )
}

In [10]:
# Re-run the manifestation measurements for the exports listed in manifestationFilesConfig.
# NOTE(review): presumably 'integrated-data-enriched.csv' is the input file looked up per
# export and 'translation-stats' the suffix of the written stats files -- confirm in utils_stats.
utils_stats.redoManifestationsCorpusMeasurements(manifestationFilesConfig, 'integrated-data-enriched.csv', 'translation-stats') 

## Redo contributor measurements

In [11]:
# Configuration for re-computing the contributor measurements of past exports.
# Every entry maps a date string to [same date string, comment]; because the key
# is always repeated as the first list element, the mapping is generated from
# (date, comment) pairs. Insertion order is chronological.
# The comment texts are handed to utils_stats unchanged, so they are kept
# verbatim (including existing spelling).
contributorFilesConfig = {
    date: [date, comment]
    for date, comment in (
        ('2022-05-03', 'Contributors integrated from ISNI SRU dump, KBR, BnF and NTA based on ISNI, VIAF and Wikidata identifiers.'),
        ('2022-05-09', 'Contributors integrated from ISNI SRU dump, KBR, BnF and NTA based on ISNI, VIAF and Wikidata identifiers.'),
        ('2022-05-23', 'Only count persons who are actually contributing to manifestations of the corpus (author, translator, illustrator, scenarist, publishing director)'),
        ('2022-06-07', 'Added columns with statistics about how many books a contributor authored, translated etc.'),
        ('2022-06-16', 'Added columns with statistics about how many books a contributor authored, translated etc.'),
        ('2022-06-20', 'New KBR data dump with plenty of additions and refinements and ISBN fix for FR-NL.'),
        ('2022-06-23', 'No ISBN duplicate removal anymore to prevent data loss'),
        ('2022-06-24', 'New KBR dump with old Dutch, old French and middle French as well as new data in general'),
        ('2022-07-18', 'Dataprofile query filtering also for Belgian organizations, not just Belgian persons (KB data was also updated from SPARQL endpoint, but did not cause more translations)'),
        ('2022-07-20', 'Fixed error in ISNI-SRU integration SPARQL query.'),
        ('2022-07-21', 'Manually curated Wikidata overlap used as bridge identifeir with adapted integration SPARQL queries.'),
    )
}

In [12]:
# Re-run the contributor measurements for the exports listed in contributorFilesConfig.
# NOTE(review): presumably 'contributors-persons.csv' is the input file looked up per
# export and 'person-contributor-stats' the suffix of the written stats files -- confirm in utils_stats.
utils_stats.redoContributorCorpusMeasurements(contributorFilesConfig, 'contributors-persons.csv', 'person-contributor-stats') 