## Create new manifestation measurements

The following code is used to create new statistics. They can be saved to a CSV file and be taken into account in the section above about existing measurements.

In [3]:
import pandas as pd
import utils
import utils_stats
from datetime import datetime

In [2]:
corpus = pd.read_csv('./2022-07-18_kbr/csv/integrated-data-enriched.csv', index_col='targetIdentifier')
corpusNLFR = corpus[(corpus['sourceLanguage'] == 'Dutch') & (corpus['targetLanguage'] == 'French') ]
corpusFRNL = corpus[(corpus['sourceLanguage'] == 'French') & (corpus['targetLanguage'] == 'Dutch') ]
corpusOther = corpus[ ((corpus['sourceLanguage'] != 'Dutch') & (corpus['sourceLanguage'] != 'French')) | ((corpus['targetLanguage'] != 'Dutch') & (corpus['targetLanguage'] != 'French'))]

In [3]:
comment = "Dataprofile query filtering also for Belgian organizations, not just Belgian persons (KB data was also updated from SPARQL endpoint, but did not cause more translations)"
measurements = pd.DataFrame([
  utils_stats.createCorpusMeasurements(corpusFRNL, 'FR-NL', comment),
  utils_stats.createCorpusMeasurements(corpusNLFR, 'NL-FR', comment),
  utils_stats.createCorpusMeasurements(corpusOther, 'OTHER', comment)
])

In [4]:
convertedFRNL = corpusFRNL.fillna('')

In [5]:
(convertedFRNL['targetISBN13'].values != '').sum()

7405

In [6]:
measurements

Unnamed: 0,date,corpus,numberTranslations,withTargetISBN10,withTargetISBN13,withKBRIdentifier,withBnFIdentifier,withKBIdentifier,withKBRBnFAndKBIdentifier,withKBRAndBnFIdentifier,withKBRAndKBIdentifier,withBnFAndKBIdentifier,withBBThesaurusID,withSourceKBRIdentifier,withKBRSourceTitle,withKBSourceTitle,withSourceISBN10,withSourceISBN13,comment
0,2022-07-18 18:11:53.272871,FR-NL,8254,7400,7405,7524,51,2082,4,13,1390,4,7332,149,2575,2082,99,117,Dataprofile query filtering also for Belgian o...
1,2022-07-18 18:11:53.610983,NL-FR,4009,3421,3445,3532,1209,735,210,933,497,247,2936,193,1197,735,171,173,Dataprofile query filtering also for Belgian o...
2,2022-07-18 18:11:53.681965,OTHER,315,234,234,309,77,11,2,71,10,3,273,4,12,11,2,2,Dataprofile query filtering also for Belgian o...


In [7]:
measurements.to_csv('./measurements/2022-07-18-translation-stats.csv', index=False)

## Create new contributor measurements

In [26]:
personContributors = pd.read_csv('./2022-06-20_with-duplicate-removing/csv/contributors-persons.csv', index_col='contributorID')

In [27]:
personComment = "New KBR data dump with plenty of additions and refinements and ISBN fix for FR-NL."

personContributorsMeasurements = pd.DataFrame([
    utils_stats.createContributorCorpusMeasurements(personContributors, personComment)
])

In [28]:
personContributorsMeasurements

Unnamed: 0,date,numberContributors,withKBRIdentifier,withBnFIdentifier,withKBIdentifier,withKBRBnFAndKBIdentifier,withKBRAndBnFIdentifier,withKBRAndKBIdentifier,withBnFAndKBIdentifier,withKBRAndISNIIdentifier,...,withMultipleKBRIdentifiers,withMultipleBnFIdentifiers,withMultipleNTAIdentifiers,withMultipleISNIIdentifiers,withMultipleVIAFIdentifiers,withMultipleWikidataIdentifiers,withMultipleBirthDates,withMultipleDeathDates,withMultipleNationalities,comment
0,2022-07-19 10:55:01.789360,5843,5214,1231,1063,232,853,774,270,3987,...,27,8,4,9,23,0,44,10,105,New KBR data dump with plenty of additions and...


In [29]:
personContributorsMeasurements.to_csv('./measurements/2022-06-20-person-contributor-stats1.csv', index=False)

## Redo contributor measurements

In [1]:
files = {
    '2022-05-03_kbr': ['2022-05-03', 'Contributors integrated from ISNI SRU dump, KBR, BnF and NTA based on ISNI, VIAF and Wikidata identifiers.'],
    '2022-05-09_kbr': ['2022-05-09', 'Contributors integrated from ISNI SRU dump, KBR, BnF and NTA based on ISNI, VIAF and Wikidata identifiers.'],
    '2022-05-23_kbr': ['2022-05-23', 'Only count persons who are actually contributing to manifestations of the corpus (author, translator, illustrator, scenarist, publishing director)'],
    '2022-06-07_kbr': ['2022-06-07', 'Added columns with statistics about how many books a contributor authored, translated etc.'],
    '2022-06-16_after-duplicate-removing': ['2022-06-16', 'Added columns with statistics about how many books a contributor authored, translated etc.'],
    '2022-06-20_with-duplicate-removing': ['2022-06-20', 'New KBR data dump with plenty of additions and refinements and ISBN fix for FR-NL.'],
    '2022-06-23_kbr': ['2022-06-23', 'No ISBN duplicate removal anymore to prevent data loss'],
    '2022-06-24_kbr': ['2022-06-24', 'New KBR dump with old Dutch, old French and middle French as well as new data in general'],
    '2022-07-18_kbr': ['2022-07-18', 'Dataprofile query filtering also for Belgian organizations, not just Belgian persons (KB data was also updated from SPARQL endpoint, but did not cause more translations)']
}

In [4]:
for corpusVersion in files:
    contributorFile = f'./{corpusVersion}/csv/contributors-persons.csv'
    corpusVersionDate = files[corpusVersion][0]
    corpusVersionComment = files[corpusVersion][1]
    outputFile = f'./measurements/{corpusVersionDate}_person-contributor-stats.csv'
    contributors = pd.read_csv(contributorFile, index_col='contributorID')
    measurements = pd.DataFrame([
        utils_stats.createContributorCorpusMeasurements(contributors, corpusVersionDate, corpusVersionComment)
    ])
    measurements.to_csv(outputFile, index=False)