## Create new manifestation measurements

The following code computes a new set of statistics from the current data. The results are saved to a CSV file so that they can be taken into account in the section above about existing measurements.

In [1]:
import pandas as pd
import utils
from datetime import datetime

In [2]:
# Load the integrated, enriched translation data, indexed by the `targetIdentifier` column.
corpus = pd.read_csv(
    './2022-06-23_kbr/csv/integrated-data-enriched.csv',
    index_col='targetIdentifier',
)

# Split the corpus by translation direction.
corpusNLFR = corpus.loc[
    corpus['sourceLanguage'].eq('Dutch') & corpus['targetLanguage'].eq('French')
]
corpusFRNL = corpus.loc[
    corpus['sourceLanguage'].eq('French') & corpus['targetLanguage'].eq('Dutch')
]

# "Other": rows where the source or the target language is neither Dutch nor French
# (rows with a missing language value also end up here).
# NOTE(review): rows with the same language on both sides (Dutch->Dutch, French->French)
# match none of the three subsets -- confirm this is intended.
corpusOther = corpus.loc[
    ~corpus['sourceLanguage'].isin(['Dutch', 'French'])
    | ~corpus['targetLanguage'].isin(['Dutch', 'French'])
]

In [3]:
# Free-text note stored alongside every measurement row of this run.
comment = "No ISBN duplicate removal anymore to prevent data loss"

# One measurement row per corpus subset, computed in the order FR-NL, NL-FR, OTHER.
measurements = pd.DataFrame([
    utils.createCorpusMeasurements(subset, label, comment)
    for subset, label in (
        (corpusFRNL, 'FR-NL'),
        (corpusNLFR, 'NL-FR'),
        (corpusOther, 'OTHER'),
    )
])

In [4]:
# Copy of the FR-NL subset in which missing values are replaced by empty strings,
# so that absent fields can be detected by comparing against ''.
convertedFRNL = corpusFRNL.fillna(value='')

In [5]:
# Number of FR-NL rows whose `targetISBN13` is not the empty string.
convertedFRNL['targetISBN13'].ne('').sum()

7065

In [6]:
# Display the translation measurements table: one row per corpus subset (FR-NL, NL-FR, OTHER).
measurements

Unnamed: 0,date,corpus,numberTranslations,withTargetISBN10,withTargetISBN13,withKBRIdentifier,withBnFIdentifier,withKBIdentifier,withKBRBnFAndKBIdentifier,withKBRAndBnFIdentifier,withKBRAndKBIdentifier,withBnFAndKBIdentifier,withBBThesaurusID,withSourceKBRIdentifier,withKBRSourceTitle,withKBSourceTitle,withSourceISBN10,withSourceISBN13,comment
0,2022-06-24 14:30:52.262214,FR-NL,7752,7061,7065,7040,52,1983,4,13,1310,4,6881,131,2489,1983,96,114,No ISBN duplicate removal anymore to prevent d...
1,2022-06-24 14:30:52.534531,NL-FR,3897,3343,3367,3420,1207,728,208,928,491,247,2847,190,1185,728,169,171,No ISBN duplicate removal anymore to prevent d...
2,2022-06-24 14:30:52.791451,OTHER,284,230,230,278,77,11,2,71,10,3,243,3,11,11,2,2,No ISBN duplicate removal anymore to prevent d...


In [7]:
# Write the translation measurements to a CSV file, leaving out the DataFrame index.
measurements.to_csv(
    path_or_buf='2022-06-23-translation-stats.csv',
    index=False,
)

## Create new contributor measurements

In [8]:
# Load the person contributors, indexed by the `contributorID` column.
personContributors = pd.read_csv(
    './2022-06-23_kbr/csv/contributors-persons.csv',
    index_col='contributorID',
)

In [9]:
# Free-text note stored alongside the contributor measurement row of this run.
personComment = "No ISBN duplicate removal anymore to prevent data loss"

# Single-row table holding the measurements computed over all person contributors.
personContributorsMeasurements = pd.DataFrame(
    data=[utils.createContributorCorpusMeasurements(personContributors, personComment)]
)

In [10]:
# Display the person-contributor measurements (a single-row table).
personContributorsMeasurements

Unnamed: 0,date,numberContributors,withKBRIdentifier,withBnFIdentifier,withKBIdentifier,withKBRBnFAndKBIdentifier,withKBRAndBnFIdentifier,withKBRAndKBIdentifier,withBnFAndKBIdentifier,withISNIIdentifier,...,withMultipleKBRIdentifiers,withMultipleBnFIdentifiers,withMultipleNTAIdentifiers,withMultipleISNIIdentifiers,withMultipleVIAFIdentifiers,withMultipleWikidataIdentifiers,withMultipleBirthDates,withMultipleDeathDates,withMultipleNationalities,comment
0,2022-06-24 14:30:53.051889,5901,5272,1232,1067,233,854,778,271,4527,...,27,8,4,9,23,0,44,10,105,No ISBN duplicate removal anymore to prevent d...


In [11]:
# Write the person-contributor measurements to a CSV file, leaving out the DataFrame index.
personContributorsMeasurements.to_csv(
    path_or_buf='2022-06-23-person-contributor-stats.csv',
    index=False,
)