# Cantus Corpus Analysis


## Antiphons

In [2]:
from src.utils.loader import prepare_dataset
from src.utils.data_analyzis import analyze_chants
X, y = prepare_dataset()
analyze_chants(X, y)

-------------------------------- Dataset Analyzis --------------------------------
	 Number of chants: 13865
	 Average chant length: 59.51265777136675
	 Maximal chant length: 683
	 Minimal chant length: 3
	 Mode distribution {'8': 3933, '6': 530, '2': 1005, '1': 3406, '3': 950, '4': 1372, '7': 2137, '5': 532}
----------------------------------------------------------------------------------


## Antiphons without differentia


In [3]:
from src.utils.loader import prepare_dataset
from src.utils.data_analyzis import analyze_chants
X, y = prepare_dataset()
analyze_chants(X, y)

-------------------------------- Dataset Analyzis --------------------------------
	 Number of chants: 13551
	 Average chant length: 53.97254815142794
	 Maximal chant length: 676
	 Minimal chant length: 3
	 Mode distribution {'8': 3832, '6': 513, '2': 979, '1': 3348, '3': 939, '4': 1327, '7': 2091, '5': 522}
----------------------------------------------------------------------------------


## Responsories

In [4]:
from src.utils.loader import prepare_dataset
from src.utils.data_analyzis import analyze_chants
X, y = prepare_dataset()
analyze_chants(X, y)

-------------------------------- Dataset Analyzis --------------------------------
	 Number of chants: 7031
	 Average chant length: 137.51543165979234
	 Maximal chant length: 364
	 Minimal chant length: 5
	 Mode distribution {'8': 1472, '2': 904, '7': 1307, '1': 1258, '6': 227, '5': 443, '3': 679, '4': 741}
----------------------------------------------------------------------------------


# Obsolete CantusCorpus analysis

In [None]:
!pip install music21

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from music21 import *
import os
import pandas as pd
import numpy as np

In [None]:
DATASET_DIR = "./dataset/csv"
CSV_FILE = "chant.csv"

In [None]:
def load_chants(dataset_dir: str = DATASET_DIR, csv_file: str = CSV_FILE):
    chants = pd.read_csv(os.path.join(dataset_dir, csv_file), index_col='id')
    return chants



In [None]:
chants = load_chants()

  """Entry point for launching an IPython kernel.


In [None]:
def get_unique_count(chants, column: str = 'id'):
  if column == 'id':
    return len(chants)
  else:
    return len(chants[column].unique().tolist())

def get_value_frequencies(chants, column: str = "incipit", normalize: bool = True, head: any = None):
  if head == None:
    return chants[column].value_counts(normalize = normalize)
  else:
    return chants[column].value_counts(normalize = normalize).head(head)

def get_counts_of_nans(chants, column: str = 'full_text', min_length: int = 10):
  nan_count = 0
  rest_count = 0
  for chant_value in chants[column]:
    if (type(chant_value) is float and np.isnan(chant_value)) or ((type(chant_value) is str) and len(chant_value) < min_length):
      nan_count += 1
    else:
      rest_count += 1
  return nan_count, rest_count
    
def get_length_statistics(chants, column: str = 'full_text', min_length: int = 10):
  lengths = []
  for chant_value in chants[column]:
    if (not (type(chant_value) is float and np.isnan(chant_value))) and ((type(chant_value) is str) and len(chant_value) >= min_length):
      lengths.append(len(chant_value))
  average = np.average(lengths)
  max = np.max(lengths)
  min = np.min(lengths)
  return average, max, min


In [None]:
def print_basic_analyzis(chants, column, head, min_length_not_nan):
  print("------------------------------------ Basic Data Analyzis ------------------------------------")
  unique_counts = get_unique_count(chants, column)
  nan_count, rest_count = get_counts_of_nans(chants, column, min_length_not_nan)
  value_frequencies = get_value_frequencies(chants, column, True, head)
  print("   Unique values:    {}".format(unique_counts))
  print()
  print("   Empty values:     {}".format(nan_count))
  print("   Filled values:    {}".format(rest_count))
  print()
  print("   Value frequencies of first {} values:".format(head))
  print(value_frequencies)
  print("---------------------------------------------------------------------------------------------")


def print_extended_analyzis(chants, column, min_length_not_nan):
  print("---------------------------------- Extended Data Analyzis -----------------------------------")
  average, max, min = get_length_statistics(chants, column = column, min_length = min_length_not_nan)
  print("   Average value length:     {}".format(average))
  print("   Maximal value length:     {}".format(max))
  print("   Minimal value length:     {}".format(min))
  print("---------------------------------------------------------------------------------------------")

## ID - Basic Dataset Information

In [None]:
column_id = 'id'
print("Total number of chants: {}".format(get_unique_count(chants, column_id)))

Total number of chants: 497071


## Incipit

In [None]:
column_id = 'incipit'
head = 20
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    59035

   Empty values:     16
   Filled values:    497055

   Value frequencies of first 20 values:
Gloria patri et filio et         0.011405
Gloria*                          0.004231
Gloria patri*                    0.001929
In omnem terram*                 0.001758
Gloria et honore*                0.001656
Specie tua*                      0.001567
Alleluia*                        0.001467
In omnem terram exivit sonus     0.001191
Alleluia iii                     0.001109
Ave Maria gratia plena           0.001050
Justum deduxit*                  0.001040
Diffusa est gratia in labiis     0.001040
Specie tua et pulchritudine      0.001022
Venite exsultemus domino         0.001022
Tecum principium*                0.001016
Benedicta tu in mulieribus et    0.001010
Posuisti domine*                 0.000966
Kyrie eleison*                   0.000966
Ave Maria*                  

## Cantus ID

These 6-digit numbers (plus suffixes) have been created by Cantus in order that the large repertory of chants contained in medieval manuscripts can be easily managed and searched. Refer to the main catalogue of chants, Cantus Index, for chant texts and their corresponding Cantus ID Numbers, which are used in all connected databases in the Cantus Index network. For details about these numbers and their assignment in the database, see this page (under the "About" menu).

In [None]:
column_id = 'cantus_id'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    31501

   Empty values:     10244
   Filled values:    486827

   Value frequencies of first 5 values:
909000    0.018247
008097    0.001865
008081    0.001781
001328    0.001725
909030    0.001635
Name: cantus_id, dtype: float64
---------------------------------------------------------------------------------------------


## Mode Statistic - something as Key, there should be only 8 of them which are not

In [None]:
column_id = 'mode'
head = 25
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    142

   Empty values:     38844
   Filled values:    458227

   Value frequencies of first 25 values:
*     0.230755
8     0.151012
1     0.126099
7     0.100153
4     0.072121
2     0.064346
3     0.045866
?     0.044703
r     0.040661
5     0.031179
6     0.027975
6T    0.008072
4T    0.007219
1S    0.006486
2T    0.004177
1T    0.003479
8S    0.002977
2S    0.002632
3S    0.002444
6S    0.002315
5S    0.002027
7S    0.001971
8*    0.001680
4S    0.001656
1*    0.001504
Name: mode, dtype: float64
---------------------------------------------------------------------------------------------


## Finalis - final note

In [None]:
column_id = 'finalis'
head = 25
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    35

   Empty values:     488152
   Filled values:    8919

   Value frequencies of first 25 values:
G              0.305752
D              0.259782
E              0.135778
F              0.121426
A              0.079157
C              0.041933
B              0.024891
g              0.007512
d              0.006839
a              0.003139
f              0.002242
e              0.002018
fa             0.001345
*              0.001233
mi             0.001233
re             0.001009
c              0.000785
la             0.000673
sol            0.000561
D?             0.000336
?              0.000336
h              0.000336
C?             0.000224
d (cofinal)    0.000224
H              0.000224
Name: finalis, dtype: float64
---------------------------------------------------------------------------------------------


## Differentia

This one- or two-digit number, or numbers and letters in combination, refers either to the differentia (the termination of the psalm tone to be employed in connection with a particular antiphon) or to the tone to be employed with an invitatory antiphon.

In [None]:
column_id = 'differentia'
head = 20
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    681

   Empty values:     319772
   Filled values:    177299

   Value frequencies of first 20 values:
*      0.155630
1      0.139662
G1     0.120756
A1     0.052713
2      0.047620
D1     0.043931
F1     0.031433
C1     0.026712
E1     0.026029
?      0.024744
3      0.023666
G2     0.021207
5      0.014258
w      0.013689
4      0.012600
a      0.010654
G      0.009487
A2     0.009171
D2     0.008178
G01    0.007778
Name: differentia, dtype: float64
---------------------------------------------------------------------------------------------


## Siglum

Some kind of ID to specific chapter of some source.

In [None]:
column_id = 'siglum'
head = 20
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    339

   Empty values:     0
   Filled values:    497071

   Value frequencies of first 20 values:
F-AS 893              0.016891
GB-WO F.160           0.016314
GB-WO F.160 (Facs)    0.015346
F-Pn lat. 1085        0.013732
F-CA 38               0.012608
NL-Uu 406             0.012588
PL-KIk 1              0.012566
D-AAm G 20            0.012545
I-Rv C.5              0.012501
A-LIs 290             0.012493
F-VAL 114             0.012411
TR-Itks 42            0.012191
PL-WRu R 503          0.012167
A-VOR 287             0.011878
E-Tc 44.2             0.011719
MA Impr. 1537         0.011670
D-KA Aug. LX          0.011666
CH-SGs 388            0.011487
F-Pn lat. 12044       0.011447
A-Wn 1890             0.011202
Name: siglum, dtype: float64
---------------------------------------------------------------------------------------------


## Position

Identifies the liturgical role of a particular chant according to Gregorian chant's system.

*ToDo: Do better statistic mapping this column to for instance genre column etc...*

In [None]:
column_id = 'position'
head = 30
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    456

   Empty values:     104459
   Filled values:    392612

   Value frequencies of first 30 values:
01     0.224374
1      0.066857
2      0.055579
M      0.053544
3      0.050210
4      0.041247
B      0.040271
5      0.034984
1.1    0.031769
1.2    0.030045
1.3    0.029795
02     0.029039
2.1    0.026056
2.3    0.025262
2.2    0.025236
3.1    0.022521
3.2    0.022091
3.3    0.021579
1.     0.016293
3.     0.015438
2.     0.012450
R      0.011268
1.4    0.008421
6      0.007649
2.4    0.007542
7      0.006143
8      0.005527
10     0.004972
3.4    0.004959
9      0.004778
Name: position, dtype: float64
---------------------------------------------------------------------------------------------


## Folio

Folio or page in the manuscript where the chant is found.

In [None]:
column_id = 'folio'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    2573

   Empty values:     35
   Filled values:    497036

   Value frequencies of first 5 values:
055r    0.002312
077r    0.002282
035v    0.002271
026r    0.002257
107r    0.002255
Name: folio, dtype: float64
---------------------------------------------------------------------------------------------


## Sequence

Order in which the chant is found on the page or folio side.

In [None]:
column_id = 'sequence'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    115

   Empty values:     37
   Filled values:    497034

   Value frequencies of first 5 values:
1.0    0.148966
2.0    0.130146
3.0    0.111401
4.0    0.098162
5.0    0.083936
Name: sequence, dtype: float64
---------------------------------------------------------------------------------------------


## Marginalia

Additional clarification of the location of the chant on the folio or page.

In [None]:
column_id = 'marginalia'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    73

   Empty values:     483040
   Filled values:    14031

   Value frequencies of first 5 values:
4    0.200200
7    0.147459
B    0.119307
A    0.115316
:    0.064144
Name: marginalia, dtype: float64
---------------------------------------------------------------------------------------------


## CAO Concordances

Corpus Antiphonalium Officii Concordances

In [None]:
column_id = 'cao_concordances'
head = 10
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    1114

   Empty values:     232283
   Filled values:    264788

   Value frequencies of first 10 values:
CGBEMVHRDFSL    0.198683
C BEMVHRDFSL    0.067394
C  EMVHRDFSL    0.030685
GBEMVHRDFSL     0.024193
SL              0.020076
CGBEMVHRDFS     0.015008
L               0.014989
R               0.012723
C BEMVHRDFS     0.012527
C BEMVHRD SL    0.011572
Name: cao_concordances, dtype: float64
---------------------------------------------------------------------------------------------


## Feasts

Feast - holidays/special days ...
Name, description, date, month, day, feast_code and notes could be found in *feast.csv* file that could be mapped with these IDs.

In [None]:
column_id = 'feast_id'
head = 10
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    1363

   Empty values:     8374
   Filled values:    488697

   Value frequencies of first 10 values:
feast_1202    0.017690
feast_0227    0.014340
feast_0552    0.013663
feast_0198    0.013153
feast_0500    0.012587
feast_1416    0.012396
feast_0093    0.011868
feast_0258    0.011850
feast_0475    0.011727
feast_0933    0.011703
Name: feast_id, dtype: float64
---------------------------------------------------------------------------------------------


## Genres

Generes of Masses and Officces - detailed description could be found in file *genre.csv* that could be mapped to these IDs.

**genre_a**: Antiphon - Office

**genre_r**: Responsory - Office

**genre_v**: Responsory verse - Office

**genre_h**: Hymn - Office

**genre_i**: Invitatory antiphon - Office

**genre_av**: Antiphon verse - Office

**genre_gr**: Gradual - Mass

**genre_al**: Alleluia - Mass

**genre_ps**: Psalm - Office

In [None]:
column_id = 'genre_id'
head = 10
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    47

   Empty values:     3969
   Filled values:    493102

   Value frequencies of first 10 values:
genre_a     0.416565
genre_r     0.207752
genre_v     0.191607
genre_w     0.072184
genre_h     0.036179
genre_i     0.020454
genre_av    0.007110
genre_gr    0.004889
genre_al    0.004831
genre_ps    0.004766
Name: genre_id, dtype: float64
---------------------------------------------------------------------------------------------


## Offices

IDs of offices, detailed described in *office.csv*.

**office_m**: Matins

**office_l**: Lauds

**office_v2**: Second Vespers

**office_v**: First Vespers

**office_mi**: Mass

In [None]:
column_id = 'office_id'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    17

   Empty values:     2117
   Filled values:    494954

   Value frequencies of first 5 values:
office_m     0.511439
office_l     0.156061
office_v2    0.081391
office_v     0.074284
office_mi    0.038167
Name: office_id, dtype: float64
---------------------------------------------------------------------------------------------


## Sources
IDs of sources, more described in *source.csv*.

**source_309**: Arras, Bibliothèque municipale, 893 (olim 465),Ff. 7-12: Kalendar of Arras.

**source_390**: Worcester, Cathedral - Music Library, F.160 (olim 1247) (with hymnal)93v, Sundays after Pentecost.

...

In [None]:
column_id = 'source_id'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    281

   Empty values:     40823
   Filled values:    456248

   Value frequencies of first 5 values:
source_309    0.018402
source_390    0.017773
source_633    0.016719
source_336    0.014961
source_573    0.014415
Name: source_id, dtype: float64
---------------------------------------------------------------------------------------------


## Melodies

IDs of melodies - how to decode it?

In [None]:
column_id = 'melody_id'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    92

   Empty values:     496980
   Filled values:    91

   Value frequencies of first 5 values:
038835751fabdac62104a2d5f1b474449f701d025882fc71a6467e5a390a4132    0.010989
a15e2ae954ce0d5156fe7fb821fe5e49026fa74b489a6c131641f3ae97c49a31    0.010989
9d474462825ab1fe5e526c60e1897b657f98ce2d1b61648bcc3c0e083865a68a    0.010989
2e588ca5d6fa690626b5fc41c104f2b528894484ce4a768a5ec0d09edb6639a6    0.010989
b3d4f999ed87e4719fcaf06e7ba5f700352545e68471d1eab55113c288c4c4bb    0.010989
Name: melody_id, dtype: float64
---------------------------------------------------------------------------------------------


## Drupal path

URL to Cantus website page of chant - each chant has own URL.

In [None]:
column_id = 'drupal_path'
head = 5
min_length_not_nan = 0
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    497071

   Empty values:     0
   Filled values:    497071

   Value frequencies of first 5 values:
http://cantus.uwaterloo.ca/chant/693173/    0.000002
http://cantus.uwaterloo.ca/chant/424101/    0.000002
http://cantus.uwaterloo.ca/chant/219506/    0.000002
http://cantus.uwaterloo.ca/chant/459754/    0.000002
http://cantus.uwaterloo.ca/chant/205134/    0.000002
Name: drupal_path, dtype: float64
---------------------------------------------------------------------------------------------


## Full texts

Full text in a standardized spelling.

In [None]:
column_id = 'full_text'
head = 10
min_length_not_nan = 5
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)
print_extended_analyzis(chants, column = column_id, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    53697

   Empty values:     279127
   Filled values:    217944

   Value frequencies of first 10 values:
Gloria patri et filio et spiritui sancto    0.008151
Gloria*                                     0.003911
Alleluia*                                   0.003145
LACUNA                                      0.001971
In omnem terram*                            0.001802
Kyrie eleison*                              0.001788
Gloria et honore*                           0.001628
Specie tua*                                 0.001554
Alleluia alleluia alleluia                  0.001284
Gloria patri*                               0.001279
Name: full_text, dtype: float64
---------------------------------------------------------------------------------------------
---------------------------------- Extended Data Analyzis -----------------------------------
   Average value length:     7

## Full texts - manuscripts

Full text in the manuscript spelling. This also includes barlines, or missing texts, etc.

In [None]:
column_id = 'full_text_manuscript'
head = 10
min_length_not_nan = 5
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)
print_extended_analyzis(chants, column = column_id, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    89090

   Empty values:     345585
   Filled values:    151486

   Value frequencies of first 10 values:
Gloria patri et filio et spiritui sancto    0.005771
Gloria                                      0.003462
Alleluia                                    0.001438
Gloria et honore                            0.001233
In omnem terram                             0.001200
Sanctus                                     0.001161
Agnus                                       0.001095
Gloria patri                                0.001016
Posuisti domine                             0.000956
Tecum principium                            0.000923
Name: full_text_manuscript, dtype: float64
---------------------------------------------------------------------------------------------
---------------------------------- Extended Data Analyzis -----------------------------------
   Average value le

## Volpiano

Transcription of the melody in the Volpiano typeface.

For more info at [cantus page](http://cantus.uwaterloo.ca/description#Volpiano) or [volpiano pdf document](https://cantus.uwaterloo.ca/sites/default/files/documents/2.%20Volpiano%20Protocols.pdf).

In [None]:
column_id = 'volpiano'
head = 40
min_length_not_nan = 1
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)
print_extended_analyzis(chants, column = column_id, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    60282

   Empty values:     433443
   Filled values:    63628

   Value frequencies of first 40 values:
1---6------6---3                            0.022899
1---6------67---3                           0.002813
1---6------6---3---6------6---3             0.001022
1---f--f---f---3                            0.000802
1---f--f--f--f---3                          0.000393
1---f--f--e---3                             0.000377
1--lmnml-kj-klkkj--kl-k--k--kl-k-k--k-      0.000330
1---6------677---3                          0.000299
1---6------6---3---6------67---3            0.000267
1---f--f--f---f---3                         0.000267
1---f--f--f---f--f---3                      0.000251
1---h--h--h---3                             0.000251
1---h--h--g--h---3                          0.000251
1---e---g--h7---3                           0.000251
1---f--f---f--f---3                    

## Notes

In [None]:
column_id = 'notes'
head = 10
min_length_not_nan = 1
print_basic_analyzis(chants, column = column_id, head = head, min_length_not_nan = min_length_not_nan)
print_extended_analyzis(chants, column = column_id, min_length_not_nan = min_length_not_nan)

------------------------------------ Basic Data Analyzis ------------------------------------
   Unique values:    78

   Empty values:     496974
   Filled values:    97

   Value frequencies of first 10 values:
Differentia added in margin                   0.072165
Diff. added in margin                         0.072165
Text and music incipit only                   0.030928
???, feast?                                   0.020619
Psalm  cued but differentia not written in    0.020619
Wrongly labelled A                            0.020619
DELETE                                        0.020619
Incipit is crossed out.                       0.020619
Duplicate - Needs to be deleted               0.020619
ps Deus judicium;...                          0.010309
Name: notes, dtype: float64
---------------------------------------------------------------------------------------------
---------------------------------- Extended Data Analyzis -----------------------------------
   Average value leng