In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
HOMEDIR = os.path.expanduser("~") + "/"

In [3]:
DIR = HOMEDIR + "Dropbox/lml/genome_2021/"

In [4]:
df = pd.read_csv(HOMEDIR + "Dropbox/lml/export_01_19_22.csv",encoding='latin1')

In [5]:
df['Secondary Catalog Number'] = df['Secondary Catalog Number'].fillna('').apply(str)

In [6]:
df['Catalog Number'] = df['Catalog Number'].fillna('').apply(str)

In [7]:
df['catalog'] = df['Catalog Number'] + df['Secondary Catalog Number']

In [8]:
cols = ['catalog',
    'Sampled','Scanned','Copies','Format','T1','T2','T3','T4','T5','Year','Manufacturer','Brand','gloss',
    'Thickness?','Gloss?','Texture?','Color?','L*-recto','a*-recto','b*-recto','R-recto','G-recto','B-recto',
    'L*-verso','a*-verso','b*-verso','R-verso','G-verso','B-verso'
]

In [9]:
df = df[cols]

# Gloss

Legacy gloss measurements exist in two files, both now located in Dropbox folder "lml/genome_legacy/".

In [10]:
gg = pd.read_excel(HOMEDIR + "Dropbox/lml/genome_legacy/Gloss Spreadsheet.xls", sheet_name="Sheet 2")

In [11]:
genevieve_idxs = list(gg[" .1"][2:].unique())

In [12]:
pg = pd.read_excel(HOMEDIR + "Dropbox/lml/genome_legacy/gloss.xlsx", sheet_name="raw data")

In [13]:
paul_idx_strings = pg[' '][2:]

In [14]:
paul_idxs = list(pd.Series([item.split('_')[0] for item in paul_idx_strings]).unique())

In [15]:
database_has_gloss = df[['catalog','gloss']][df['Gloss?']==-1]

In [16]:
np.mean(df['Gloss?'][df.gloss.notnull()])

-0.6960830580462483

In [17]:
np.mean(df['Gloss?'][df.gloss.isnull()])

-0.01517874975034951

In [18]:
database_has_gloss.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1551 entries, 2 to 6832
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   catalog  1551 non-null   object 
 1   gloss    1475 non-null   float64
dtypes: float64(1), object(1)
memory usage: 36.4+ KB


Ok so the "Gloss?" column does not even follow the database's own entries for gloss. I don't know where this came from, but clearly it cannot be trusted.

In [19]:
genevieve_idxs = [str(item) for item in genevieve_idxs]

In [20]:
set(database_has_gloss.catalog).intersection(set(genevieve_idxs))

set()

So, none of the "has gloss" comes from Genevieve's measurements.

In [21]:
len(set(database_has_gloss.catalog).intersection(set(paul_idxs))) / len(database_has_gloss)

0.8517085751128304

85% of the "has gloss" appears to come from Paul's measurements.

In [22]:
len(set(df.catalog[df.gloss.notnull()]).intersection(set(paul_idxs))) / len(database_has_gloss)

0.8252740167633784

This number goes down slightly if we just look at actual non-null gloss entries in the database.

In [23]:
# cats in paul_idxs that aren't in the catalog at all

whole_collection_residue = set(paul_idxs) - set(df.catalog)
len(whole_collection_residue)

114

In [31]:
whole_collection_residue

{'1006i',
 '1024i',
 '17j',
 '17l',
 '1942l',
 '2082j',
 '2084cc',
 '2210j',
 '23i',
 '293a',
 '293b',
 '293c',
 '293d',
 '293e',
 '293f',
 '293g',
 '293h',
 '293i',
 '293j',
 '293k',
 '293l',
 '293m',
 '293n',
 '293o',
 '293p',
 '293q',
 '293r',
 '293s',
 '293t',
 '293u',
 '293v',
 '293w',
 '293x',
 '2968aa',
 '2968b',
 '2968bb',
 '2968c',
 '2968cc',
 '2968d',
 '2968dd',
 '2968e',
 '2968ee',
 '2968f',
 '2968ff',
 '2968g',
 '2968gg',
 '2968h',
 '2968hh',
 '2968i',
 '2968ii',
 '2968j',
 '2968jj',
 '2968k',
 '2968kk',
 '2968l',
 '2968ll',
 '2968m',
 '2968mm',
 '2968n',
 '2968nn',
 '2968o',
 '2968oo',
 '2968p',
 '2968pp',
 '2968q',
 '2968qq',
 '2968r',
 '2968rr',
 '2968s',
 '2968ss',
 '2968t',
 '2968tt',
 '2968u',
 '2968uu',
 '2968v',
 '2968vv',
 '2968w',
 '2968ww',
 '2968x',
 '2968xx',
 '2968y',
 '2968z',
 '302a ',
 '302b ',
 '302c ',
 '302d ',
 '302e ',
 '302g ',
 '302h ',
 '302i ',
 '302j ',
 '3198rr',
 '3199ss',
 '4781m',
 '47i',
 '47l',
 '47m',
 '47n',
 '47o',
 '47p',
 '47q',
 '47r',

Puzzlingly, "paul_idxs" contains a number of catalog numbers that don't exist in the catalog at all.

In [24]:
print(len(database_has_gloss),len(paul_idxs),len(genevieve_idxs))

1551 1550 278


In [25]:
set(genevieve_idxs) - set(df.catalog)

set()

All of Genevieve's measurements are actual catalog numbers, so that's good.

In [26]:
df.Format[df.catalog.isin(genevieve_idxs)].value_counts()

Not exposed - package open      337
Not exposed - package sealed    109
Name: Format, dtype: int64

In [27]:
df.Sampled[df.catalog.isin(genevieve_idxs)].value_counts()

-1    413
 0     33
Name: Sampled, dtype: int64

So all of Genevieve's measurements were of binder papers, and weirdly, some haven't been sampled, according to the database.

In [28]:
df.Format[df.catalog.isin(paul_idxs)].value_counts()

Sample book    1615
Name: Format, dtype: int64

All of Paul's were from sample books.

In [29]:
df.catalog.value_counts()[:30]

288b     9
1732     9
1881a    7
366      7
356      7
1397     7
1518     7
141      7
1971     7
288c     7
896      6
994      6
235      6
292d     6
1418     6
2085d    6
2085l    6
289b     6
1450     6
2082n    6
1976     6
1882l    6
188      6
1026     6
1764     6
330      6
1743     6
4913g    6
1882f    6
1882m    6
Name: catalog, dtype: int64

Another puzzle: catalog numbers in the database are not unique. Some not even close. Although I think we've come across this before. I forget the explanation, and it might have something to do with certain types of tests.

In [30]:
df[['Manufacturer','Brand']][df.catalog=='288b']

Unnamed: 0,Manufacturer,Brand
597,Gevaert,Novabrom
598,Gevaert,Novabrom
599,Gevaert,Novabrom
600,Gevaert,Novabrom
601,Gevaert,Novabrom
602,Gevaert,Novabrom
603,Gevaert,Novabrom
604,Gevaert,Novabrom
605,Gevaert,Novabrom
