# Metadata Creation Script - Harvard College Observatory Announcement Cards 
This script generates metadata spreadsheets for each file in the collection. Series are created to indicate datasets and all metadata is expressed at the file level before getting collected into a series/dataset. 

In [3]:
## Set global variables

#source path
source_path = '/Users/katherinemika/Desktop/curation/historic_datasets/hco/'

#import libraries
import pandas as pd
import os
import numpy

In [24]:
dtype_spec = {
    'card_date_year': str,
    'card_date_month': str,
    'card_date_day': str
}

hco_inventory_df = pd.read_csv(source_path + 'Report_HCO_clean.csv', index_col=None, dtype=dtype_spec)
hco_inventory_df

Unnamed: 0,filename,card_number,card_date_year,card_date_month,card_date_day,compiler,observation
0,HCOAnnouncement0001_0001.innodata.xml,1,1926,3,12,Harlow Shapley,
1,HCOAnnouncement0001_0001.innodata.jpg,1,1926,3,12,Harlow Shapley,
2,HCOAnnouncement0001_0001.innodata.txt,1,1926,3,12,Harlow Shapley,
3,HCOAnnouncement0001_0001_a.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET
4,HCOAnnouncement0001_0001_b.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET
...,...,...,...,...,...,...,...
8430,HCOAnnouncement0039_0043_c.innodata.csv,1675,1964,12,23,Richard Southworth For Owen Gingerich,COMET IKEYA (1964f)
8431,HCOAnnouncement0039_0044.innodata.xml,1676,1965,12,30,David Latham For Owen Gingerich,
8432,HCOAnnouncement0039_0044.innodata.jpg,1676,1965,12,30,David Latham For Owen Gingerich,
8433,HCOAnnouncement0039_0044.innodata.txt,1676,1965,12,30,David Latham For Owen Gingerich,


In [25]:
hco_test_batch = hco_inventory_df.head(60)
hco_test_batch

Unnamed: 0,filename,card_number,card_date_year,card_date_month,card_date_day,compiler,observation
0,HCOAnnouncement0001_0001.innodata.xml,1,1926,3,12,Harlow Shapley,
1,HCOAnnouncement0001_0001.innodata.jpg,1,1926,3,12,Harlow Shapley,
2,HCOAnnouncement0001_0001.innodata.txt,1,1926,3,12,Harlow Shapley,
3,HCOAnnouncement0001_0001_a.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET
4,HCOAnnouncement0001_0001_b.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET
5,HCOAnnouncement0001_0002.innodata.xml,2,1926,3,18,Harlow Shapley,
6,HCOAnnouncement0001_0002.innodata.jpg,2,1926,3,18,Harlow Shapley,
7,HCOAnnouncement0001_0002.innodata.txt,2,1926,3,18,Harlow Shapley,
8,HCOAnnouncement0001_0002_a.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET
9,HCOAnnouncement0001_0002_b.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET


In [26]:
# create new col that collects observations according to card by grouping by 'card_number' and concatenating non-NaN 'observation' values
hco_test_batch.loc[:, 'all_observations'] = hco_test_batch.groupby('card_number')['observation'].transform(lambda x: '; '.join(x.dropna().unique()))

hco_test_batch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hco_test_batch.loc[:, 'all_observations'] = hco_test_batch.groupby('card_number')['observation'].transform(lambda x: '; '.join(x.dropna().unique()))


Unnamed: 0,filename,card_number,card_date_year,card_date_month,card_date_day,compiler,observation,all_observations
0,HCOAnnouncement0001_0001.innodata.xml,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET
1,HCOAnnouncement0001_0001.innodata.jpg,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET
2,HCOAnnouncement0001_0001.innodata.txt,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET
3,HCOAnnouncement0001_0001_a.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET
4,HCOAnnouncement0001_0001_b.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET
5,HCOAnnouncement0001_0002.innodata.xml,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET
6,HCOAnnouncement0001_0002.innodata.jpg,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET
7,HCOAnnouncement0001_0002.innodata.txt,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET
8,HCOAnnouncement0001_0002_a.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT’S COMET
9,HCOAnnouncement0001_0002_b.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT’S COMET


In [35]:
hco_test_batch['series_name'] = 'Announcement Card number: ' + hco_test_batch['card_number']
hco_test_batch

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hco_test_batch['series_name'] = 'Announcement Card number: ' + hco_test_batch['card_number']


Unnamed: 0,filename,card_number,card_date_year,card_date_month,card_date_day,compiler,observation,all_observations,series_name
0,HCOAnnouncement0001_0001.innodata.xml,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1
1,HCOAnnouncement0001_0001.innodata.jpg,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1
2,HCOAnnouncement0001_0001.innodata.txt,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1
3,HCOAnnouncement0001_0001_a.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET,Announcement Card number: 1
4,HCOAnnouncement0001_0001_b.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET,Announcement Card number: 1
5,HCOAnnouncement0001_0002.innodata.xml,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2
6,HCOAnnouncement0001_0002.innodata.jpg,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2
7,HCOAnnouncement0001_0002.innodata.txt,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2
8,HCOAnnouncement0001_0002_a.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2
9,HCOAnnouncement0001_0002_b.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2


In [36]:
print(hco_test_batch)


                                   filename card_number card_date_year  \
0     HCOAnnouncement0001_0001.innodata.xml           1           1926   
1     HCOAnnouncement0001_0001.innodata.jpg           1           1926   
2     HCOAnnouncement0001_0001.innodata.txt           1           1926   
3   HCOAnnouncement0001_0001_a.innodata.csv           1           1926   
4   HCOAnnouncement0001_0001_b.innodata.csv           1           1926   
5     HCOAnnouncement0001_0002.innodata.xml           2           1926   
6     HCOAnnouncement0001_0002.innodata.jpg           2           1926   
7     HCOAnnouncement0001_0002.innodata.txt           2           1926   
8   HCOAnnouncement0001_0002_a.innodata.csv           2           1926   
9   HCOAnnouncement0001_0002_b.innodata.csv           2           1926   
10  HCOAnnouncement0001_0002_c.innodata.csv           2           1926   
11    HCOAnnouncement0001_0003.innodata.xml           3           1926   
12    HCOAnnouncement0001_0003.innodat

## Scrape URLs from HCO site

In [37]:
import requests
from bs4 import BeautifulSoup

In [40]:
def scrape_hac_urls(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Dictionary to store HAC numbers and their URLs
    hac_urls = {}

    # Find all links containing "HAC"
    for a_tag in soup.find_all('a', href=True):
        if 'HAC' in a_tag.text:
            # Extract HAC number from the text
            hac_number = ''.join(filter(str.isdigit, a_tag.text))
            if hac_number:
                hac_urls[int(hac_number)] = a_tag['href']

    return hac_urls

In [41]:
hac_url = "http://tamkin2.eps.harvard.edu/services/HACs.html"
hac_urls = scrape_hac_urls(hac_url)

In [63]:
def get_hac_url(card_number):
    try:
        card_number_int = int(card_number)
    except ValueError:
        return None
    url = hac_urls.get(card_number_int, None)
    if url is not None:
        return "http://tamkin2.eps.harvard.edu" + url
    return None

In [64]:
hco_test_batch['url'] = hco_test_batch['card_number'].apply(get_hac_url)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hco_test_batch['url'] = hco_test_batch['card_number'].apply(get_hac_url)


In [65]:
hco_test_batch

Unnamed: 0,filename,card_number,card_date_year,card_date_month,card_date_day,compiler,observation,all_observations,series_name,url
0,HCOAnnouncement0001_0001.innodata.xml,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg
1,HCOAnnouncement0001_0001.innodata.jpg,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg
2,HCOAnnouncement0001_0001.innodata.txt,1,1926,3,12,Harlow Shapley,,BLATHWAYT’S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg
3,HCOAnnouncement0001_0001_a.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg
4,HCOAnnouncement0001_0001_b.innodata.csv,1,1926,3,12,Harlow Shapley,BLATHWAYT’S COMET,BLATHWAYT’S COMET,Announcement Card number: 1,http://tamkin2.eps.harvard.edu/IAUCs/HAC0001.jpg
5,HCOAnnouncement0001_0002.innodata.xml,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg
6,HCOAnnouncement0001_0002.innodata.jpg,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg
7,HCOAnnouncement0001_0002.innodata.txt,2,1926,3,18,Harlow Shapley,,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg
8,HCOAnnouncement0001_0002_a.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg
9,HCOAnnouncement0001_0002_b.innodata.csv,2,1926,3,18,Harlow Shapley,ENSOR'S COMET,ENSOR'S COMET; BLATHWAYT’S COMET,Announcement Card number: 2,http://tamkin2.eps.harvard.edu/IAUCs/HAC0002.jpg


In [66]:
#export metadata file for local editing
hco_test_batch.to_csv(source_path + "batch_metdata.csv", index = False)