In [1]:
import pandas as pd

# Verse-level cross-reference table; the first CSV column is used as the row index.
cross_ref_csv = '../CleanData/by_verse_cross_ref.csv'
cr = pd.read_csv(cross_ref_csv, index_col=0)


## Outputs
- BOM_summary.csv
- ISH_summary.csv
- by_chapter_cross_ref.csv

## Preparing Chapter DataSets

In [2]:
# Build a "<book title> <chapter number>" label for both sides of every
# cross-reference row. The chapter number is stringified and a trailing ".0"
# (presumably left over from a float-typed column) is stripped.
# regex=True is passed explicitly: the pattern is a regular expression, and
# pandas >= 2.0 treats str.replace patterns as literal text by default, which
# would silently leave the ".0" suffix in place.
cr['combo_ISH'] = cr["book_title_ISH"] + ' ' + cr["chapter_number_ISH"].astype(str).str.replace(r'\.0$', '', regex=True)
cr['combo_BOM'] = cr["book_title_BOM"] + ' ' + cr["chapter_number_BOM"].astype(str).str.replace(r'\.0$', '', regex=True)

# Setting up ISH summary data:
# keep rows with a positive ISH word count, and for each ISH verse keep only
# its highest-similarity row.
cr_group = (
    cr[cr['word_count_ISH'] > 0]
    .sort_values(by='similarity_score', ascending=False)
    .drop_duplicates(subset=['verse_title_ISH'], keep='first')
)

# One row per ISH chapter. reset_index() (without drop) keeps the previous row
# labels as an extra column, which the statistics cell later removes.
ISH = (
    cr_group.drop_duplicates(subset=['combo_ISH'], keep='first')[['combo_ISH', 'chapter_number_ISH', 'Duhms_Class']]
    .reset_index()
    .sort_values(by=['chapter_number_ISH'])
)

# Setting up BOM summary data (same recipe, keyed on the BOM verse/chapter).
cr_group2 = (
    cr[cr['word_count_BOM'] > 0]
    .sort_values(by='similarity_score', ascending=False)
    .drop_duplicates(subset=['verse_title_BOM'], keep='first')
)

BOM = pd.DataFrame(
    cr_group2.drop_duplicates(subset=['combo_BOM'], keep='first')[['combo_BOM', 'book_title_BOM', 'chapter_number_BOM']]
)

# Setting up the edges / chapter-level cross-reference summary:
# number of rows shared by each (ISH chapter, BOM chapter, Duhm's class) triple ...
cr_group3 = (
    cr.groupby(['combo_ISH', 'combo_BOM', 'Duhms_Class'])['verse_title_ISH']
    .count()
    .reset_index()
    .rename(columns={'verse_title_ISH': 'number_shared_verses'})
)
# ... plus one count column per similarity category for each chapter pair.
temp = (
    cr.groupby(['combo_BOM', 'combo_ISH', 'chapter_number_ISH'])['similarity_category']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)
# merge() without `on` joins on the columns the two frames have in common.
cr_full_grouped = cr_group3.merge(temp).sort_values(by=['chapter_number_ISH'])

  cr['combo_ISH'] = cr["book_title_ISH"] + ' ' + cr["chapter_number_ISH"].astype(str).str.replace(r'\.0$','')
  cr['combo_BOM'] = cr["book_title_BOM"] + ' ' + cr["chapter_number_BOM"].astype(str).str.replace(r'\.0$','')


## Calculating by-chapter statistics

In [3]:
# ISH stats
ish_keys = ['combo_ISH', 'chapter_number_ISH']
# One row per ISH verse, reused by the per-chapter aggregations below.
ish_verses = cr_group.drop_duplicates(subset=['verse_title_ISH'], keep='first')

# Total word count per ISH chapter (the trailing reset_index() adds an 'index' column).
word_count_ISH = (
    ish_verses.groupby(ish_keys)['word_count_ISH']
    .sum()
    .reset_index()
    .sort_values(by=['chapter_number_ISH'])
    .reset_index()
)
# Sum of the bible_term_in_ISH column per ISH chapter (same shape as above).
bibleterm_count_ISH = (
    ish_verses.groupby(ish_keys)['bible_term_in_ISH']
    .sum()
    .reset_index()
    .sort_values(by=['chapter_number_ISH'])
    .reset_index()
)
# One count column per similarity category, computed over all cross-reference rows.
group_count_ISH = (
    cr.groupby(ish_keys)['similarity_category']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
    .sort_values(by=['chapter_number_ISH'])
)
group_count_ISH['cross_ref_count'] = group_count_ISH[['Direct Quote', 'Shared Language', 'Similar Theme']].sum(axis=1)
# Number of distinct verse numbers per ISH chapter.
verse_count_ISH = (
    ish_verses.groupby(ish_keys)
    .nunique()
    .reset_index()
    .sort_values(by=['chapter_number_ISH'])[['combo_ISH', 'verse_number_ISH']]
    .rename(columns={'verse_number_ISH': 'verse_count_ISH'})
)

# Join to ISH: left-join every statistic onto the chapter list, in this order.
ISH_stats = ISH
for stat_frame, join_cols in (
    (word_count_ISH, ish_keys),
    (bibleterm_count_ISH, ish_keys),
    (group_count_ISH, ish_keys),
    (verse_count_ISH, ['combo_ISH']),
):
    ISH_stats = ISH_stats.merge(stat_frame, how='left', on=join_cols)

# The merges leave three leftover index columns ('index_x', 'index_y', 'index').
# Because 'index' already exists, reset_index() stores the fresh 0..n-1 labels
# in 'level_0'; the leftovers are dropped and 'level_0' takes over the name 'index'.
ISH_stats = (
    ISH_stats.reset_index()
    .fillna(0)
    .drop(columns=['index_x', 'index_y', 'index'])
    .rename(columns={'level_0': 'index'})
)
ISH_stats['chapter_number_ISH'] = ISH_stats['chapter_number_ISH'].apply(int)

# BOM stats
bom_keys = ['combo_BOM', 'chapter_number_BOM']
# One row per BOM verse, reused by the two sums below.
bom_verses = cr_group2.drop_duplicates(subset=['verse_title_BOM'], keep='first')

# Total word count and bible_term_in_BOM sum per BOM chapter.
word_count_BOM = bom_verses.groupby(bom_keys)['word_count_BOM'].sum().reset_index()
bibleterm_count_BOM = bom_verses.groupby(bom_keys)['bible_term_in_BOM'].sum().reset_index()
# One count column per similarity category, computed over all cross-reference rows.
group_count_BOM = (
    cr.groupby(bom_keys)['similarity_category']
    .value_counts()
    .unstack(fill_value=0)
    .reset_index()
)
group_count_BOM['cross_ref_count'] = group_count_BOM[['Direct Quote', 'Shared Language', 'Similar Theme']].sum(axis=1)
# Number of distinct verse numbers per BOM chapter (taken from the full `cr` table).
verse_count_BOM = (
    cr.drop_duplicates(subset=['verse_title_BOM'], keep='first')
    .groupby(bom_keys)
    .nunique()
    .reset_index()[['combo_BOM', 'verse_number_BOM']]
    .rename(columns={'verse_number_BOM': 'verse_count_BOM'})
)

# Join to BOM: each left merge joins on whatever columns the two frames share.
BOM_stats = BOM
for stat_frame in (word_count_BOM, bibleterm_count_BOM, group_count_BOM, verse_count_BOM):
    BOM_stats = BOM_stats.merge(stat_frame, how='left')
BOM_stats = BOM_stats.fillna(0)
BOM_stats['chapter_number_BOM'] = BOM_stats['chapter_number_BOM'].apply(int)


## Ordering by chapter occurrence

In [4]:
# Original scripture data set; its row order is used to order chapters.
scriptures_xlsx = '../DirtyData/lds-scriptures.xlsx'
scripts = pd.read_excel(scriptures_xlsx)

In [5]:
# Ordering by chapter occurrence requires the order in which chapters appear
# in the original scripture data set.
bom_chapters = scripts[scripts['volume_short_title'] == 'BoM'].reset_index(drop=True)[['book_title', 'chapter_number']]
# Same "<book title> <chapter number>" label format as combo_BOM. regex=True is
# explicit because pandas >= 2.0 treats str.replace patterns as literal text by
# default, which would leave any trailing ".0" in place.
bom_labels = bom_chapters["book_title"] + ' ' + bom_chapters["chapter_number"].astype(str).str.replace(r'\.0$', '', regex=True)
# Unique labels in first-appearance order, as a plain list.
BOM_series = bom_labels.drop_duplicates(keep='first').reset_index(drop=True).to_list()

  BOM_series = BOM_series["book_title"] + ' ' + BOM_series["chapter_number"].astype(str).str.replace(r'\.0$','')


In [6]:
# Turn the chapter label into a categorical whose category order is BOM_series,
# so that sorting follows that order instead of alphabetical order.
BOM_stats['combo_BOM'] = pd.Categorical(BOM_stats['combo_BOM'], categories=BOM_series)
BOM_stats = BOM_stats.sort_values(by='combo_BOM').reset_index(drop=True)

## To CSV files

In [7]:
# Write each result table to its CSV file (default to_csv options).
for frame, csv_path in (
    (cr_full_grouped, '../CleanData/by_chapter_cross_ref.csv'),
    (ISH_stats, '../CleanData/ISH_summary.csv'),
    (BOM_stats, '../CleanData/BOM_summary.csv'),
):
    frame.to_csv(csv_path)