## Creating the Corpus

This code loads the data from the seven volumes of Edgar Thurston and K Rangachari's The Castes and Tribes of Southern India from the HTML files available on Project Gutenberg in a machine readable format.

In [1]:
# importing required libraries
import sys
sys.path.insert(0, '/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages')
import requests as r
from bs4 import BeautifulSoup as soup
import pandas as pd
import re
import spacy
import pickle

### Loading the text from the HTML files

In [2]:
# Function to download data from an html file
def create_corpus(url):
    data = r.get(url)
    vol = soup(data.text, 'html.parser')
    return vol

# Loading the seven volumes
vol_one_data = create_corpus('https://www.gutenberg.org/cache/epub/42991/pg42991-images.html')
vol_two_data = create_corpus('https://www.gutenberg.org/cache/epub/42992/pg42992-images.html')
vol_three_data = create_corpus('https://www.gutenberg.org/cache/epub/42993/pg42993-images.html')
vol_four_data = create_corpus('https://www.gutenberg.org/cache/epub/42994/pg42994-images.html')
vol_five_data = create_corpus('https://www.gutenberg.org/cache/epub/42995/pg42995-images.html')
vol_six_data = create_corpus('https://www.gutenberg.org/cache/epub/42996/pg42996-images.html')
vol_seven_data = create_corpus('https://www.gutenberg.org/cache/epub/42997/pg42997-images.html')

### Extracting the required data from the loaded texts

In [3]:
# Function to add new text data to a list
def add_to_list(descr, list, str):
    if isinstance(descr, str):
        list.append(descr.strip())
    else:
        list.append(descr.get_text().strip()) 

# Function to extract paragraphs from the descriptions
def get_Descriptions(descr):
    if descr.find_next_sibling('p') is not None and descr.find_next_sibling('p').find('b') is None:
        return descr.find_next_sibling('p')
    else:
        return None

# Function to extract headings
def extract_headings(vol):
    headings = vol.body.find_all('b')
    list_clean = []
    for element in headings:
        add_to_list(element, list_clean, str)
        for i in range(0,len(list_clean)):
            list_clean[i] = re.sub('\.—', '', list_clean[i])
            list_clean[i] = re.sub('\.', '', list_clean[i])
    return list_clean

# Function to extract descriptions
def extract_descriptions(vol, main_table):
    vol_headings = vol.body.find_all('b')
    vol_descriptions = []
    # iterating over each heading element and extracting its corresponding descriptions
    for i in range(0, len(vol_headings)):
        descriptions = []
        next_paragraph = vol_headings[i].parent
        add_to_list(next_paragraph, descriptions, str)
        # grabbing each of the paragraphs until arriving at the next heading
        while True:
            next_paragraph = get_Descriptions(next_paragraph)
            if next_paragraph is not None:
                add_to_list(next_paragraph, descriptions, str)
            else:
                break
        # joining the collected descriptions into one string with paragraph breaks
        description_str = '\n\n'.join(descriptions)
        description_str = '—'.join(description_str.split('—')[1:])
        # removing page numbers, footnotes, some punctuation, and line breaks
        description_str = re.sub(r'(?<=[a-zA-Z,.\'"!?])\d+', '', description_str)
        description_str = re.sub('\[[^\]]*\]', '', description_str)
        description_str = re.sub('\r\n', ' ', description_str)
        # appending the cleaned descriptions into the final list
        vol_descriptions.append(description_str)
    main_table.extend(vol_descriptions)

# Extracting headings from each volume
vol_one_headings = extract_headings(vol_one_data)
vol_two_headings = extract_headings(vol_two_data)
vol_three_headings = extract_headings(vol_three_data)
vol_four_headings = extract_headings(vol_four_data)
vol_five_headings = extract_headings(vol_five_data)
vol_six_headings = extract_headings(vol_six_data)
vol_seven_headings = extract_headings(vol_seven_data)

# Manually editing some headings that are stylized in the html file
vol_one_headings[0] = 'Abhishēka'
vol_two_headings[0] = 'Canji'
vol_five_headings[0] = 'Marakkāyar'

# Creating final list of caste names
castes_names = []
castes_names.extend(vol_one_headings)
castes_names.extend(vol_two_headings)
castes_names.extend(vol_three_headings)
castes_names.extend(vol_four_headings)
castes_names.extend(vol_five_headings)
castes_names.extend(vol_six_headings)
castes_names.extend(vol_seven_headings)
print(len(castes_names))

# Extracting descriptions from each volume
castes_descriptions = []
extract_descriptions(vol_one_data, castes_descriptions)
extract_descriptions(vol_two_data, castes_descriptions)
extract_descriptions(vol_three_data, castes_descriptions)
extract_descriptions(vol_four_data, castes_descriptions)
extract_descriptions(vol_five_data, castes_descriptions)
extract_descriptions(vol_six_data, castes_descriptions)
extract_descriptions(vol_seven_data, castes_descriptions)
print(len(castes_descriptions))

2117
2117


### Creating a table with the names and description of each caste
The table is also saved as a CSV file for future use.

In [4]:
# Creating a df
column_names = ['Caste', 'Description']
df = pd.DataFrame(list(zip(castes_names, castes_descriptions)),
                  columns = column_names)
df['doc_id'] = df.index

# Saving the df as a CSV file
df.to_csv('./data/castes_dataframe.csv')

# Saving the df as an HTML file
html = df.to_html()
text_file = open("./data/castes_dataframe.html", "w")
text_file.write(html)
text_file.close()

### Creating doc entities
The following code converts the descriptions loaded above into spaCy doc entities to enable further analysis. <span style= "color:red;"> This chunk of code only needs to be run once. </span>

In [5]:
# Loading the spaCy English language model 
nlp = spacy.load("en_core_web_sm")

# Extracting the descriptions as doc entities
docs = list(nlp.pipe(df.Description))

# Pickling the list of docs
with open("./data/docs.pkl", "wb") as descriptions_docs:
    pickle.dump(docs, descriptions_docs)

### spaCy EDA

In [6]:
# Function to extract tokens and metadata from individual spaCy docs.
def extract_tokens_plus_meta(doc:spacy.tokens.doc.Doc):
    tokens = []
    for i in doc:
        tokens.append([
            i.text, i.i, i.lemma_, i.ent_type_, i.tag_, 
            i.dep_, i.pos_, i.is_stop, i.is_alpha, 
            i.is_digit, i.is_punct
        ])
    return pd.DataFrame(tokens, columns=cols[1:])

# Function to add doc entities and tokens to a df
def tidy_tokens(docs, df):
    for ix, doc in enumerate(docs):
        meta = extract_tokens_plus_meta(doc)
        df = df.append(meta.assign(doc_id=ix))
    return df.assign(doc_id=df.doc_id.astype(int)).loc[:, cols]

# Creating a df for the doc entities and their tokens
cols = [
    "doc_id", "token", "token_order", "lemma", 
    "ent_type", "tag", "dep", "pos", "is_stop", 
    "is_alpha", "is_digit", "is_punct"
]
nlp_df = pd.DataFrame(columns=cols[1:])

# extracting and adding the doc entities and tokens to the created df
nlp_df = tidy_tokens(docs, nlp_df)

# Saving the df as a CSV file
nlp_df.to_csv('./data/nlp_castes_descriptions.csv')

# Saving the df as an HTML file
html = nlp_df.to_html()
text_file = open("./data/nlp_castes_descriptions.html", "w")
text_file.write(html)
text_file.close()

  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append(meta.assign(doc_id=ix))
  df = df.append