In [24]:
### This is the main jupyter project notebook for The Ciphers of <i>the Times</i>' computational analysis.
### Within, we analyze two corpora. One is our custom curated corpus of "Newspaper Novels," a genre of Sensation Novels from the 19th century
### that emphasize their relationship to newspapers through content and style. Many of these novels were coined "Newspaper Novels" by 19th century critics,
### others were categorized as such by contemporary academics investigating the phenomenon of Agony Columns in the fiction novels of the Victorian era.
### The other corpus we analyze here is from the NOVEL450 dataset, curated and analyzed by .txtLAB, a McGill-based research group: https://txtlab.org/
### This corpus serves as a form of control corpus, a crutch to help identify particularities about the "Newspaper Novel" corpus. 

### We have attempted to organize this notebook in a way that would allow us to run the same pipeline on both corpora with simplicity, clarity, and reproducibility in mind.
### But, we are by no means professional programmers, and this project has been an opportunity for <i>the Times</i>' team members to learn and develop our skills
### in Digital Humanities computational analysis. 

### We begin by importing the corpus from .txt files into a dataframe, and proceed to clean, extract metadata (basic statistics, TTR and MATTR, and POS tagging), 
### and save the new spreadsheets to prepare them for visualization and continued analysis. 

### The metadata outputs of this notebook already exist within the data/spreadsheets folder on github.
### The full dataframes (with texts) can be reproduced by following the instructions below. We unfortunately could not include the dataframes containing
### the full texts due to the space restriction on github for a single file.



### jupyter lab --notebook-dir=E:/

In [25]:
### DEFINE PATHS:
### Paths are defined for the various spreadsheets and corpora used.
### All paths are relative, so they resolve against the directory the notebook is run from.
### Stopwords were initially taken from: https://towardsdatascience.com/getting-started-with-text-analysis-in-python-ca13590eb4f7
### and then tailored to our pipeline

## to Newspaper Novel corpus (folder of .txt files, one per novel):
path_to_nnovels_corpus = '../../data/corpora/corpus_newspaper_novels/' 

## to assets (the stopword and exclusion files below are read from this folder):
path_to_assets = '../../assets/'

## to all spreadsheets (the save/load helpers prepend this to the file name):
path_to_spreadsheets = '../../data/spreadsheets/'

## to .txtLab corpus consisting of 150 English-language novels:
# path_to_dirty_txtlab_corpus = '../../data/spreadsheets/dirtyengnovels-211215.csv' ## This no longer exists, left for pipeline viewing. See df_txtlab_meta.csv for results. 

## nnovels corpus metadata:
nnovels_corpus_metadata = '../../data/spreadsheets/nnovels_corpus_metadata.csv' 

## stop words (file name only; joined with path_to_assets when opened):
stopwords_file = 'stopwords.txt'

## characters and numbers to exclude from texts (file name only; joined with path_to_assets when opened):
exclude_file = 'characters_and_numbers_to_exclude.txt'



In [26]:
### Imports, utility, and important functions begin here:

## basic libraries:
import os
import codecs
import re
import string
import sys
import numpy as np
import spacy
import pandas as pd
import csv
from collections import Counter

# lexical diversity library:
from lexicalrichness import LexicalRichness

# to be able to see more columns when dataframes are printed out:
pd.set_option('display.max_columns', 100)

# to not get copy warnings when splitting dataframes
pd.options.mode.chained_assignment = None

In [27]:
## STOPWORDS and elements to exclude
# Both asset files are read in full and split on whitespace, giving one list entry per
# stopword / per character-or-number to strip. Undecodable bytes are silently ignored.
with codecs.open(path_to_assets + stopwords_file, 'r', encoding='utf-8', errors="ignore") as stopwords_raw:
    stopwords = stopwords_raw.read().split()
with codecs.open(path_to_assets + exclude_file, 'r', encoding='utf-8', errors="ignore") as characters_to_exclude_raw:
    characters_to_exclude = characters_to_exclude_raw.read().split()

In [28]:
### UTILITY CODE:

## This section allows us to open large dataframes by redefining the max size of the csv fields.
## Solution taken from: https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072

# start from the largest integer Python reports for this platform and shrink it
# until the csv module accepts it as its field size limit
maxInt = sys.maxsize
while True:
    # decrease the maxInt value by factor 10 as long as the OverflowError occurs
    try:
        csv.field_size_limit(maxInt)
        # the limit was accepted, stop shrinking
        break
    except OverflowError:
        # value too large for the csv module on this platform: retry with a tenth of it
        maxInt = int(maxInt/10)
        

In [29]:
### FUNCTIONS 1/6
### Various functions for the project begin here 

## Progress bar to view the progress of lengthy processes
# As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
def progress(count, total, status=''):
    """Draw a one-line text progress bar on stdout.

    The line ends with a carriage return so the next call overwrites it.

    Parameters:
        count: number of steps completed so far.
        total: total number of steps; 0 is treated as an already finished job.
        status: optional text appended after the percentage.
    """
    bar_len = 60
    # an empty job counts as complete instead of raising ZeroDivisionError
    fraction = count / float(total) if total else 1.0
    # clamp so the bar never under- or overflows its 60 slots
    fraction = min(max(fraction, 0.0), 1.0)
    filled_len = int(round(bar_len * fraction))
    # scale by exactly 100 so a finished job reads 100.0%
    percents = round(100.0 * fraction, 1)
    bar = '#' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()
    

## Used to incorporate a metadata spreadsheet and gather an already ordered corpus within a folder,
## and returns a dataframe
def import_corpus_and_meta(path_to_corpus=path_to_nnovels_corpus, path_to_meta_data=nnovels_corpus_metadata):
    """Load the metadata spreadsheet and attach each novel's raw text to it.

    Parameters:
        path_to_corpus: folder holding one text file per novel.
        path_to_meta_data: csv file with one metadata row per novel.

    Returns:
        The metadata dataframe (first csv column dropped) with the raw text of
        each file stored in a 'dirty_text' column.
    """
    # read in metadata
    df = pd.read_csv(path_to_meta_data, engine='python')

    # drop faulty index
    df.drop(df.columns[0], axis=1, inplace=True)

    # grab all texts from corpus and strip of project gutenberg endtext
    #TODO: THIS PROCESS NEEDS TO BE COMPLETED MANUALLY ON CORPUS AND DELETED
    split_on = ["END OF THE PROJECT GUTENBERG","End of the Project Gutenberg EBook","End of Project Gutenberg","End of The Project Gutenberg"]

    # list the folder once, and list the folder that was actually passed in
    # rather than the module-level default
    # NOTE(review): texts are matched to metadata rows purely by position, and os.listdir
    # does not guarantee any order - confirm the listing order matches the metadata rows
    text_names = os.listdir(path_to_corpus)
    total_texts = len(text_names)

    # loop through each novel in the directory, open the file
    for text_index, textname in enumerate(text_names):
        # os.path.join keeps working whether or not path_to_corpus ends with a separator
        with codecs.open(os.path.join(path_to_corpus, textname), 'r', encoding='utf-8', errors="ignore") as raw_text:
            dirty_text = raw_text.read()

        # getting rid of the project gutenberg endtext: keep only what precedes each marker
        for end_marker in split_on:
            dirty_text = dirty_text.split(end_marker)[0]

        # input into df (row label = position of the file in the listing)
        df.at[text_index, 'dirty_text'] = dirty_text

        # show progress
        progress(text_index + 1, total_texts)
    return df


In [30]:
### FUNCTIONS 2/6

## Cleans a text string and returns the cleaned text (string), the tokens before stopword
## filtering (list), the tokens after stopword filtering (list), and the text as sentences (list)
def clean_text(text, stopwords=stopwords, characters_to_exclude=characters_to_exclude):
    """Lowercase, strip and tokenize one text.

    Parameters:
        text: the raw text as a string.
        stopwords: collection of words to drop from the stopped token list.
        characters_to_exclude: collection of characters removed from the text.

    Returns:
        A 4-tuple:
        - the cleaned text as a single string,
        - the list of tokens with sentence punctuation stripped (stopwords kept),
        - the same list with stopwords removed,
        - the list of sentences (split on . : ? !).
    """
    # lists/tuples are converted to sets so each membership test is a hash lookup
    # instead of a scan; other container types are used exactly as passed in
    if isinstance(characters_to_exclude, (list, tuple)):
        characters_to_exclude = set(characters_to_exclude)
    if isinstance(stopwords, (list, tuple)):
        stopwords = set(stopwords)

    # lowercasing the text
    text = text.lower()

    # removing all characters in characters_to_exclude
    text = ''.join(char for char in text if char not in characters_to_exclude)

    # replacing all newline '\n' with spaces
    text = text.replace('\n', " ")

    # replacing all multiple spaces with a single space
    # (raw string: '\s' in a plain string literal is an invalid escape sequence)
    text = re.sub(r'\s+', ' ', text)

    # getting list of sentences
    text_split_sentences = re.split(r"\.|\:|\?|\!", text)

    # getting rid of empty elements in sentences
    text_split_sentences = list(filter(None, text_split_sentences))

    # splitting text by spaces for tokenization
    text_split = re.split(r"\s", text)

    # removing all empty elements in text_split
    text_split_stopless = list(filter(None, text_split))

    # getting rid of end-of-line punctuation:
    # NOTE(review): a token made only of punctuation (e.g. "...") becomes an empty string
    # here and is still counted as a word - confirm whether that is intended
    text_split_stopless = [word.strip(".?:!") for word in text_split_stopless]

    # getting rid of all stopwords:
    text_split_stopped = [word for word in text_split_stopless if word not in stopwords]

    return text, text_split_stopless, text_split_stopped, text_split_sentences


## Takes a dataframe and the name of its dirty_text column, cleans every text with clean_text,
## and returns the dataframe with the cleaned texts and basic statistics added as columns
def clean_up_corpus_and_grab_basic_stats(df, dirty_text_column='dirty_text'):
    """Clean every text in df and record word and sentence statistics.

    Parameters:
        df: dataframe with one text per row; it is modified in place.
        dirty_text_column: name of the column holding the raw text.

    Returns:
        The same dataframe with these columns filled in per row:
        words_as_string_for_vectorizor, words_count_stopless, words_count_stopped,
        words_standardized_stopped, percentage_stopped_of_stoppless,
        sentences_standardized_stopless, sentences_count, average_words_per_sentence.
    """
    print("Cleaning text, assigning them to columns, and grabbing basic stats...")

    # creating columns for data
    df['words_standardized_stopped'] = ''
    df['sentences_count'] = 0
    df['average_words_per_sentence'] = 0.0
    df['sentences_standardized_stopless'] = ''

    total_rows = len(df.index)

    # loop through the dataframe; rows_done counts rows so the progress bar does not
    # depend on what the index labels happen to be
    for rows_done, (index, row) in enumerate(df.iterrows(), start=1):
        # get the clean text for each novel
        text, text_split_stopless, text_split_stopped, text_split_sentences = clean_text(row[dirty_text_column])

        # getting basic stats for tokenized texts (words):
        words_count_stopless = len(text_split_stopless)
        words_count_stopped = len(text_split_stopped)
        # a text with no words gets 0.0 instead of raising ZeroDivisionError
        if words_count_stopless:
            percentage_stopped_of_stoppless = (words_count_stopped / words_count_stopless) * 100
        else:
            percentage_stopped_of_stoppless = 0.0

        # getting basic stats for tokenized texts (sentences):
        sentences_count = len(text_split_sentences)
        words_in_sentences = sum(len(sentence.split()) for sentence in text_split_sentences)
        # a text with no sentences gets 0.0 instead of raising ZeroDivisionError
        if sentences_count:
            average_words_per_sentence = words_in_sentences / sentences_count
        else:
            average_words_per_sentence = 0.0

        # inputting data into df
        df.at[index, 'words_as_string_for_vectorizor'] = text
        df.at[index, 'words_count_stopless'] = words_count_stopless
        df.at[index, 'words_count_stopped'] = words_count_stopped
        df.at[index, 'words_standardized_stopped'] = text_split_stopped
        df.at[index, 'percentage_stopped_of_stoppless'] = percentage_stopped_of_stoppless
        df.at[index, 'sentences_standardized_stopless'] = text_split_sentences
        df.at[index, 'sentences_count'] = sentences_count
        df.at[index, 'average_words_per_sentence'] = average_words_per_sentence

        # show progress bar
        progress(rows_done, total_rows)

    return df

In [31]:
### FUNCTIONS 3/6

## Takes a dataframe, a text column (string), and several parameters, and returns the ttr and mattr (with multiple configurations) for each row:
def run_ttr_analysis_on_df(df, text_column='words_as_string_for_vectorizor', full_text_ttr=True, moving_average_ttr=True, window_sizes=[500, 2000]):
    """Compute type-token ratio measures for every text in df.

    Parameters:
        df: dataframe with one text per row; it is modified in place.
        text_column: name of the column holding the text as a single string.
        full_text_ttr: when true, store the whole-text TTR in 'full_text_ttr'.
        moving_average_ttr: when true, store one 'mattr_<window>' column per window size.
        window_sizes: window sizes (in words) for the moving-average TTR; the list is only read.

    Returns:
        The same dataframe with the requested columns added. A text shorter than a
        window gets no value for that window; an empty text gets no TTR value.
    """
    total_rows = len(df.index)

    for rows_done, (index, row) in enumerate(df.iterrows(), start=1):
        # grabs text column from df
        text = row[text_column]
        # gets lex from lexical richness library
        lex = LexicalRichness(text)
        # the window is measured in words, so compare against the word count
        # rather than the character length of the string
        word_count = lex.words

        # these switches are here in case someone wants to run just ttr/mattr (since this can take some time)
        # an empty text has no ratio to compute, so it is skipped
        if full_text_ttr and word_count > 0:
            df.loc[index, 'full_text_ttr'] = lex.ttr
        if moving_average_ttr:
            for window_size in (window_sizes or []):
                # mattr rejects a window larger than the text, so such windows are skipped
                if (window_size is not None) and (word_count >= window_size):
                    df.loc[index, f'mattr_{str(window_size)}'] = lex.mattr(window_size=window_size)

        # show progress bar (rows processed so far, independent of the index labels)
        progress(rows_done, total_rows)
    return df

In [32]:
### FUNCTIONS 4/6

## Takes in a dataframe, a list of columns to visualize, and a date_column, then visualizes it over time using matplotlib.
## Created as a slightly simpler way to check and compare visualizations between different columns and/or dataframes

#TODO: make multiple dfs visualizable:
def visualize_numerical_columns__over_time(df, list_of_columns_to_visualize, date_column='book_year', graph_y_label='What are we counting?', title='SOMETHING over Time'):
    """Plot the per-year mean of one or more numerical columns as lines on one figure.

    Parameters:
        df: dataframe holding the columns to plot and the date column.
        list_of_columns_to_visualize: names of the columns to plot; values are
            converted with pd.to_numeric before averaging.
        date_column: column whose values group the rows (one point per distinct value).
        graph_y_label: label for the y axis.
        title: title of the figure.

    Returns:
        None; the figure is shown with plt.show().
    """
    # importing libraries (kept local: they are only needed for this helper)
    import matplotlib.pyplot as plt
    import random

    # setting up lists to capture the years, counts, labels, and colours to visualize
    years_list = []
    counts_list = []
    labels_list = []
    colors_list = []
    for column_name in list_of_columns_to_visualize:

        # grouping the column by years and getting arrays for graph
        grouped_by_year = pd.to_numeric(df[column_name]).groupby(df[date_column])
        grouped_by_year = grouped_by_year.mean().reset_index()
        years = np.array(grouped_by_year[date_column].tolist())
        count_to_visualize = np.array(grouped_by_year[column_name].tolist())

        # saving the info for each column to visualize in a list
        years_list.append(years)
        counts_list.append(count_to_visualize)
        labels_list.append(column_name)

        # getting random colors to differentiate between visualized data
        # (unseeded, so colours change between runs)
        r = random.random()
        b = random.random()
        g = random.random()
        colors_list.append((r, g, b))

    # plotting SOMETHING over time:
    plt.figure(figsize=(20,10))
    for years, counts, label, color in zip(years_list, counts_list, labels_list, colors_list):
        plt.plot(years, counts, label=label, c=color)
    plt.legend(loc='upper left')
    # label both axes; graph_y_label was previously accepted but never applied
    plt.xlabel(date_column)
    plt.ylabel(graph_y_label)
    plt.title(title)
    plt.show()



In [33]:
### FUNCTIONS 5/6
## POS tagging:

# first loading english language support
# (the pipeline is loaded once at module level and reused by get_POS_tags_for_text_in_df below)
nlp = spacy.load("en_core_web_sm")

## Takes in a dataframe and clean text column (as string), and returns the df with POS tags for all the texts
## Multiple columns are created, one for each POS tag, and one that contains all POS tags (I did this to more easily
## be able to grab POS percentages afterward)
def get_POS_tags_for_text_in_df(df, text_row_to_analyze='words_as_string_for_vectorizor'):
    """Count parts of speech for every text in df and add counts and percentages.

    Parameters:
        df: dataframe with one text per row; it is modified in place.
        text_row_to_analyze: name of the column holding the text as a single string.

    Returns:
        The same dataframe with:
        - 'all_pos_counts': list of [tag, count] pairs per text,
        - 'POS_<TAG>_count': one count column per tag seen in any text
          (0 where a text does not contain that tag),
        - 'parts_of_speech_total_count': sum of all POS_ columns per text,
        - '%POS_<TAG>_count': each count as a percentage of that total, rounded to 3 places.
    """
    # setting up column for pos counts
    df['all_pos_counts'] = ''

    total_rows = len(df.index)

    # loop through df and get all POS tags:
    for rows_done, (index, row) in enumerate(df.iterrows(), start=1):

        # grab text
        text = row[text_row_to_analyze]

        # this is a memory buffer, to extend max length of available ram according to the text being analyzed
        # https://datascience.stackexchange.com/questions/38745/increasing-spacy-max-nlp-limit
        nlp.max_length = len(text) + 100

        # disable modules not in use to save memory
        analyzed_doc = nlp(text, disable = ['ner'])

        # grabbing all pos counts in the text in non-human readable format (keyed by id)
        pos_counts_in_text = analyzed_doc.count_by(spacy.attrs.IDS['POS'])

        # turning the ids into human readable tags, kept as [tag, count] lists to input back into df
        human_readable_pos_count_list = [
            [analyzed_doc.vocab[pos].text, count]
            for pos, count in pos_counts_in_text.items()
        ]

        # assigning each tag's count to its own column for this row
        for tag, count in human_readable_pos_count_list:
            df.at[index, 'POS_' + str(tag) + '_count'] = count

        # placing all the pos counts for each text in the all_pos_counts column
        df.at[index, 'all_pos_counts'] = human_readable_pos_count_list

        # show progress (rows processed so far, independent of the index labels)
        progress(rows_done, total_rows)

    # getting POS percentages for each POS tag in texts, computed column-wise
    pos_count_columns = [
        name for name in df.columns.values.tolist()
        if isinstance(name, str) and name.startswith("POS_")
    ]

    # a tag that never occurs in a text was left empty by the loop above; it means
    # zero occurrences, and leaving it as NaN would turn the row total into NaN
    for name in pos_count_columns:
        df[name] = df[name].fillna(0)

    # get total POS elements count for sanity
    totals = df[pos_count_columns].sum(axis=1)
    df["parts_of_speech_total_count"] = totals.astype(int)

    # a row with no tagged tokens gets NaN percentages rather than a division by zero
    safe_totals = totals.replace(0, np.nan)
    for name in pos_count_columns:
        # get % of total POS in text
        df["%" + name] = ((df[name] / safe_totals) * float(100)).round(3)
    return df


In [34]:
### FUNCTIONS 6/6

## Functions that 1) save just metadata, 2) save the full dataframe, 3) load the dataframe

# for metadata output:
def output_metadata(df, spreadsheet_name='FILL_IN_SPREADSHEET_NAME_META', path_to_spreadsheets=path_to_spreadsheets):
    """Write df to '<path_to_spreadsheets><spreadsheet_name>.csv' without its text columns.

    Columns whose lowercased name is one of the full-text columns are left out;
    every other column is exported.
    """
    # columns that hold the texts themselves and therefore do not belong in a metadata export
    text_columns = ['words_standardized_stopped', 'sentences_standardized_stopless','words_as_string_for_vectorizor', 'dirty_text']

    # keep every column that is not a text column (compared case-insensitively)
    kept_columns = []
    for column_name in df.columns.values.tolist():
        if column_name.lower() in text_columns:
            continue
        kept_columns.append(column_name)

    destination = path_to_spreadsheets + spreadsheet_name + '.csv'
    df[kept_columns].to_csv(destination)
    print(spreadsheet_name + ' was saved in '+str(path_to_spreadsheets))
    
# for full output:
def output_full(df, spreadsheet_name='FILL_IN_SPREADSHEET_NAME_FULL', path_to_spreadsheets=path_to_spreadsheets):
    """Write the whole dataframe, texts included, to '<path_to_spreadsheets><spreadsheet_name>.csv'."""
    destination = path_to_spreadsheets + spreadsheet_name + '.csv'
    df.to_csv(destination)
    confirmation = spreadsheet_name + ' was saved in '+str(path_to_spreadsheets)
    print(confirmation)

# load a dataframe
def open_df_and_print(file_name='df_full.csv', path_to_spreadsheets=path_to_spreadsheets, drop_first_column=False):
    """Read '<path_to_spreadsheets><file_name>' into a dataframe and return it.

    With drop_first_column=True the first column of the file is removed, which
    discards the index column written when the file was saved. Nothing is
    printed; the caller displays the result.
    """
    source = path_to_spreadsheets + file_name
    loaded = pd.read_csv(source, engine='python')
    if drop_first_column == True:
        first_column = loaded.columns[0]
        loaded.drop(first_column, axis=1, inplace=True)
    return loaded
    



In [35]:
# RUN UP TO HERE

In [36]:
# # ## BEGINNING OF NNOVEL ANALYSIS:
# # ## IF THIS NEEDS TO BE RUN, UNCOMMENT EVERYTHING AND RUN (make sure that the paths at the top of the notebook correspond with your paths)

# # Complete Initial Pipeline (needs to be run once):
# print("Starting pipeline...")

# print("\nImporting corpus from "+ str(path_to_nnovels_corpus)+ " and metadata from "+ str(nnovels_corpus_metadata))
# df = import_corpus_and_meta(path_to_nnovels_corpus, nnovels_corpus_metadata)

# # for testing purposes: 
# # df = df.loc[:5,:]

# print('\nCleaning up corpus and grabbing basic statistics')
# df = clean_up_corpus_and_grab_basic_stats(df)
# df.drop('dirty_text', axis=1, inplace=True)

# print('\nRunning TTR analysis (this can take some time)')
# df = run_ttr_analysis_on_df(df)

# print('\nRunning POS analysis (this can take some time)')
# df = get_POS_tags_for_text_in_df(df, text_row_to_analyze='words_as_string_for_vectorizor')

# df.head()
# print('\nSaving df')
# output_metadata(df, spreadsheet_name = 'df_nnovels_meta')
# output_full(df, spreadsheet_name = 'df_nnovels_full')
# print('\nPrinting df')
# df.head()

In [37]:
### If initial pipeline has already run:
### Import the dfs from spreadsheets folder 

# df_nnovels_full = open_df_and_print(file_name = 'df_nnovels_full.csv', path_to_spreadsheets=path_to_spreadsheets, drop_first_column=True)
# df_nnovels_full.head()

In [38]:
### If initial pipeline has already run:
### Import the dfs from spreadsheets folder 

# load the metadata-only spreadsheet; the first column of the file is dropped on load
df_nnovels_meta = open_df_and_print(file_name = 'df_nnovels_meta.csv', path_to_spreadsheets=path_to_spreadsheets, drop_first_column=True)
df_nnovels_meta.head()


Unnamed: 0,book_title,book_year,author_name,words_count_stopless,words_count_stopped,percentage_stopped_of_stoppless,sentences_count,average_words_per_sentence,full_text_ttr,mattr_500,mattr_2000,all_pos_counts,50_most_common_nouns,50_most_common_verbs,POS_PUNCT,POS_NOUN,POS_DET,POS_ADJ,POS_VERB,POS_SCONJ,POS_ADP,POS_AUX,POS_PART,POS_CCONJ,POS_ADV,POS_PROPN,POS_PRON,POS_NUM,POS_INTJ,POS_X,noun_percentage,verb_and_aux_percentage,POS_SYM,parts_of_speech_total_count,%POS_PUNCT,%POS_NOUN,%POS_DET,%POS_ADJ,%POS_VERB,%POS_SCONJ,%POS_ADP,%POS_AUX,%POS_PART,%POS_CCONJ,%POS_ADV,%POS_PROPN,%POS_PRON,%POS_NUM,%POS_INTJ,%POS_X,%POS_SYM
0,Agnes_Grey,1847,Anne_Bronte,67718.0,24161.0,35.678845,3201.0,21.175258,0.108841,0.519898,0.368708,"[['PUNCT', 4930], ['NOUN', 10116], ['DET', 825...","[('time', 142), ('mother', 124), ('day', 65), ...","[('would', 379), ('could', 260), ('said', 186)...",4930.0,10116.0,8255.0,4864.0,10808.0,2000.0,6730.0,3776.0,2632.0,3931.0,5084.0,2033.0,8228.0,357.0,277.0,37.0,13.66,14.594,0.0,74058.0,6.657,13.66,11.147,6.568,14.594,2.701,9.087,5.099,3.554,5.308,6.865,2.745,11.11,0.482,0.374,0.05,0.0
1,Armadale,1864,Wilkie_Collins,295546.0,111234.0,37.636781,19298.0,15.344647,0.045656,0.497521,0.341371,"[['ADP', 36354], ['PUNCT', 20620], ['DET', 436...","[('time', 864), ('midwinter', 786), ('man', 70...","[('said', 1136), ('will', 694), ('would', 578)...",20620.0,52174.0,43605.0,18284.0,44704.0,6609.0,36354.0,16230.0,9975.0,9477.0,17419.0,12346.0,32389.0,1712.0,541.0,25.0,16.18,13.863,0.0,322464.0,6.395,16.18,13.522,5.67,13.863,2.05,11.274,5.033,3.093,2.939,5.402,3.829,10.044,0.531,0.168,0.008,0.0
2,Aurora_Floyd,1863,Mary_Elizabeth_Braddon,183016.0,74878.0,40.913363,9425.0,19.422599,0.075907,0.529468,0.381519,"[['PROPN', 9662], ['ADV', 10206], ['DET', 2736...","[('man', 506), ('face', 245), ('time', 233), (...","[('said', 562), ('would', 553), ('could', 449)...",10576.0,33566.0,27368.0,13795.0,25284.0,5435.0,20867.0,9771.0,4237.0,7753.0,10206.0,9662.0,15633.0,1101.0,394.0,56.0,17.151,12.919,2.0,195706.0,5.404,17.151,13.984,7.049,12.919,2.777,10.662,4.993,2.165,3.962,5.215,4.937,7.988,0.563,0.201,0.029,0.001
3,Belinda:A_Novel,1883,Rhoda_Broughton,147937.0,57386.0,38.790837,10248.0,14.100898,0.092824,0.534271,0.38454,"[['PROPN', 5341], ['ADP', 14886], ['DET', 1801...","[('eyes', 227), ('voice', 212), ('one', 175), ...","[('says', 577), ('will', 562), ('would', 431),...",11552.0,23275.0,18018.0,10336.0,20488.0,4025.0,14886.0,9313.0,4134.0,4949.0,10799.0,5341.0,15702.0,908.0,444.0,125.0,15.085,13.278,0.0,154295.0,7.487,15.085,11.678,6.699,13.278,2.609,9.648,6.036,2.679,3.207,6.999,3.462,10.177,0.588,0.288,0.081,0.0
4,Birds_of_Prey,1867,Mary_Elizabeth_Braddon,161547.0,64208.0,39.745709,8357.0,19.335168,0.083881,0.526442,0.37616,"[['PROPN', 8773], ['DET', 23096], ['ADP', 1857...","[('man', 426), ('time', 269), ('life', 233), (...","[('would', 490), ('said', 422), ('could', 327)...",9019.0,29719.0,23096.0,12741.0,21333.0,4212.0,18573.0,9380.0,4281.0,6808.0,8960.0,8773.0,13952.0,1085.0,427.0,46.0,17.238,12.374,0.0,172405.0,5.231,17.238,13.396,7.39,12.374,2.443,10.773,5.441,2.483,3.949,5.197,5.089,8.093,0.629,0.248,0.027,0.0


In [39]:
# total of the sentences_count column across all rows of the Newspaper Novel metadata
df_nnovels_meta['sentences_count'].sum()

744238.0

In [None]:
# ## BEGINNING OF TXTLAB ANALYSIS:
# Complete Initial Pipeline on txtLab (needs to be run once):
# print("Starting pipeline...")
# print("\nImporting txtLab corpus from "+ str(path_to_dirty_txtlab_corpus)+"...")
# df_txtlab = pd.read_csv(path_to_dirty_txtlab_corpus)
# df_txtlab.drop(df_txtlab.columns[0], axis=1, inplace=True)

# print('\nCleaning up corpus and grabbing basic statistics...')
# df_txtlab = clean_up_corpus_and_grab_basic_stats(df_txtlab, dirty_text_column='DIRTY_TEXT')

# print('\nRunning TTR analysis (this can take some time)')
# df_txtlab = run_ttr_analysis_on_df(df_txtlab)

# print('\nRunning POS analysis (this can take some time)')
# df_txtlab = get_POS_tags_for_text_in_df(df_txtlab, text_row_to_analyze='words_as_string_for_vectorizor')

# print('\nSaving df')
# output_metadata(df_txtlab, spreadsheet_name='df_txtlab_meta')
# output_full(df_txtlab, spreadsheet_name='df_txtlab_full')
# print('\nPrinting df')
# df_txtlab.head()



In [None]:
### If initial pipeline has already run:
### Import the dfs from spreadsheets folder 

# load the full txtLab spreadsheet; the first column of the file is dropped on load
df_txtlab_full = open_df_and_print(file_name = 'df_txtlab_full.csv', path_to_spreadsheets = path_to_spreadsheets, drop_first_column=True)
df_txtlab_full.head()

In [None]:
# df_txtlab_full.rename(columns = {'author':'author_name', 'title':'book_title'}, inplace = True)

In [None]:
## SAVE BOTH:

In [None]:
# print('\nSaving NNOVELS df')
# NOTE(review): df_nnovels_full is only assigned in a commented-out cell earlier in the notebook
# and in the "Sanity NNOVELS" cell further down, so on a fresh kernel run top to bottom this cell
# raises NameError - load df_nnovels_full before running it.
# writes the metadata-only spreadsheet, then the full spreadsheet, for the Newspaper Novel corpus
output_metadata(df_nnovels_full, spreadsheet_name = 'df_nnovels_meta')
output_full(df_nnovels_full, spreadsheet_name = 'df_nnovels_full')

In [None]:
# print('\nSaving TXTLAB df')
# writes the metadata-only spreadsheet, then the full spreadsheet, for the txtLab corpus
output_metadata(df_txtlab_full, spreadsheet_name = 'df_txtlab_meta')
output_full(df_txtlab_full, spreadsheet_name = 'df_txtlab_full')

In [None]:
# Sanity NNOVELS:
# reload the full Newspaper Novel spreadsheet from disk and show its first rows
df_nnovels_full = open_df_and_print(file_name = 'df_nnovels_full.csv', path_to_spreadsheets = path_to_spreadsheets, drop_first_column=True)
df_nnovels_full.head()

In [None]:
# Sanity TXTLAB:
# reload the full txtLab spreadsheet from disk and show its first rows; the file name matches
# the one written by the txtLab save cell ('df_txtlab_full'), so this checks the file just saved
df_txtlab = open_df_and_print(file_name = 'df_txtlab_full.csv', path_to_spreadsheets = path_to_spreadsheets, drop_first_column=True)
df_txtlab.head()