In [None]:
### This is the main jupyter notebook for The Ciphers of <i>the Times</i>' computational analysis of our curated corpus of 'Newspaper Novels'
### We begin by importing the corpus from .txt files into a dataframe, and proceed to clean, extract metadata, and save the new
### spreadsheets. 
###
###
###

In [None]:
# Useful links:
# https://towardsdatascience.com/30-very-useful-pandas-functions-for-everyday-data-analysis-tasks-f1eae16409af
# 

In [None]:
### DEFINE PATHS:
### Paths are defined for the various spreadsheets and corpora used
### Stopwords were initially taken from: https://towardsdatascience.com/getting-started-with-text-analysis-in-python-ca13590eb4f7
### and then tailored to our pipeline

## to Newspaper Novel corpus:
path_to_nnovels_corpus = '../../data/corpora/corpus_newspaper_novels/'

## to assets:
path_to_assets = '../../assets/'

## to all spreadsheets:
path_to_spreadsheets = '../../data/spreadsheets/'

## to .txtLab corpus consisting of 150 English-language novels:
# path_to_dirty_txtlab_corpus = '../../data/spreadsheets/dirtyengnovels-211215.csv' ## This no longer exists, left for pipeline viewing. See df_txtlab_meta.csv for results. 
# Full DF is too large to upload to GitHub.


### FILES:

## nnovels corpus metadata:
nnovels_corpus_metadata = '../../data/spreadsheets/nnovels_corpus_metadata.csv'

## stop words (file name, resolved against path_to_assets below):
stopwords = 'stopwords.txt'

## characters and numbers to exclude from texts (file name, resolved against path_to_assets below):
exclude = 'characters_and_numbers_to_exclude.txt'

# opening stopwords and characters to exclude from assets
# The built-in open() is used so that this cell does not depend on the import cell further down.
# NOTE: `stopwords` is rebound here from the file name to the list of whitespace-separated stop words.
with open(path_to_assets + stopwords, 'r', encoding='utf-8', errors="ignore") as stopwords_raw:
    stopwords = stopwords_raw.read().split()
# The file name lives in `exclude`; `characters_to_exclude` only exists once this block has run.
with open(path_to_assets + exclude, 'r', encoding='utf-8', errors="ignore") as characters_to_exclude_raw:
    characters_to_exclude = characters_to_exclude_raw.read().split()



In [None]:
### Imports, utility, and important functions begin here:

## basic libraries:
import os
import codecs
import re
import string
import sys
import numpy as np
import spacy
import pandas as pd
import csv
from collections import Counter

# lexical diversity library:
from lexicalrichness import LexicalRichness

# to see more columns when dataframes are printed out:
pd.set_option('display.max_columns', 100)


In [None]:
### UTILITY CODE:

## This section allows us to open large dataframes by redefining the max size of the csv fields.
## Solution taken from: https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072

# Start from the largest native integer and shrink it until the csv module accepts it.
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # the value was rejected: retry with a limit ten times smaller
        maxInt = int(maxInt/10)
        continue
    # the limit was accepted, nothing more to do
    break
        

In [None]:
### FUNCTIONS 1/6
### Various functions for the project begin here 

## Progress bar to view the progress of lengthy processes
# As suggested by Rom Ruben (see: http://stackoverflow.com/questions/3173320/text-progress-bar-in-the-console/27871113#comment50529068_27871113)
def progress(count, total, status=''):
    """Write a one-line text progress bar to stdout.

    The line ends with a carriage return, so successive calls overwrite
    each other in the output.

    Parameters:
        count: number of items processed so far.
        total: number of items to process; a falsy value (e.g. 0) is
            rendered as a full bar instead of raising ZeroDivisionError.
        status: optional text appended after the percentage.
    """
    bar_len = 60
    if not total:
        # nothing to process: show the bar as complete
        filled_len = bar_len
        percents = 100.0
    else:
        filled_len = int(round(bar_len * count / float(total)))
        # factor is 100.0 so that count == total reads exactly 100.0%
        percents = round(100.0 * count / float(total), 1)
    bar = '#' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush()
    

## Used to incorporate a metadata spreadsheet and gather an already ordered corpus within a file:
def import_corpus_and_meta(path_to_corpus=path_to_nnovels_corpus, path_to_meta_data=nnovels_corpus_metadata):
    """Load the metadata spreadsheet and attach each corpus text as 'dirty_text'.

    Parameters:
        path_to_corpus: directory holding one text file per work.
        path_to_meta_data: CSV file with one metadata row per work.

    Returns:
        The metadata dataframe (first column dropped) with a 'dirty_text'
        column filled from the files in ``path_to_corpus``.

    Texts are matched to metadata rows purely by position: the n-th file
    name (sorted) is written to index label n.
    NOTE(review): this assumes the metadata rows follow the sorted file-name
    order and that the frame has a default 0..n-1 index -- confirm against
    the corpus.
    """
    # read in metadata
    df = pd.read_csv(path_to_meta_data, engine='python')
    # drop faulty index
    df.drop(df.columns[0], axis=1, inplace=True)
    # grab all texts from corpus and strip of project gutenberg end text
    #TODO: THIS PROCESS NEEDS TO BE COMPLETED MANUALLY ON CORPUS AND DELETED
    split_on = ["END OF THE PROJECT GUTENBERG","End of the Project Gutenberg EBook","End of Project Gutenberg","End of The Project Gutenberg"] 
    # os.listdir returns names in arbitrary order; sort once so the
    # position-based assignment below is deterministic across machines
    textnames = sorted(os.listdir(path_to_corpus))
    total_texts = len(textnames)
    # text_index ensures sequentiality between files and metadata rows
    for text_index, textname in enumerate(textnames):
        # os.path.join works whether or not path_to_corpus ends with a separator
        with codecs.open(os.path.join(path_to_corpus, textname), 'r', encoding='utf-8', errors="ignore") as raw_text:
            dirty_text = raw_text.read()
        # keep only what precedes the first end-of-book marker found
        for marker in split_on:
            dirty_text = dirty_text.split(marker)[0]
        df.at[text_index, 'dirty_text'] = dirty_text
        progress(text_index + 1, total_texts)
    return df


In [None]:
### FUNCTIONS 2/6

## Cleans a text string and returns the cleaned text (string), the words with stopwords kept (list),
## the words with stopwords removed (list), and the text as sentences (list)
def clean_text(text, stopwords=stopwords, characters_to_exclude=characters_to_exclude):
    """Normalise a raw text and tokenise it into words and sentences.

    Parameters:
        text: the raw text (string).
        stopwords: collection of words to drop from the stopped word list.
        characters_to_exclude: collection of single characters removed from
            the text before tokenising.

    Returns:
        A 4-tuple ``(text, text_split_stopless, text_split_stopped,
        text_split_sentences)``:
        the lowercased, filtered, whitespace-collapsed text (string);
        its words with sentence punctuation stripped (list);
        the same words without stopwords (list);
        the text split on ``. : ? !`` (list of strings).
    """
    # sets make the per-character / per-word membership checks constant time
    excluded_characters = set(characters_to_exclude)
    stopword_set = set(stopwords)
    # lowercasing the text
    text = text.lower()
    # removing every character listed in characters_to_exclude
    text = ''.join(char for char in text if char not in excluded_characters)
    # replacing all whitespace runs (including newlines) with a single space
    text = re.sub(r'\s+', ' ', text)
    # splitting into words; `text` itself must stay a string because it is
    # returned and is also the input of the sentence split below
    words = [word.strip(".?:!") for word in text.split()]
    # dropping tokens that consisted only of end-of-sentence punctuation
    text_split_stopless = [word for word in words if word]
    # getting list of sentences, ignoring empty / whitespace-only pieces
    text_split_sentences = [sentence for sentence in re.split(r"[.:?!]", text) if sentence.strip()]
    # getting rid of all stopwords:
    text_split_stopped = [word for word in text_split_stopless if word not in stopword_set]

    return text, text_split_stopless, text_split_stopped, text_split_sentences


## Takes a dataframe and a dirty_text column, cleans every text with clean_text, and inputs the clean text and basic stats into the df
def clean_up_corpus_and_grab_basic_stats(df, characters_to_exclude, stopwords, dirty_text_column='dirty_text'):
    """Clean every text of ``df`` and store the results and basic counts.

    Parameters:
        df: dataframe holding the raw texts.
        characters_to_exclude, stopwords: accepted for interface
            compatibility. NOTE(review): they are not forwarded to
            clean_text, which therefore runs with its own defaults; callers
            in this notebook pass file paths here while clean_text expects
            collections -- decide which is intended before wiring them in.
        dirty_text_column: name of the column holding the raw text.

    Returns:
        The same dataframe with these columns filled per row:
        'words_as_string_for_vectorizor', 'words_count_stopless',
        'words_count_stopped', 'words_standardized_stopped',
        'percentage_stopped_of_stoppless', 'sentences_standardized_stopless',
        'sentences_count', 'average_words_per_sentence'.
    """
    print("Cleaning text, assigning them to columns, and grabbing basic stats...")
    # creating columns for collected words:
    df['words_standardized_stopped'] = ''
    df['sentences_count'] = 0
    df['average_words_per_sentence'] = 0.0
    # list-valued cells need an existing object column; df.at cannot create
    # the column and store a list in one step
    df['sentences_standardized_stopless'] = ''

    total_rows = len(df.index)
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        text, text_split_stopless, text_split_stopped, text_split_sentences = clean_text(row[dirty_text_column])

        # getting basic stats for tokenized texts (words):
        words_count_stopless = len(text_split_stopless)
        words_count_stopped = len(text_split_stopped)
        # an empty text yields 0.0 instead of a ZeroDivisionError
        if words_count_stopless:
            percentage_stopped_of_stoppless = (words_count_stopped / words_count_stopless) * 100
        else:
            percentage_stopped_of_stoppless = 0.0

        # getting basic stats for tokenized texts (sentences):
        sentences_count = len(text_split_sentences)
        total = sum(len(sentence.split()) for sentence in text_split_sentences)
        average_words_per_sentence = total / sentences_count if sentences_count else 0.0

        # inputting data into df
        df.at[index, 'words_as_string_for_vectorizor'] = text
        df.at[index, 'words_count_stopless'] = words_count_stopless
        df.at[index, 'words_count_stopped'] = words_count_stopped
        df.at[index, 'words_standardized_stopped'] = text_split_stopped
        df.at[index, 'percentage_stopped_of_stoppless'] = percentage_stopped_of_stoppless
        df.at[index, 'sentences_standardized_stopless'] = text_split_sentences
        df.at[index, 'sentences_count'] = sentences_count
        df.at[index, 'average_words_per_sentence'] = average_words_per_sentence

        # show progress bar (counted by position so the last row reads 100%)
        progress(position, total_rows)

    return df

In [None]:
### FUNCTIONS 3/6

## Takes a dataframe, a text column (string), and several parameters, and returns the ttr and mattr (with multiple configurations) for each row:
def run_ttr_analysis_on_df(df, text_column='words_as_string_for_vectorizor', full_text_ttr=True, moving_average_ttr=True, window_sizes=[500, 2000]):
    """Add type-token-ratio columns to ``df`` for every row's text.

    Parameters:
        df: dataframe holding the texts.
        text_column: name of the column with the text (string) to analyse.
        full_text_ttr: when true, store the whole-text TTR in 'full_text_ttr'.
        moving_average_ttr: when true, store one 'mattr_<window>' column per
            entry of ``window_sizes``.
        window_sizes: iterable of MATTR window sizes (in words); ``None``
            entries are skipped. The default list is never mutated here.

    Returns:
        The same dataframe with the requested columns filled. A MATTR cell
        is left unset when the text has fewer words than the window.
    """
    total_rows = len(df.index)
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        # grabs text column from df
        text = row[text_column]
        # gets lex from lexical richness library
        lex = LexicalRichness(text)
        # these switches are here in case someone wants to run just ttr/mattr (since this can take some time)
        if full_text_ttr:
            df.loc[index, 'full_text_ttr'] = lex.ttr
        if moving_average_ttr:
            for window_size in window_sizes:
                # the window is measured in words, so it is compared with the
                # word count (lex.words) rather than the character length
                if window_size is not None and lex.words >= window_size:
                    df.loc[index, f'mattr_{str(window_size)}'] = lex.mattr(window_size=window_size)

        # show progress bar (counted by position so the last row reads 100%)
        progress(position, total_rows)
    return df

In [None]:
### FUNCTIONS 4/6

## Takes in a dataframe, a list of columns to visualize, and a date_column, then plots each column's yearly mean over time using matplotlib.
## Created as a slightly simpler way to check and compare visualizations between different columns and/or dataframes

#TODO: make multiple dfs visualizable:
def visualize_numerical_columns__over_time(df, list_of_columns_to_visualize, date_column='book_year', graph_y_label='What are we counting?', title='SOMETHING over Time'):
    """Plot the per-year mean of one or more numerical columns as lines.

    Parameters:
        df: dataframe holding the data.
        list_of_columns_to_visualize: column names to plot; each is converted
            with pd.to_numeric and averaged per value of ``date_column``.
        date_column: column used for grouping and as the x axis.
        graph_y_label: label of the y axis.
        title: title of the figure.

    Returns:
        None; the figure is shown with plt.show().
    """
    # importing libraries
    import matplotlib.pyplot as plt
    import random
    # setting up lists to capture the years, counts, labels, and colours to visualize
    years_list = []
    counts_list = []
    labels_list = []
    colors_list = []
    for column_name in list_of_columns_to_visualize:
        # grouping the column by years and getting arrays for graph
        grouped_by_year = pd.to_numeric(df[column_name]).groupby(df[date_column])
        grouped_by_year = grouped_by_year.mean().reset_index()
        years = np.array(grouped_by_year[date_column].tolist())
        count_to_visualize = np.array(grouped_by_year[column_name].tolist())
        # saving the info for each column to visualize in a list
        years_list.append(years)
        counts_list.append(count_to_visualize)
        labels_list.append(column_name)
        # getting random colors to differentiate between visualized data
        r = random.random()
        b = random.random()
        g = random.random()
        colors_list.append((r, g, b))

    # plotting SOMETHING over time:
    plt.figure(figsize=(20,10))
    for years, counts, label, color in zip(years_list, counts_list, labels_list, colors_list):
        plt.plot(years, counts, label=label, c=color)
    plt.legend(loc='upper left')
    plt.title(title)
    # label the axes so the figure can be read on its own
    plt.xlabel(date_column)
    plt.ylabel(graph_y_label)
    plt.show()



In [None]:
### FUNCTIONS 5/6
## POS tagging:
##
nlp = spacy.load("en_core_web_sm")

def get_POS_tags_for_text_in_df(df, text_row_to_analyze='words_as_string_for_vectorizor', 
                                count_nouns=True, how_many_nouns=20, count_verbs_and_aux=True, how_many_verbs=20):
    """Tag every text of ``df`` with spaCy and store part-of-speech statistics.

    Parameters:
        df: dataframe holding the texts.
        text_row_to_analyze: name of the column with the text (string).
        count_nouns: when true, store the most common NOUN tokens.
        how_many_nouns: how many nouns to keep (also part of the column name).
        count_verbs_and_aux: when true, store the most common VERB/AUX tokens.
        how_many_verbs: how many verbs to keep (also part of the column name).

    Returns:
        The same dataframe with, per row:
        'all_pos_counts' (list of [tag, count] pairs),
        one 'POS_<tag>_percentage' column per tag found -- NOTE: despite the
        name, the value stored is the raw count of that tag,
        'noun_percentage' and 'verb_and_aux_percentage' (share of all tagged
        tokens, rounded to 3 decimals; 0.0 for an empty document),
        and the '<n>_most_common_nouns' / '<n>_most_common_verbs' columns
        when the matching flag is set.
    """
    verb_like_tags = ('VERB', 'AUX')
    nouns_column = str(how_many_nouns)+'_most_common_nouns'
    verbs_column = str(how_many_verbs)+'_most_common_verbs'

    # list-valued cells need pre-existing object columns
    df['all_pos_counts'] = ''
    if count_nouns:
        df[nouns_column] = ''
    if count_verbs_and_aux:
        df[verbs_column] = ''

    total_rows = len(df.index)
    for position, (index, row) in enumerate(df.iterrows(), start=1):
        text = row[text_row_to_analyze]
        # this is a memory buffer, to extend max length of available ram according to the text being analyzed
        # https://datascience.stackexchange.com/questions/38745/increasing-spacy-max-nlp-limit
        nlp.max_length = len(text) + 100
        # disable modules not in use to save memory
        analyzed_doc = nlp(text, disable = ['ner'])

        # grabbing all pos counts in the text (keyed by attribute hash) and
        # rendering the hashes in human readable format:
        pos_counts_in_text = analyzed_doc.count_by(spacy.attrs.IDS['POS'])
        human_readable_pos_count_list = [
            [analyzed_doc.vocab[pos].text, count]
            for pos, count in pos_counts_in_text.items()
        ]

        # one column per tag; the stored value is the raw count
        for tag, count in human_readable_pos_count_list:
            df.at[index, 'POS_' + str(tag)+'_percentage'] = count

        # grabbing all nouns and verbs/auxiliaries to find the most common of each:
        nouns = []
        verbs_and_aux = []
        for token in analyzed_doc:
            if count_nouns and token.pos_ == 'NOUN':
                nouns.append(token.text)
            # membership test: `== ('VERB' or 'AUX')` would only ever match 'VERB'
            if count_verbs_and_aux and token.pos_ in verb_like_tags:
                verbs_and_aux.append(token.text)

        # totals per category; VERB and AUX counts are summed together
        total_counts = 0
        noun_counts = 0
        verb_and_aux_counts = 0
        for tag, count in human_readable_pos_count_list:
            total_counts += count
            if tag == 'NOUN':
                noun_counts += count
            elif tag in verb_like_tags:
                verb_and_aux_counts += count

        # an empty document has no tags: report 0.0 instead of dividing by zero
        if total_counts:
            percentage_nouns_of_pos_counts = round(noun_counts / total_counts * float(100), 3)
            percentage_verbs_and_aux_of_pos_counts = round(verb_and_aux_counts / total_counts * float(100), 3)
        else:
            percentage_nouns_of_pos_counts = 0.0
            percentage_verbs_and_aux_of_pos_counts = 0.0

        # setting each in their proper column
        df.at[index, 'noun_percentage'] = percentage_nouns_of_pos_counts
        df.at[index, 'verb_and_aux_percentage'] = percentage_verbs_and_aux_of_pos_counts
        df.at[index, 'all_pos_counts'] = human_readable_pos_count_list
        # only write the "most common" columns that were created above
        if count_nouns:
            df.at[index, nouns_column] = Counter(nouns).most_common(how_many_nouns)
        if count_verbs_and_aux:
            df.at[index, verbs_column] = Counter(verbs_and_aux).most_common(how_many_verbs)
        progress(position, total_rows)
    return df


def separate_pos_tags_into_usable_data(list_or_string_of_POS_counts):
    """Turn stored POS counts back into a list of ``[tag, count]`` pairs.

    Parameters:
        list_or_string_of_POS_counts: either a list (returned unchanged) or
            the string form of such a list, e.g.
            ``"[['NOUN', 10], ['VERB', 5]]"``.

    Returns:
        A list of ``[tag, count]`` pairs; counts parsed from a string are
        floats. An empty list string (``"[]"``) gives ``[]``. Any other
        input type prints a message and gives ``[]``.
    """
    if isinstance(list_or_string_of_POS_counts, list):
        return list_or_string_of_POS_counts
    if not isinstance(list_or_string_of_POS_counts, str):
        print("Function input must be string or list.")
        return []

    # drop every bracket / quote character, leaving "TAG, count, TAG, count, ..."
    wrapper_characters = "[]()'\""
    flattened = ''.join(char for char in list_or_string_of_POS_counts if char not in wrapper_characters)
    # empty fields are skipped so that "[]" yields no pairs instead of an IndexError
    fields = [field.strip() for field in flattened.split(",") if field.strip()]
    # fields alternate tag, count; a dangling tag without a count is ignored
    return [
        [fields[i], float(fields[i + 1])]
        for i in range(0, len(fields) - 1, 2)
    ]

In [None]:
# FUNCTIONS 6:
# Topic Modeling
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

# Import tools:
# pprint
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Functions:
def texts_to_words(texts):
    """Lazily tokenise each text of ``texts`` with gensim's simple_preprocess.

    Parameters:
        texts: a sized collection of texts (``len(texts)`` is used for the
            progress bar); each item is converted with ``str`` first.

    Yields:
        One list of tokens per text, in order.
    """
    total_texts = len(texts)
    for position, text in enumerate(texts, start=1):
        # tokenise the current text only (not the whole collection);
        # deacc=True strips accent marks from the tokens
        yield(gensim.utils.simple_preprocess(str(text), deacc=True))
        progress(position, total_texts)

def remove_stopwords(texts):
    """Tokenise every document with simple_preprocess and drop the stopwords.

    Uses the notebook-level ``stopwords`` collection. Returns one list of
    remaining tokens per document of ``texts``.
    """
    filtered_docs = []
    for doc in texts:
        kept_words = []
        for word in simple_preprocess(str(doc)):
            if word not in stopwords:
                kept_words.append(word)
        filtered_docs.append(kept_words)
    return filtered_docs

def make_bigrams(texts):
    """Index the notebook-level ``bigram_mod`` with every document of ``texts``.

    NOTE(review): ``bigram_mod`` is not defined in the part of the notebook
    visible here; it must exist before this function is called.
    """
    bigrammed_docs = []
    for doc in texts:
        bigrammed_docs.append(bigram_mod[doc])
    return bigrammed_docs

def make_trigrams(texts):
    """Index ``trigram_mod`` with the ``bigram_mod`` result of every document.

    NOTE(review): ``bigram_mod`` and ``trigram_mod`` are not defined in the
    part of the notebook visible here; they must exist before this function
    is called.
    """
    trigrammed_docs = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        trigrammed_docs.append(trigram_mod[with_bigrams])
    return trigrammed_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only the allowed POS tags.

    Each document (a sequence of tokens) is joined with spaces, parsed with
    the notebook-level ``nlp`` pipeline, and reduced to the lemmas of the
    tokens whose ``pos_`` is in ``allowed_postags``.
    See https://spacy.io/api/annotation
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

def topic_model_the_df(df, text_column='words_as_string_for_vectorizor'):
    """Placeholder for topic modeling: currently returns ``df`` untouched.

    ``text_column`` is accepted but not used yet.
    """
    return df


In [None]:
# for metadata output:
def output_metadata(df, spreadsheet_name = 'BLA', path_to_spreadsheets = path_to_spreadsheets):
    """Save ``df`` without its heavy text columns as a CSV file.

    The file is written to ``path_to_spreadsheets + spreadsheet_name + '.csv'``.
    A column is left out when its lowercased name is one of the text columns
    listed below.
    """
    text_columns_to_skip = ['words_standardized_stopped', 'sentences_standardized_stopless','words_as_string_for_vectorizor', 'dirty_text']
    columns_to_keep = []
    for column_name in df.columns.values.tolist():
        if column_name.lower() in text_columns_to_skip:
            continue
        columns_to_keep.append(column_name)
    target_file = path_to_spreadsheets + spreadsheet_name + '.csv'
    df[columns_to_keep].to_csv(target_file)
    print(spreadsheet_name + ' was saved in '+str(path_to_spreadsheets))
    
# for full output:
def output_full(df, spreadsheet_name = 'BLA', path_to_spreadsheets = path_to_spreadsheets):
    """Save the complete ``df`` to ``path_to_spreadsheets + spreadsheet_name + '.csv'``."""
    target_file = path_to_spreadsheets + spreadsheet_name + '.csv'
    df.to_csv(target_file)
    print(spreadsheet_name + ' was saved in '+str(path_to_spreadsheets))
    
def open_df_and_print(file_name = 'df_full.csv', path_to_spreadsheets = path_to_spreadsheets, drop_first_column=False):
    """Read ``path_to_spreadsheets + file_name`` into a dataframe and return it.

    With ``drop_first_column=True`` the first column is removed from the
    result. Despite the name, nothing is printed.
    """
    loaded = pd.read_csv(path_to_spreadsheets + file_name, engine='python')
    if not (drop_first_column == True):
        return loaded
    return loaded.drop(loaded.columns[0], axis=1)
    


In [None]:
# RUN UP TO HERE

In [None]:
# ## BEGINNING OF NNOVEL ANALYSIS:
# ## IF THIS NEEDS TO BE RUN, UNCOMMENT EVERYTHING AND RUN (make sure that the paths at the top of the notebook correspond with your paths)


# # Complete Initial Pipeline (needs to be run once):
# print("Starting pipeline...")
# print("\nImporting corpus from "+ str(path_to_nnovels_corpus)+ " and metadata from "+ str(path_to_meta_data)+"...")
# df = import_corpus_and_meta(path_to_nnovels_corpus, path_to_meta_data)
# print('\nCleaning up corpus and grabbing basic statistics...')
# df = clean_up_corpus_and_grab_basic_stats(df, path_to_assets + exclude, path_to_assets + stopwords)
# df.drop('dirty_text', axis=1, inplace=True)
# print('\nRunning TTR analysis (this can take some time)')
# df = run_ttr_analysis_on_df(df)
# print('\nRunning POS analysis (this can take some time)')
# df = get_POS_tags_for_text_in_df(df, text_row_to_analyze='words_as_string_for_vectorizor', 
#                                 count_nouns=True, how_many_nouns=50, count_verbs_and_aux=True, how_many_verbs=50)
# print('\nSaving df')
# output_metadata(df, spreadsheet_name = 'df_meta')
# output_full(df, spreadsheet_name = 'df_full')
# print('\nPrinting df')
# df.head()

In [None]:
# # Getting POS% for each POS tag in texts for NNOVELS:
# df_nnovels_full = df_nnovels_full.fillna(0)
# for index, row in df_nnovels_full.iterrows():
#     total = 0.0
#     for name in df_nnovels_full.columns.values.tolist():
#         if "POS_" in name:
#             if not name.startswith("%"):
#                 # new_name = name + "_percentage"
#                 # print(row[name])
#                 # print(type(row[name]))
#                 total += row[name]
#                 # percentage = (int(row[name]) / total) * float(100)
#                 # df_txtlab_full.at[index, new_name] = percentage
#                 # columns.append(new_name)
#                 # df_txtlab_full[new_name] = 0
#     df_nnovels_full.at[index, "parts_of_speech_total_count"] = int(total)
#     for name in df_nnovels_full.columns.values.tolist():
#         if "POS_" in name:
#             if not name.startswith("%"):
#                 new_name = "%" + name
#                 percentage = round((row[name] / total) * float(100), 3)
#                 if index == 0:
#                     df_nnovels_full[new_name] = 0.0
#                 df_nnovels_full.at[index, new_name] = percentage


In [None]:
# ttrs_and_mattrs = ['full_text_ttr', 'mattr_500', 'mattr_2000']
# visualize_numerical_columns__over_time(df, ttrs_and_mattrs, date_column='book_year', graph_y_label='TTRs', title='Lexical Diversity in Newspaper Novels over time')

# basic_stats_counts = ['words_count_stopless','words_count_stopped','sentences_count','average_words_per_sentence']
# visualize_numerical_columns__over_time(df, basic_stats_counts, date_column='book_year', graph_y_label='Counts in Newspaper Novels over time', title='Counts in Newspaper Novels over time')

In [None]:
# If the initial pipeline has already run, reload the exported NNOVELS
# dataframe instead of re-running the whole analysis:
df_nnovels_full = open_df_and_print(
    file_name='df_nnovels_full.csv',
    path_to_spreadsheets=path_to_spreadsheets,
    drop_first_column=True,
)
df_nnovels_full.head()

In [None]:
# ## BEGINNING OF TXTLAB ANALYSIS:
# Complete Initial Pipeline on txtLab (needs to be run once):
# print("Starting pipeline...")
# print("\nImporting txtLab corpus from "+ str(path_to_txtlab)+"...")
# df_txtlab = pd.read_csv(path_to_txtlab)
# df_txtlab.drop(df_txtlab.columns[0], axis=1, inplace=True)

# print('\nCleaning up corpus and grabbing basic statistics...')
# df_txtlab = clean_up_corpus_and_grab_basic_stats(df_txtlab, path_to_assets + exclude, path_to_assets + stopwords, dirty_text_column='DIRTY_TEXT')

# print('\nRunning TTR analysis (this can take some time)')
# df_txtlab = run_ttr_analysis_on_df(df_txtlab)

# print('\nRunning POS analysis (this can take some time)')
# df_txtlab = get_POS_tags_for_text_in_df(df_txtlab, text_row_to_analyze='words_as_string_for_vectorizor', 
#                                 count_nouns=True, how_many_nouns=50, count_verbs_and_aux=True, how_many_verbs=50)

# print('\nSaving df')
# output_metadata(df_txtlab, spreadsheet_name = 'df_textlab_meta')
# output_full(df_txtlab, spreadsheet_name = 'df_textlab_full')
# print('\nPrinting df')
# df.head()



In [None]:
# ## Getting POS% for each POS tag in texts for TXTLAB:
# ## Second for TXTLAB:
# for index, row in df_txtlab_full.iterrows():
#     total = 0.0
#     for name in df_txtlab_full.columns.values.tolist():
#         if "POS_" in name:
#             if not name.startswith("%"):
#                 # new_name = name + "_percentage"
#                 # print(row[name])
#                 # print(type(row[name]))
#                 total += row[name]
#                 # percentage = (int(row[name]) / total) * float(100)
#                 # df_txtlab_full.at[index, new_name] = percentage
#                 # columns.append(new_name)
#                 # df_txtlab_full[new_name] = 0
#     df_txtlab_full.at[index, "parts_of_speech_total_count"] = int(total)
#     for name in df_txtlab_full.columns.values.tolist():
#         if "POS_" in name:
#             if not name.startswith("%"):
#                 new_name = "%" + name
#                 percentage = round((row[name] / total) * float(100), 3)
#                 if index == 0:
#                     df_txtlab_full[new_name] = 0.0
#                 df_txtlab_full.at[index, new_name] = percentage



In [None]:
# print('\nSaving df')
# output_metadata(df_txtlab, spreadsheet_name = 'df_txtlab_meta')
# output_full(df_txtlab, spreadsheet_name = 'df_txtlab_full')

In [None]:
# If the initial pipeline has already run, reload the exported TXTLAB
# dataframe instead of re-running the whole analysis:
df_txtlab_full = open_df_and_print(
    file_name='df_txtlab_full.csv',
    path_to_spreadsheets=path_to_spreadsheets,
    drop_first_column=True,
)
df_txtlab_full.head()

In [None]:
# df_txtlab_full.rename(columns = {'author':'author_name', 'title':'book_title'}, inplace = True)

In [None]:
## SAVE BOTH:

In [None]:
# Saving the NNOVELS df: first the metadata-only sheet, then the full sheet
output_metadata(df_nnovels_full, spreadsheet_name='df_nnovels_meta')
output_full(df_nnovels_full, spreadsheet_name='df_nnovels_full')

In [None]:
# Saving the TXTLAB df: first the metadata-only sheet, then the full sheet
output_metadata(df_txtlab_full, spreadsheet_name='df_txtlab_meta')
output_full(df_txtlab_full, spreadsheet_name='df_txtlab_full')

In [None]:
# Sanity NNOVELS: re-open the spreadsheet written above and look at its first rows
df_nnovels_full = open_df_and_print(
    file_name='df_nnovels_full.csv',
    path_to_spreadsheets=path_to_spreadsheets,
    drop_first_column=True,
)
df_nnovels_full.head()

In [None]:
# Sanity TXTLAB: re-open the spreadsheet written by the TXTLAB save cell.
# That cell writes 'df_txtlab_full.csv'; 'df_textlab_full.csv' is only
# produced by the commented-out initial pipeline, so reading it here would
# not check the file that was just saved.
df_txtlab = open_df_and_print(file_name = 'df_txtlab_full.csv', path_to_spreadsheets = path_to_spreadsheets, drop_first_column=True)
df_txtlab.head()