In [1]:
chunk_size = 100
novel_to_analyze = 'Armadale.txt'

In [2]:
### Installs:
# conda install -c plotly plotly=5.7.0

In [3]:
import codecs
import os
import pandas as pd
# to be able to see more columns when dataframes are printed out:
# pd.set_option('display.max_columns', 100)
pd.set_option('display.max_columns', 1000,'display.max_colwidth', None, 'display.width', 1000, 'display.max_rows',1000)
# to not get copy warnings when splitting dataframes
pd.options.mode.chained_assignment = None
import re
import spacy
import sys
import numpy as np
import matplotlib.pyplot as plt
import random
from nltk.tokenize import sent_tokenize

# Vis:
# import matplotlib
# import cufflinks as cf
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
# import plotly.offline as py
# import plotly.graph_objs as go

# cf.go_offline() # required to use plotly offline (no account required).
# py.init_notebook_mode() # graphs charts inline (IPython).

In [4]:
## to Newpaper Novel corpus:
path_to_nnovels_corpus = '../../data/corpora/corpus_newspaper_novels/' 

## to assets:
path_to_assets = '../../assets/'
## characters and numbers to exclude from texts:
exclude_file = 'characters_and_numbers_to_exclude.txt'

# characters to exclude
with codecs.open(path_to_assets + exclude_file, 'r', encoding='utf-8', errors="ignore") as characters_to_exclude_raw:
    characters_to_exclude = characters_to_exclude_raw.read()
    characters_to_exclude = characters_to_exclude.split()

In [5]:
with codecs.open(path_to_nnovels_corpus + novel_to_analyze, 'r', encoding='utf-8', errors="ignore") as raw_text:
    dirty_text = raw_text.read()

prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
dashes = "(-)+"

dirty_text = re.sub(prefixes,"\\1<prd>", dirty_text)
dirty_text = re.sub(dashes, ' \\1 ', dirty_text)
dirty_text = re.sub('(“|”)', '"', dirty_text)
clean_text_list = sent_tokenize(dirty_text)

clean_sentences = []
for sent in clean_text_list:
    # res.append(re.sub('\n', '', sent))
    # removing newline notations
    clean_sent = re.sub('\n', ' ', sent)
    clean_sent = re.sub('\r', ' ', clean_sent)
    # transforming multiple spaces to one space
    clean_sent = re.sub('\s+',' ', clean_sent)
    clean_sentences.append(clean_sent)

In [6]:
text_dict = {"sentences":clean_sentences}
df = pd.DataFrame.from_dict(text_dict)
df.head(10)

Unnamed: 0,sentences
0,﻿THE TRAVELERS.
1,"It was the opening of the season of eighteen hundred and thirty - two, at the Baths of Wildbad."
2,"The evening shadows were beginning to gather over the quiet little German town, and the diligence was expected every minute."
3,"Before the door of the principal inn, waiting the arrival of the first visitors of the year, were assembled the three notable personages of Wildbad, accompanied by their wives - the mayor, representing the inhabitants; the doctor, representing the waters; the landlord, representing his own establishment."
4,"Beyond this select circle, grouped snugly about the trim little square in front of the inn, appeared the towns - people in general, mixed here and there with the country people, in their quaint German costume, placidly expectant of the diligence - the men in short black jackets, tight black breeches, and three - cornered beaver hats; the women with their long light hair hanging in one thickly plaited tail behind them, and the waists of their short woolen gowns inserted modestly in the region of their shoulder - blades."
5,"Round the outer edge of the assemblage thus formed, flying detachments of plump white - headed children careered in perpetual motion; while, mysteriously apart from the rest of the inhabitants, the musicians of the Baths stood collected in one lost corner, waiting the appearance of the first visitors to play the first tune of the season in the form of a serenade."
6,The light of a May evening was still bright on the tops of the great wooded hills watching high over the town on the right hand and the left; and the cool breeze that comes before sunset came keenly fragrant here with the balsamic odor of the first of the Black Forest.
7,"""Mr<prd> Landlord,"" said the mayor’s wife (giving the landlord his title), ""have you any foreign guests coming on this first day of the season?"""
8,"""Madame Mayoress,"" replied the landlord (returning the compliment), ""I have two."
9,"They have written - the one by the hand of his servant, the other by his own hand apparently - to order their rooms; and they are from England, both, as I think by their names."


In [7]:
df['is_upper'] = 0
for index, row in df.iterrows():
    text = row['sentences']
    if text.isupper():
        df.at[index, 'is_upper'] = 1
df = df[df.is_upper != 1]
df = df.drop('is_upper', 1)
df['narration_is_0_dialogue_is_1'] = 0
df.head()


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only



Unnamed: 0,sentences,narration_is_0_dialogue_is_1
1,"It was the opening of the season of eighteen hundred and thirty - two, at the Baths of Wildbad.",0
2,"The evening shadows were beginning to gather over the quiet little German town, and the diligence was expected every minute.",0
3,"Before the door of the principal inn, waiting the arrival of the first visitors of the year, were assembled the three notable personages of Wildbad, accompanied by their wives - the mayor, representing the inhabitants; the doctor, representing the waters; the landlord, representing his own establishment.",0
4,"Beyond this select circle, grouped snugly about the trim little square in front of the inn, appeared the towns - people in general, mixed here and there with the country people, in their quaint German costume, placidly expectant of the diligence - the men in short black jackets, tight black breeches, and three - cornered beaver hats; the women with their long light hair hanging in one thickly plaited tail behind them, and the waists of their short woolen gowns inserted modestly in the region of their shoulder - blades.",0
5,"Round the outer edge of the assemblage thus formed, flying detachments of plump white - headed children careered in perpetual motion; while, mysteriously apart from the rest of the inhabitants, the musicians of the Baths stood collected in one lost corner, waiting the appearance of the first visitors to play the first tune of the season in the form of a serenade.",0


In [8]:
dialogue_started = False
for index, row in df.iterrows():
    sent = row['sentences']
    if sent.startswith('"'):
        dialogue_started = True
    if dialogue_started:
        df.at[index, 'narration_is_0_dialogue_is_1'] = 1

        if sent.endswith('"'):
            dialogue_started = False
            
        else:
            continue
    else:
        df.at[index, 'narration_is_0_dialogue_is_1'] = 0

    # if index == 20:
    #     break

In [9]:
df.head(50)

Unnamed: 0,sentences,narration_is_0_dialogue_is_1
1,"It was the opening of the season of eighteen hundred and thirty - two, at the Baths of Wildbad.",0
2,"The evening shadows were beginning to gather over the quiet little German town, and the diligence was expected every minute.",0
3,"Before the door of the principal inn, waiting the arrival of the first visitors of the year, were assembled the three notable personages of Wildbad, accompanied by their wives - the mayor, representing the inhabitants; the doctor, representing the waters; the landlord, representing his own establishment.",0
4,"Beyond this select circle, grouped snugly about the trim little square in front of the inn, appeared the towns - people in general, mixed here and there with the country people, in their quaint German costume, placidly expectant of the diligence - the men in short black jackets, tight black breeches, and three - cornered beaver hats; the women with their long light hair hanging in one thickly plaited tail behind them, and the waists of their short woolen gowns inserted modestly in the region of their shoulder - blades.",0
5,"Round the outer edge of the assemblage thus formed, flying detachments of plump white - headed children careered in perpetual motion; while, mysteriously apart from the rest of the inhabitants, the musicians of the Baths stood collected in one lost corner, waiting the appearance of the first visitors to play the first tune of the season in the form of a serenade.",0
6,The light of a May evening was still bright on the tops of the great wooded hills watching high over the town on the right hand and the left; and the cool breeze that comes before sunset came keenly fragrant here with the balsamic odor of the first of the Black Forest.,0
7,"""Mr<prd> Landlord,"" said the mayor’s wife (giving the landlord his title), ""have you any foreign guests coming on this first day of the season?""",1
8,"""Madame Mayoress,"" replied the landlord (returning the compliment), ""I have two.",1
9,"They have written - the one by the hand of his servant, the other by his own hand apparently - to order their rooms; and they are from England, both, as I think by their names.",1
10,"If you ask me to pronounce those names, my tongue hesitates; if you ask me to spell them, here they are, letter by letter, first and second in their order as they come.",1


In [10]:
df["sentence_length"] = 0
df["location"] = 0
location = 0
count = 0

for index, row in df.iterrows():
    if index % chunk_size == 0:
        location += 1
    df.at[index, "location"] = location
    df.at[index, "sentence_length"] = len(row["sentences"].split())
    

In [11]:
df.head(50)

Unnamed: 0,sentences,narration_is_0_dialogue_is_1,sentence_length,location
1,"It was the opening of the season of eighteen hundred and thirty - two, at the Baths of Wildbad.",0,19,0
2,"The evening shadows were beginning to gather over the quiet little German town, and the diligence was expected every minute.",0,20,0
3,"Before the door of the principal inn, waiting the arrival of the first visitors of the year, were assembled the three notable personages of Wildbad, accompanied by their wives - the mayor, representing the inhabitants; the doctor, representing the waters; the landlord, representing his own establishment.",0,46,0
4,"Beyond this select circle, grouped snugly about the trim little square in front of the inn, appeared the towns - people in general, mixed here and there with the country people, in their quaint German costume, placidly expectant of the diligence - the men in short black jackets, tight black breeches, and three - cornered beaver hats; the women with their long light hair hanging in one thickly plaited tail behind them, and the waists of their short woolen gowns inserted modestly in the region of their shoulder - blades.",0,90,0
5,"Round the outer edge of the assemblage thus formed, flying detachments of plump white - headed children careered in perpetual motion; while, mysteriously apart from the rest of the inhabitants, the musicians of the Baths stood collected in one lost corner, waiting the appearance of the first visitors to play the first tune of the season in the form of a serenade.",0,62,0
6,The light of a May evening was still bright on the tops of the great wooded hills watching high over the town on the right hand and the left; and the cool breeze that comes before sunset came keenly fragrant here with the balsamic odor of the first of the Black Forest.,0,52,0
7,"""Mr<prd> Landlord,"" said the mayor’s wife (giving the landlord his title), ""have you any foreign guests coming on this first day of the season?""",1,24,0
8,"""Madame Mayoress,"" replied the landlord (returning the compliment), ""I have two.",1,11,0
9,"They have written - the one by the hand of his servant, the other by his own hand apparently - to order their rooms; and they are from England, both, as I think by their names.",1,36,0
10,"If you ask me to pronounce those names, my tongue hesitates; if you ask me to spell them, here they are, letter by letter, first and second in their order as they come.",1,33,0


In [13]:
## POS tagging:

def progress(count, total, status=''):
    bar_len = 60
    filled_len = int(round(bar_len * count / float(total)))
    percents = round(100.1 * count / float(total), 1)
    bar = '#' * filled_len + '-' * (bar_len - filled_len)
    sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status))
    sys.stdout.flush() 
    
# first loading english language support
nlp = spacy.load("en_core_web_sm")

## Takes in a dataframe and clean text column (as string), and returns the df with POS tags for all the texts
## Multiple columns are created, one for each POS tag, and one that contains all POS tags (I did this to more easily
## be able to grab POS percentages afterward)
def get_POS_tags_for_text_in_df(df, text_row_to_analyze='sentences'):
    # setting up column for pos counts
    df['all_pos_counts'] = ''
    df["parts_of_speech_total_count"] = ''
    # loop through df and get all POS tags:
    for index, row in df.iterrows():
        # show progress
        progress(index, len(df.index))
        
        # grab text
        text = row[text_row_to_analyze]
        
        # this is a memory buffer, to extend max length of available ram according to the text being analyzed
        # https://datascience.stackexchange.com/questions/38745/increasing-spacy-max-nlp-limit
        nlp.max_length = len(text) + 100
        
        # disable modules not in use to save memory
        analyzed_doc = nlp(text, disable = ['ner'])
        
        # grabbing all pos counts in the text in non-human readable format
        pos_counts_in_text = analyzed_doc.count_by(spacy.attrs.IDS['POS'])
        
        # setting up list to render pos hashes in human readable format:
        human_readable_pos_count_list = []
        
        # iterating through counts to make hashes human readable:
        for pos, count in pos_counts_in_text.items():
            human_readable_tag = analyzed_doc.vocab[pos].text
            # rendering as list to input back into df
            human_readable_tag_and_count = list((human_readable_tag, count))
            human_readable_pos_count_list.append(human_readable_tag_and_count)
        # looping through the human readable counts, assigning their label to the column
        # and the count to the row for each pos tag
        for element in human_readable_pos_count_list:
            df.at[index, 'POS_' + str(element[0])+'_count'] = element[1]
        
        # placing all the pos counts for each text in the all_pos_counts column
        df.at[index, 'all_pos_counts'] = human_readable_pos_count_list
        
        
    df = df.fillna(0)
    # getting POS percentages for each POS tag in texts
    # There are much easier and more efficient ways to do this rather than looping over the entire df again but we were pressed for time...
    # TODO: integrate this loop into previous loop
    for index, row in df.iterrows():
        total = 0.0
        for name in df.columns.values.tolist():
            if name.startswith("POS_"):
                # get total POS elements count for sanity
                total += row[name]
        try:
            df.at[index, "parts_of_speech_total_count"] = int(total)
        except Exception as e:
            print(e)
        for name in df.columns.values.tolist():
            if name.startswith("POS_"):
                # assign new name for column
                new_name = "%" + name
                # get % of total POS in text
                if total != 0:
                    percentage = round((row[name] / total) * float(100), 3)
                else:
                    print(row[text_row_to_analyze])
                # if this is the first index, create the column name to avoid errors
                if index == 0:
                    df[new_name] = 0.0
                df.at[index, new_name] = percentage
    return df

OSError: [E049] Can't find spaCy data directory: 'None'. Check your installation and permissions, or use spacy.util.set_data_path to customise the location if necessary.

In [None]:

df = get_POS_tags_for_text_in_df(df, text_row_to_analyze='sentences')
df.head(10)

In [None]:
### FUNCTIONS 2/3

## This function takes in a list of dataframes, a list of columns to visualize, and a date column (with the same name in both dataframes)
## then plots those columns over time.

def visualize_numerical_columns__over_time(dfs, list_of_columns_to_visualize, date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel'):
    # Setting lists to capture the elements for visualization
    years_list = []
    counts_list = []
    labels_list = []
    colors_list = []
    
    # lazy way to grab some relevant info to have comparison points when looking at visuals
    print_info = []
    
    # the first dataframe in the list will be "DF_0", the second "DF_1", and so on.
    # in our case, DF_0 == df_nnovels_meta and DF_1 == df_txtlab_meta
    which_df = 0
    # loops for each df in list
    for df in dfs:
        # loops for each column in list
        for column_name in list_of_columns_to_visualize:
            
            # first grouping the specific column by year
            grouped_by_year = pd.to_numeric(df[column_name]).groupby(df[date_column])
            grouped_by_year = grouped_by_year.mean().reset_index()
            
            # grabbing the array of the years 
            years = np.array(grouped_by_year[date_column].tolist())
            # grabbing the array of the column we're visualizing
            count_to_visualize = np.array(grouped_by_year[column_name].tolist())
            
            # placing these elements in a list, and grabbing a label to know which DF is which
            years_list.append(years)
            counts_list.append(count_to_visualize)
            labels_list.append("DF_"+ str(which_df) +"_"+ column_name)
            
            # getting random colors to visualize
            r = random.random()
            b = random.random()
            g = random.random()
            color = (r, g, b)
            colors_list.append(color)
            
            # lazy way to add additional info 
            print_info.append(["The AVG of " + column_name + " of DF_" + str(which_df) +" is "+ str(df[column_name].mean())])
            print_info.append(["The MIN of " + column_name + " of DF_" + str(which_df) +" is "+ str(df[column_name].min())])
            print_info.append(["The MAX of " + column_name + " of DF_" + str(which_df) +" is "+ str(df[column_name].max())])
            
        # once loop finishes, increment DF number    
        which_df += 1
        
    # setting the size of the plot
    plt.figure(figsize=(20,10))
    # looping through the labels we added (essentially each DF), and plotting the elements sequentially using the index for each of the lists created above
    for index in range(len(labels_list)):
        plt.plot(years_list[index], counts_list[index], label=labels_list[index], c=colors_list[index])
    # showing the plot    
    plt.legend(loc='upper left')
    plt.title(title)
    plt.show()
    
    # printing out supplementary info
    for info in print_info:
        print(info)
        
    return None

In [None]:
dfs = [df]


In [None]:
visualize_numerical_columns__over_time(dfs, ['POS_NOUN_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['POS_VERB_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['POS_PROPN_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['POS_PRON_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['%POS_NOUN_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['%POS_VERB_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['%POS_PROPN_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['%POS_PRON_count'], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
grouped_by_year = pd.to_numeric(df['%POS_NOUN_count']).groupby(df['location'])
grouped_by_year.head(10)

In [None]:
grouped_by_year = grouped_by_year.mean().reset_index()
grouped_by_year.head()

In [None]:
visualize_numerical_columns__over_time(dfs, ['POS_NOUN_count','POS_VERB_count',], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['POS_PROPN_count','POS_PRON_count',], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['%POS_NOUN_count','%POS_VERB_count',], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['sentence_length',], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
visualize_numerical_columns__over_time(dfs, ['%POS_PROPN_count','%POS_PRON_count',], date_column='location', graph_y_label='What are we counting?', title='SOMETHING over the novel')

In [None]:
### Divide the DF into 5 sections and understand changes that happen in those sections:

list_of_elements_to_analyze = ['%POS_NOUN_count', '%POS_DET_count', '%POS_ADP_count', '%POS_PROPN_count', '%POS_ADJ_count', '%POS_ADV_count', '%POS_PRON_count', '%POS_VERB_count', '%POS_AUX_count']
N = 3
# columns_in_df = df2.columns.values.tolist()
print(f"Printing out information for {novel_to_analyze}. The DF is split into {N} sections.")
for name in list_of_elements_to_analyze:
    if name.startswith("%"):
        split_arrays = np.array_split(df[name], N)
        print(f"The POS tag being shown is {name}.")
        for i in range(N):
            mean_of_section = split_arrays[i].mean()
            max_of_section = split_arrays[i].max()
            min_of_section = split_arrays[i].min()
            median_of_section = split_arrays[i].median()
            print(f"SECTION {i}: The MEAN value is: {mean_of_section}. The MAX value is: {max_of_section}. The MIN value is: {min_of_section}. The MEDIAN value is: {median_of_section}.")


In [None]:
print(df.columns.tolist())

In [None]:
fig = px.histogram(df, x="location", nbins=10, color="narration_is_0_dialogue_is_1", title="Visitors per region")
fig.show()