In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/Users/justinchen/Downloads/BookDataset2.csv')
df = df.drop(['Image'], axis = 1)
df.replace(np.nan, '', regex=True, inplace = True)
df

Unnamed: 0,Date,Title,Subtitle,Author,Transcript,URL
0,1850.0,,,,"The Scarlet Letter, a Romance",https://en.wikipedia.org/wiki/The_Scarlet_Letter
1,1851.0,,,,"The House of the Seven Gables, A Romance",https://en.wikipedia.org/wiki/The_House_of_the...
2,1851.0,,,,"Moby-Dick; or, The Whale. By Herman Melville, ...",https://en.wikipedia.org/wiki/Moby-Dick
3,1853.0,,,,"Bartleby, the Scrivener: A Story of Wall Street","https://en.wikipedia.org/wiki/Bartleby,_the_Sc..."
4,1853.0,,,,Bleak House,https://en.wikipedia.org/wiki/Bleak_House
...,...,...,...,...,...,...
184,2014.0,,,,The Sixth Extinction: An Unnatural History,https://en.wikipedia.org/wiki/The_Sixth_Extinc...
185,2015.0,,,,The Argonauts,https://en.wikipedia.org/wiki/The_Argonauts
186,2016.0,,,,The Underground Railroad,https://en.wikipedia.org/wiki/The_Underground_...
187,2016.0,,,,Autumn,https://en.wikipedia.org/wiki/Autumn_(Smith_no...


In [3]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [4]:
#functions to find most often appearing parts of speech etc.
from operator import itemgetter
def most_pos(df, N):
    POS_dict = {}
    for text in df['Transcript']:
        doc = nlp(text)
        for token in doc:
            pos = token.pos_
            if pos in POS_dict.keys():
                POS_dict[pos] += 1
            else:
                POS_dict[pos] = 1
    final_dict = dict(sorted(POS_dict.items(), key = itemgetter(1), reverse=True)[:N])
    return final_dict

def most_tag(df, N):
    word_dict = {}
    for text in df['Transcript']:
        doc = nlp(text)
        for token in doc:
            pos = token.tag_
            if pos in word_dict.keys():
                word_dict[pos] += 1
            else:
                word_dict[pos] = 1
    final_dict = dict(sorted(word_dict.items(), key = itemgetter(1), reverse=True)[:N])
    return final_dict

def most_dep(df, N):
    word_dict = {}
    for text in df['Transcript']:
        doc = nlp(text)
        for token in doc:
            pos = token.dep_
            if pos in word_dict.keys():
                word_dict[pos] += 1
            else:
                word_dict[pos] = 1
    final_dict = dict(sorted(word_dict.items(), key = itemgetter(1), reverse=True)[:N])
    return final_dict

def most_stop(df, N):
    word_dict = {}
    for text in df['Transcript']:
        doc = nlp(text)
        for token in doc:
            pos = token.is_stop
            if pos in word_dict.keys():
                word_dict[pos] += 1
            else:
                word_dict[pos] = 1
    final_dict = dict(sorted(word_dict.items(), key = itemgetter(1), reverse=True)[:N])
    return final_dict

def most_lemma(df, N):
    word_dict = {}
    for text in df['Transcript']:
        doc = nlp(text)
        for token in doc:
            pos = token.lemma_
            if pos in word_dict.keys():
                word_dict[pos] += 1
            else:
                word_dict[pos] = 1
    final_dict = dict(sorted(word_dict.items(), key = itemgetter(1), reverse=True)[:N])
    return final_dict

In [5]:
print(most_pos(df, 5))
print(most_tag(df, 5))
print(most_dep(df, 5))
print(most_stop(df, 2))
print(most_lemma(df, 5))

{'PROPN': 375, 'DET': 163, 'ADP': 129, 'NOUN': 101, 'PUNCT': 87}
{'NNP': 359, 'DT': 166, 'IN': 130, 'NN': 74, ',': 28}
{'ROOT': 189, 'det': 161, 'compound': 129, 'pobj': 128, 'prep': 114}
{False: 599, True: 386}
{'the': 125, 'of': 62, 'a': 35, ',': 28, 'by': 23}


In [6]:
import spacy

parsed_df = pd.DataFrame(columns = ['Date', 'Most POS', 'Most TAG', 'Most DEP', 'Num of Stopwords', 'Most Lemma'])
nlp = spacy.load("en_core_web_sm")
for a in range(9, 150, 10):
    df1 = df.iloc[a-9:a]
    pos = most_pos(df1, 5)
    tag = most_tag(df1, 5)
    dep = most_dep(df1, 5)
    stop = most_stop(df1, 2)
    lemma = most_lemma(df1, 5)
    period = 1700 + (a-9)
    df2 = pd.DataFrame({'Date': period, 'Most POS': [pos], 'Most TAG': [tag], 'Most DEP':[ dep], 'Num of Stopwords': [stop], 'Most Lemma':[lemma]})
    parsed_df = pd.concat([parsed_df, df2], ignore_index = True)      

In [7]:
parsed_df

Unnamed: 0,Date,Most POS,Most TAG,Most DEP,Num of Stopwords,Most Lemma
0,1700,"{'PROPN': 34, 'PUNCT': 27, 'ADP': 13, 'DET': 1...","{'NNP': 31, 'IN': 13, 'DT': 12, ',': 11, 'NN': 5}","{'punct': 27, 'pobj': 13, 'det': 11, 'prep': 1...","{False: 69, True: 28}","{',': 11, 'the': 9, '""': 9, 'of': 6, 'a': 3}"
1,1710,"{'PROPN': 29, 'ADP': 12, 'PUNCT': 10, 'DET': 7...","{'NNP': 27, 'IN': 12, 'DT': 7, '.': 5, 'NNS': 4}","{'ROOT': 12, 'pobj': 12, 'compound': 10, 'punc...","{False: 45, True: 25}","{'.': 5, 'the': 4, 'by': 4, 'in': 3, 'of': 3}"
2,1720,"{'PROPN': 36, 'PUNCT': 17, 'ADP': 17, 'DET': 8...","{'NNP': 35, 'IN': 17, 'DT': 9, ',': 8, 'CC': 5}","{'punct': 17, 'pobj': 17, 'prep': 13, 'compoun...","{False: 63, True: 38}","{',': 8, 'the': 8, 'of': 5, '.': 5, 'and': 4}"
3,1730,"{'PROPN': 31, 'DET': 8, 'ADP': 8, 'PUNCT': 7, ...","{'NNP': 31, 'DT': 8, 'IN': 8, '.': 3, ':': 1}","{'compound': 12, 'ROOT': 10, 'det': 8, 'pobj':...","{False: 40, True: 18}","{'the': 6, 'of': 5, '.': 3, 'a': 2, 'by': 2}"
4,1740,"{'PROPN': 20, 'DET': 15, 'ADP': 11, 'NOUN': 9,...","{'NNP': 19, 'DT': 16, 'IN': 11, 'NN': 8, 'VBZ'...","{'det': 15, 'pobj': 12, 'prep': 11, 'ROOT': 9,...","{True: 38, False: 34}","{'the': 12, 'of': 6, 'be': 3, 'by': 3, 'this': 2}"
5,1750,"{'PROPN': 15, 'DET': 7, 'ADP': 6, 'NOUN': 4, '...","{'NNP': 15, 'DT': 8, 'IN': 6, 'NNS': 2, 'NN': 2}","{'ROOT': 8, 'det': 7, 'prep': 6, 'pobj': 6, 'c...","{False: 21, True: 18}","{'the': 6, 'of': 5, ':': 2, 'a': 2, 'Dracula': 1}"
6,1760,"{'PROPN': 18, 'DET': 6, 'ADP': 3, 'CCONJ': 1}","{'NNP': 17, 'DT': 6, 'IN': 3, 'NNPS': 1, 'CC': 1}","{'ROOT': 8, 'det': 6, 'compound': 5, 'prep': 3...","{False: 18, True: 10}","{'the': 6, 'of': 3, 'House': 1, 'Mirth': 1, 'R..."
7,1770,"{'PROPN': 19, 'DET': 9, 'ADP': 8, 'NOUN': 6, '...","{'NNP': 18, 'DT': 10, 'IN': 8, 'NNS': 3, 'NN': 3}","{'det': 9, 'ROOT': 8, 'prep': 8, 'compound': 8...","{False: 28, True: 18}","{'the': 7, 'of': 6, 'a': 3, 'Ohio': 2, 'rider'..."
8,1780,"{'PROPN': 10, 'DET': 6, 'VERB': 3, 'NOUN': 2, ...","{'NNP': 10, 'DT': 6, 'NN': 2, 'IN': 2, 'VBG': 2}","{'ROOT': 8, 'det': 6, 'compound': 4, 'nsubj': ...","{False: 15, True: 11}","{'the': 5, 'a': 1, 'passage': 1, 'to': 1, 'Ind..."
9,1790,"{'PROPN': 10, 'NOUN': 5, 'DET': 4, 'ADP': 2, '...","{'NNP': 9, 'NN': 5, 'DT': 4, 'IN': 2, 'VBZ': 1}","{'ROOT': 8, 'compound': 5, 'det': 4, 'prep': 2...","{False: 16, True: 7}","{'the': 4, 'Brave': 1, 'New': 1, 'World': 1, '..."
