In [1]:
import pandas as pd 
# Load the pre-processed corpus from a CSV in the working directory.
df = pd.read_csv('phil_nlp.csv')

# Sanity check: show five random rows (no seed is set, so the rows differ between runs).
df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str
116144,Discourse On Method,Descartes,rationalism,Of this we have abundant proof in the ordinary...,Of this we have abundant proof in the ordinary...,424,of this we have abundant proof in the ordinary...,"['of', 'this', 'we', 'have', 'abundant', 'proo...",of this -PRON- have abundant proof in the ord...
248446,Critique Of Practical Reason,Kant,german_idealism,So distinctly and sharply cut are the boundari...,So distinctly and sharply cut are the boundari...,188,so distinctly and sharply cut are the boundari...,"['so', 'distinctly', 'and', 'sharply', 'cut', ...",so distinctly and sharply cut be the boundary...
46974,Aristotle - Complete Works,Aristotle,aristotle,"It is impossible, therefore, for anything to b...","It is impossible, therefore, for anything to b...",67,"it is impossible, therefore, for anything to b...","['it', 'is', 'impossible', 'therefore', 'for',...","-PRON- be impossible , therefore , for anythi..."
285275,Elements Of The Philosophy Of Right,Hegel,german_idealism,This method leaves out of account what is alon...,This method leaves out of account what is alon...,118,this method leaves out of account what is alon...,"['this', 'method', 'leaves', 'out', 'of', 'acc...",this method leave out of account what be alon...
137709,Philosophical Studies,Moore,analytic,"Perhaps, even, there may be some justification...","Perhaps, even, there may be some justification...",339,"perhaps, even, there may be some justification...","['perhaps', 'even', 'there', 'may', 'be', 'som...","perhaps , even , there may be some justificat..."


In [2]:
# Turn the raw snake_case school labels into display names
# (underscores become spaces, then every word is capitalised).
df['school'] = df['school'].map(lambda label: label.replace('_', ' ').title())
# Share of rows that belongs to each school.
df['school'].value_counts(normalize=True)

Analytic           0.168647
Aristotle          0.148419
German Idealism    0.128210
Plato              0.116812
Continental        0.102785
Phenomenology      0.086939
Rationalism        0.069830
Empiricism         0.060644
Capitalism         0.055359
Communism          0.054644
Stoicism           0.007713
Name: school, dtype: float64

In [3]:
import wordcloud
import nltk 
nltk.download('stopwords')
from nltk import FreqDist
from nltk.corpus import stopwords
import string
import re
import plotly.express as px 
import pandas as pd
from nltk.collocations import BigramCollocationFinder
from gensim.utils import simple_preprocess

# Lookup table: every selectable value -> name of the df column it comes from.
# Columns are processed in the order author, title, school, so a value that
# occurs in more than one column ends up tagged with the last of them.
classifier_dict = {}
for column in ('author', 'title', 'school'):
  classifier_dict.update(dict.fromkeys(df[column].unique(), column))

# Base stopword list: NLTK's English stopwords plus every ASCII punctuation character...
stopwords_list = stopwords.words('english') + list(string.punctuation) 
# ...plus a handful of typographic quote / ellipsis tokens.
stopwords_list += ['“','”','...',"''",'’','``', "'", "‘"]
# Hand-picked additions (filler words, abbreviations, stray tokens) placed in
# front of the base list. A few entries are repeated ('would', 'eg', 'thing');
# that is harmless for the `in` / `not in` membership checks the plotting
# functions below perform on this list.
custom_stopwords = ['–', 'also', 'something', 'cf', 'thus', 'two', 'now', 'would', 
                    'make', 'eb', 'u', 'well', 'even', 'said', 'eg', 'us',
                    'n', 'sein', 'e', 'da', 'therefore', 'however', 'would', 
                    'thing', 'must', 'merely', 'way', 'since', 'latter', 'first',
                    'B', 'mean', 'upon', 'yet', 'cannot', 'c', 'C', 'let', 'may', 
                    'might', "'s", 'b', 'ofthe', 'p.', '_', '-', 'eg', 'e.g.',
                    'ie', 'i.e.', 'f', 'l', "n't", 'e.g', 'i.e', '—', '--', 
                    'hyl', 'phil', 'one', 'another', 'could', 'come', 'things', 'thing',
                    'else', 'every', 'shall'] + stopwords_list

# Tokenise every raw sentence with gensim's simple_preprocess and keep the
# resulting token list in a new column. deacc=True and max_len=500 are passed
# through unchanged (see the gensim docs for their exact meaning).
df['gensim_tokenized'] = df['sentence_str'].map(
    lambda sentence: simple_preprocess(sentence.lower(), deacc=True, max_len=500)
)

def get_average_word_length(input, df, classifier_dict):
  """Return the mean number of characters per word for one author, title, or school.

  Args:
    input: the value to select rows by; must be a key of classifier_dict.
    df: DataFrame holding the column named by classifier_dict[input] and a
      'tokenized_txt' column of strings. Each string is split on whitespace;
      in this notebook it is the text form of a token list, e.g. "['of', 'this']".
    classifier_dict: maps each selectable value to the df column it lives in.

  Returns:
    The average word length rounded to 2 decimals, or 0.0 when the selection
    contains no words at all.
  """
  punctuations = set(string.punctuation) | {'“','”','...',"''",'’','``', "'", "‘"}
  num_words = 0
  sum_word_lengths = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    for raw_token in sentence.split():
      # Remove the list-literal residue around each token. Brackets must be
      # stripped too, otherwise the first and last word of every sentence
      # ("['of'," / "'ordinary']") are counted one character too long.
      word = re.sub(r"[\[\]',]", '', raw_token)
      if not word or word in punctuations:
        continue
      num_words += 1
      sum_word_lengths += len(word)
  if num_words == 0:
    # Nothing to average; avoid a ZeroDivisionError.
    return 0.0
  return round(sum_word_lengths / num_words, 2)

def get_average_sentence_length(input, df, classifier_dict):
  """Return the mean number of tokens per sentence for one author, title, or school.

  Rows are selected where df[classifier_dict[input]] == input. Each
  'tokenized_txt' string is split on whitespace, and tokens that are exactly
  one of the listed punctuation marks are not counted. The result is rounded
  to 2 decimals.
  """
  ignored = set(string.punctuation) | {'“','”','...',"''",'’','``', "'", "‘", '[', ']'}
  selected = df.loc[df[classifier_dict[input]] == input, 'tokenized_txt']
  tokens_per_sentence = [
      sum(1 for token in row.split() if token and token not in ignored)
      for row in selected
  ]
  return round(sum(tokens_per_sentence) / len(tokens_per_sentence), 2)

def make_word_cloud(input, df, classifier, stopwords=stopwords.words('english')):
    """Build a 30-word grey-scale word cloud for one author, title, or school.

    Args:
        input: the value to select rows by; must be a key of classifier.
        df: DataFrame holding the column named by classifier[input] and a
            'sentence_str' column with the raw sentence text.
        classifier: maps each selectable value to the df column it lives in.
        stopwords: words to leave out of the cloud. The default list is
            evaluated once, when this function is defined.

    Returns:
        The generated wordcloud.WordCloud object.
    """
    selected = df[df[classifier[input]]==input]['sentence_str']
    # Join with a space: concatenating sentences back to back lets the last
    # word of one sentence fuse with the first word of the next.
    text = ' '.join(selected)
    cloud = wordcloud.WordCloud(width=500,
                                height=400,
                                background_color='#D1D1D1',
                                max_words=30,
                                stopwords=stopwords,
                                # every word is drawn in the same dark grey
                                color_func=lambda *args, **kwargs: (95,95,95)).generate(text)
    return cloud

def get_num_unique_words(input, df, classifier_dict):
  """Count distinct and total words for one author, title, or school.

  Args:
    input: the value to select rows by; must be a key of classifier_dict.
    df: DataFrame holding the column named by classifier_dict[input] and a
      'tokenized_txt' column of strings. Each string is split on whitespace;
      in this notebook it is the text form of a token list, e.g. "['of', 'this']".
    classifier_dict: maps each selectable value to the df column it lives in.

  Returns:
    A (num_unique_words, num_words) tuple.
  """
  punctuations = set(string.punctuation) | {'“','”','...',"''",'’','``', "'", "‘"}
  unique_words = set()
  num_words = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    for raw_token in sentence.split():
      # Strip the list-literal residue (brackets, quotes, commas). Without
      # this, "['the'," / "'the'," / "'the']" are three different "words"
      # and the unique count depends on where a word sits in its sentence.
      word = re.sub(r"[\[\]',]", '', raw_token)
      if not word or word in punctuations:
        continue
      num_words += 1
      unique_words.add(word)
  return len(unique_words), num_words

def plot_word_frequency(input, df, classifier_dict, stopwords, max_sentences=50, top_n=7):
  """Bar chart of the most frequent non-stopword tokens for one author, title, or school.

  Args:
    input: the value to select rows by; must be a key of classifier_dict.
    df: DataFrame holding the column named by classifier_dict[input] and a
      'gensim_tokenized' column whose cells are lists of tokens.
    classifier_dict: maps each selectable value to the df column it lives in.
    stopwords: iterable of words to exclude (compared against lower-cased tokens).
    max_sentences: how many of the selected sentences to count, taken from the
      top of the selection. Defaults to 50; pass None to use every sentence.
    top_n: number of words shown in the chart (default 7).

  Returns:
    A plotly Figure with one bar per word.
  """
  selected = df[df[classifier_dict[input]]==input]['gensim_tokenized']
  if max_sentences is not None:
    # .iloc makes the positional meaning of the cap explicit
    selected = selected.iloc[:max_sentences]
  # a set gives constant-time lookups; the stopword list has several hundred entries
  stopword_set = set(stopwords)
  cleaned_words = [word.lower()
                   for sentence in selected
                   for word in sentence
                   if word.lower() not in stopword_set]
  top_words = FreqDist(cleaned_words).most_common(top_n)
  freq_df = pd.DataFrame({'words': [word for word, _ in top_words],
                          'frequency': [count for _, count in top_words]})
  fig = px.bar(freq_df,
              x='words',
              y='frequency')
  fig.update_xaxes(title_text='Words')
  fig.update_yaxes(title_text='Count')
  fig.update_layout(title_text=f'{input.title()} Word Frequency Chart', title_x=0.5)
  return fig

def plot_ngram_frequency(input, df, classifier_dict, stopwords):
  """Bar chart of the 7 most frequent word pairs for one author, title, or school.

  Args:
    input: the value to select rows by; must be a key of classifier_dict.
    df: DataFrame holding the column named by classifier_dict[input] and a
      'gensim_tokenized' column whose cells are lists of tokens.
    classifier_dict: maps each selectable value to the df column it lives in.
    stopwords: iterable of words to drop before pairs are formed.

  Returns:
    A plotly Figure with one bar per word pair.
  """
  # Honour the caller's stopwords rather than the module-level custom_stopwords.
  stopword_set = set(stopwords)
  cleaned = [word.lower()
             for sent in df[df[classifier_dict[input]]==input]['gensim_tokenized']
             for word in sent
             if word.lower() not in stopword_set]
  # window_size=3 also pairs words that have one other word between them
  bigram_finder = BigramCollocationFinder.from_words(cleaned, window_size=3)
  # most frequent first; ties are broken alphabetically so the chart is deterministic
  top_pairs = sorted(bigram_finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:7]
  bigram_df = pd.DataFrame(top_pairs, columns=['bigram', 'frequency'])
  bigram_df['bigram'] = bigram_df['bigram'].apply(lambda x: ', '.join(x))
  fig = px.bar(bigram_df,
              x='bigram',
              y='frequency')
  fig.update_xaxes(title_text='Phrases')
  fig.update_yaxes(title_text='Count')
  fig.update_layout(title_text=f'{input.title()} N-gram Frequency Chart', title_x=0.5)
  return fig

def get_title_list(input, df, classifier_dict):
  """Return the distinct titles for one author, title, or school as a single string.

  Titles are taken in order of first appearance among the selected rows,
  title-cased, and joined with ', '.
  """
  matches = df[classifier_dict[input]] == input
  distinct_titles = df.loc[matches, 'title'].unique()
  return ', '.join(name.title() for name in distinct_titles)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kcali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Pre-compute every statistic and chart for each selectable author, title, and
# school, keyed by the option's name.
stats_dict_master = {}
for option in classifier_dict:
    stats_dict = {
        'title_list': get_title_list(option, df, classifier_dict),
        'ngram_chart': plot_ngram_frequency(option, df, classifier_dict, custom_stopwords),
        'word_freq_chart': plot_word_frequency(option, df, classifier_dict, custom_stopwords),
        # (number of distinct words, total number of words)
        'num_unique': get_num_unique_words(option, df, classifier_dict),
        'mean_sent_length': get_average_sentence_length(option, df, classifier_dict),
        'mean_word_length': get_average_word_length(option, df, classifier_dict),
    }
    stats_dict_master[option] = stats_dict

In [5]:
# Spot-check one entry: display the full stats record stored under 'Stoicism'.
stats_dict_master['Stoicism']

{'title_list': 'Enchiridion, Meditations',
 'ngram_chart': Figure({
     'data': [{'alignmentgroup': 'True',
               'hoverlabel': {'namelength': 0},
               'hovertemplate': 'bigram=%{x}<br>frequency=%{y}',
               'legendgroup': '',
               'marker': {'color': '#636efa'},
               'name': '',
               'offsetgroup': '',
               'orientation': 'v',
               'showlegend': False,
               'textposition': 'auto',
               'type': 'bar',
               'x': array(['unto, thee', 'thou, art', 'thou, shalt', 'thou, hast', 'thee, thou',
                           'thou, dost', 'thy, mind'], dtype=object),
               'xaxis': 'x',
               'y': array([93, 92, 87, 65, 64, 54, 51], dtype=int64),
               'yaxis': 'y'}],
     'layout': {'barmode': 'relative',
                'height': 600,
                'legend': {'tracegroupgap': 0},
                'margin': {'t': 60},
                'template': '...',
         

In [6]:
import pickle

# Write each option's stats dict to its own pickle file, named after the
# title-cased option.
for option, option_stats in stats_dict_master.items():
    # the context manager closes the file even if pickle.dump raises
    with open(f'../stats_app/stats_pickles/{option.title()}_stats.pkl', 'wb') as dict_pkl:
        pickle.dump(option_stats, dict_pkl)


Next, we build the updated list of options for the dropdown menu.

In [7]:
# Title-cased names of every selectable author, title, and school, in sorted order.
all_options = sorted(key.title() for key in classifier_dict)
all_options

['A General Theory Of Employment, Interest, And Money',
 'A Treatise Concerning The Principles Of Human Knowledge',
 'A Treatise Of Human Nature',
 'Analytic',
 'Anti-Oedipus',
 'Aristotle',
 'Aristotle - Complete Works',
 'Being And Time',
 'Berkeley',
 'Capital',
 'Capitalism',
 'Communism',
 'Continental',
 'Critique Of Judgement',
 'Critique Of Practical Reason',
 'Critique Of Pure Reason',
 'Deleuze',
 'Derrida',
 'Descartes',
 'Dialogues Concerning Natural Religion',
 'Difference And Repetition',
 'Discourse On Method',
 'Elements Of The Philosophy Of Right',
 'Empiricism',
 'Enchiridion',
 'Epictetus',
 'Essay Concerning Human Understanding',
 'Essential Works Of Lenin',
 'Ethics',
 'Fichte',
 'Foucault',
 'German Idealism',
 'Hegel',
 'Heidegger',
 'Hume',
 'Husserl',
 'Kant',
 'Keynes',
 'Kripke',
 'Leibniz',
 'Lenin',
 'Lewis',
 'Lewis - Papers',
 'Locke',
 'Madness And Civilization',
 'Malebranche',
 'Marcus Aurelius',
 'Marx',
 'Meditations',
 'Meditations On First Philosop

In [8]:
# One {'label', 'value'} record per option; both fields carry the same name.
dropdown_menu = [{'label': source, 'value': source} for source in all_options]

dropdown_menu

[{'label': 'A General Theory Of Employment, Interest, And Money',
  'value': 'A General Theory Of Employment, Interest, And Money'},
 {'label': 'A Treatise Concerning The Principles Of Human Knowledge',
  'value': 'A Treatise Concerning The Principles Of Human Knowledge'},
 {'label': 'A Treatise Of Human Nature',
  'value': 'A Treatise Of Human Nature'},
 {'label': 'Analytic', 'value': 'Analytic'},
 {'label': 'Anti-Oedipus', 'value': 'Anti-Oedipus'},
 {'label': 'Aristotle', 'value': 'Aristotle'},
 {'label': 'Aristotle - Complete Works',
  'value': 'Aristotle - Complete Works'},
 {'label': 'Being And Time', 'value': 'Being And Time'},
 {'label': 'Berkeley', 'value': 'Berkeley'},
 {'label': 'Capital', 'value': 'Capital'},
 {'label': 'Capitalism', 'value': 'Capitalism'},
 {'label': 'Communism', 'value': 'Communism'},
 {'label': 'Continental', 'value': 'Continental'},
 {'label': 'Critique Of Judgement', 'value': 'Critique Of Judgement'},
 {'label': 'Critique Of Practical Reason',
  'value'