In [1]:
import pandas as pd 
df = pd.read_csv('phil_nlp.csv')

df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length,sentence_lowered,tokenized_txt,lemmatized_str,sentence
8296,Plato - Complete Works,Plato,Plato,"And, in partaking of time, is it and does it c...","And, in partaking of time, is it and does it c...",146,"and, in partaking of time, is it and does it c...","['and', 'in', 'partaking', 'of', 'time', 'is',...","and , in partake of time , be -PRON- and do -...","And, in partaking of time, is it and does it c..."
20198,Plato - Complete Works,Plato,Plato,I think there are both kinds.,I think there are both kinds.,29,i think there are both kinds.,"['think', 'there', 'are', 'both', 'kinds']",-PRON- think there be both kind .,I think there are both kinds.
257242,Critique Of Pure Reason,Kant,German Idealism,I an experience in general and the synthetic u...,I an experience in general and the synthetic u...,102,i an experience in general and the synthetic u...,"['an', 'experience', 'in', 'general', 'and', '...",-PRON- an experience in general and the synth...,I an experience in general and the synthetic u...
115609,Theodicy,Leibniz,Rationalism,I find in the arguments that have just been qu...,I find in the arguments that have just been qu...,98,i find in the arguments that have just been qu...,"['find', 'in', 'the', 'arguments', 'that', 'ha...",-PRON- find in the argument that have just be...,I find in the arguments that have just been qu...
9200,Plato - Complete Works,Plato,Plato,And so we know by now what we mean by perception?,And so we know by now what we mean by perception?,49,and so we know by now what we mean by perception?,"['and', 'so', 'we', 'know', 'by', 'now', 'what...",and so -PRON- know by now what -PRON- mean by...,And so we know by now what we mean by perception?


In [2]:
df['school'] = df['school'].apply(lambda x: x.replace('_', ' ').title())
df['school'].value_counts(normalize=True)

Analytic           0.160771
Aristotle          0.141493
German Idealism    0.122223
Plato              0.111288
Continental        0.097982
Phenomenology      0.082881
Rationalism        0.066568
Empiricism         0.057814
Capitalism         0.052775
Communism          0.052091
Nietzsche          0.039298
Feminism           0.007463
Stoicism           0.007353
Name: school, dtype: float64

In [3]:
import wordcloud
import nltk 
nltk.download('stopwords')
from nltk import FreqDist
from nltk.corpus import stopwords
import string
import re
import plotly.express as px 
import pandas as pd
from nltk.collocations import BigramCollocationFinder
from gensim.utils import simple_preprocess

classifier_dict = {}
for author in df['author'].unique():
  classifier_dict[author] = 'author'
for title in df['title'].unique():
  classifier_dict[title] = 'title'
for school in df['school'].unique():
  classifier_dict[school] = 'school'

stopwords_list = stopwords.words('english') + list(string.punctuation) 
stopwords_list += ['“','”','...',"''",'’','``', "'", "‘"]
custom_stopwords = ['–', 'also', 'something', 'cf', 'thus', 'two', 'now', 'would', 
                    'make', 'eb', 'u', 'well', 'even', 'said', 'eg', 'us',
                    'n', 'sein', 'e', 'da', 'therefore', 'however', 'would', 
                    'thing', 'must', 'merely', 'way', 'since', 'latter', 'first',
                    'B', 'mean', 'upon', 'yet', 'cannot', 'c', 'C', 'let', 'may', 
                    'might', "'s", 'b', 'ofthe', 'p.', '_', '-', 'eg', 'e.g.',
                    'ie', 'i.e.', 'f', 'l', "n't", 'e.g', 'i.e', '—', '--', 
                    'hyl', 'phil', 'one', 'another', 'could', 'come', 'things', 'thing',
                    'else', 'every', 'shall', 'thee', 'thy', 'thou', 'unto'] + stopwords_list

df['gensim_tokenized'] = df['sentence_str'].map(lambda x: simple_preprocess(x.lower(),deacc=True,
                                                        max_len=500))

def get_average_word_length(input, df, classifier_dict):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', '[']
  num_words = 0
  sum_word_lengths = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    sentence_list = [re.sub("[',]", '', word) for word in sentence_list]
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    for word in no_punctuation_tokens:
      num_words += 1
      sum_word_lengths += len(word)
  return round((sum_word_lengths / num_words), 2)

def get_average_sentence_length(input, df, classifier_dict):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', ']']
  num_sentences = 0
  sum_sentence_lengths = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    num_sentences += 1
    sum_sentence_lengths += len(no_punctuation_tokens)
  return round(sum_sentence_lengths / num_sentences, 2)

def make_word_cloud(input, df, classifier, stopwords=stopwords.words('english')):
    text = ''
    for sentence in df[df[classifier[input]]==input]['sentence_str']:
      text += sentence
    cloud = wordcloud.WordCloud(width=500, 
                            height=400, 
                            background_color='#D1D1D1', 
                            max_words=30, 
                            stopwords=stopwords, 
                            color_func=lambda *args, **kwargs: (95,95,95)).generate(text)
    return cloud

def get_num_unique_words(input, df, classifier_dict):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', ']']
  word_list = []
  num_words = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    num_words += len(no_punctuation_tokens)
    for word in no_punctuation_tokens:
      word_list.append(word)
  num_unique_words = len(set(word_list))
  return num_unique_words, num_words

def plot_word_frequency(input, df, classifier_dict, stopwords):
  word_list = []
  for sentence in df[df[classifier_dict[input]]==input]['gensim_tokenized'][:50]:
    for word in sentence:
      word_list.append(word)
  cleaned_words = [x.lower() for x in word_list if x.lower() not in stopwords]
  freq_dist = FreqDist(cleaned_words)
  freq_dict = {'words': [x[0] for x in freq_dist.most_common(7)], 
              'frequency': [x[1] for x in freq_dist.most_common(7)]}
  freq_df = pd.DataFrame(freq_dict)
  fig = px.bar(freq_df,
              x='words',
              y='frequency')
  fig.update_xaxes(title_text='Words')
  fig.update_yaxes(title_text='Count')
  fig.update_layout(title_text=f'{input.title()} Word Frequency Chart', title_x=0.5)
  return fig

def plot_ngram_frequency(input, df, classifier_dict, stopwords): 
  word_list = []
  for sent in df[df[classifier_dict[input]]==input]['gensim_tokenized']:
    for word in sent:
      word_list.append(word)
  cleaned = [word.lower() for word in word_list if word not in custom_stopwords]
  bigram_finder = BigramCollocationFinder.from_words(cleaned, window_size=3)
  top_10 = sorted(bigram_finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:7]
  bigram_df = pd.DataFrame(top_10, columns=['bigram', 'frequency'])
  bigram_df['bigram'] = bigram_df['bigram'].apply(lambda x: ', '.join(x))
  fig = px.bar(bigram_df,
              x='bigram',
              y='frequency')
  fig.update_xaxes(title_text='Phrases')
  fig.update_yaxes(title_text='Count')
  fig.update_layout(title_text=f'{input.title()} N-gram Frequency Chart', title_x=0.5)
  return fig

def get_title_list(input, df, classifier_dict):
  title_list = list(df[df[classifier_dict[input]]==input]['title'].unique())
  title_list = [title.title() for title in title_list] 
  return ', '.join(title_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kcali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
stats_dict_master = {}
for option in classifier_dict.keys():
    stats_dict = {}
    stats_dict['title_list'] = get_title_list(option, df, classifier_dict)
    stats_dict['ngram_chart'] = plot_ngram_frequency(option, df, classifier_dict, custom_stopwords)
    stats_dict['word_freq_chart'] = plot_word_frequency(option, df, classifier_dict, custom_stopwords)
    stats_dict['num_unique'] = get_num_unique_words(option, df, classifier_dict)
    stats_dict['mean_sent_length'] = get_average_sentence_length(option, df, classifier_dict)
    stats_dict['mean_word_length'] = get_average_word_length(option, df, classifier_dict)
    stats_dict_master[option] = stats_dict

In [5]:
stats_dict_master['Stoicism']

{'title_list': 'Enchiridion, Meditations',
 'ngram_chart': Figure({
     'data': [{'alignmentgroup': 'True',
               'hoverlabel': {'namelength': 0},
               'hovertemplate': 'bigram=%{x}<br>frequency=%{y}',
               'legendgroup': '',
               'marker': {'color': '#636efa'},
               'name': '',
               'offsetgroup': '',
               'orientation': 'v',
               'showlegend': False,
               'textposition': 'auto',
               'type': 'bar',
               'x': array(['according, nature', 'nature, universe', 'doth, happen',
                           'whatsoever, doth', 'nature, doth', 'man, man', 'man, doth'],
                          dtype=object),
               'xaxis': 'x',
               'y': array([44, 29, 26, 25, 24, 22, 20], dtype=int64),
               'yaxis': 'y'}],
     'layout': {'barmode': 'relative',
                'height': 600,
                'legend': {'tracegroupgap': 0},
                'margin': {'t': 60

In [10]:
import pickle

for option in stats_dict_master.keys():
    dict_pkl = open(f'../apps/stats_app/stats_pickles/{option.title()}_stats.pkl', 'wb')
    pickle.dump(stats_dict_master[option], dict_pkl)
    dict_pkl.close()


In [6]:
classifier_dict2 = {}
for author in df['author'].unique():
  classifier_dict2[author] = 'AUTHOR'
for title in df['title'].unique():
  classifier_dict2[title] = 'TITLE'
for school in df['school'].unique():
  classifier_dict2[school] = 'SCHOOL'

classifier_dict2

{'Plato': 'SCHOOL',
 'Aristotle': 'SCHOOL',
 'Locke': 'AUTHOR',
 'Hume': 'AUTHOR',
 'Berkeley': 'AUTHOR',
 'Spinoza': 'AUTHOR',
 'Leibniz': 'AUTHOR',
 'Descartes': 'AUTHOR',
 'Malebranche': 'AUTHOR',
 'Russell': 'AUTHOR',
 'Moore': 'AUTHOR',
 'Wittgenstein': 'AUTHOR',
 'Lewis': 'AUTHOR',
 'Quine': 'AUTHOR',
 'Popper': 'AUTHOR',
 'Kripke': 'AUTHOR',
 'Foucault': 'AUTHOR',
 'Derrida': 'AUTHOR',
 'Deleuze': 'AUTHOR',
 'Merleau-Ponty': 'AUTHOR',
 'Husserl': 'AUTHOR',
 'Heidegger': 'AUTHOR',
 'Kant': 'AUTHOR',
 'Fichte': 'AUTHOR',
 'Hegel': 'AUTHOR',
 'Marx': 'AUTHOR',
 'Lenin': 'AUTHOR',
 'Smith': 'AUTHOR',
 'Ricardo': 'AUTHOR',
 'Keynes': 'AUTHOR',
 'Epictetus': 'AUTHOR',
 'Marcus Aurelius': 'AUTHOR',
 'Nietzsche': 'SCHOOL',
 'Mary Wollstonecraft': 'AUTHOR',
 'Plato - Complete Works': 'TITLE',
 'Aristotle - Complete Works': 'TITLE',
 'Second Treatise On Government': 'TITLE',
 'Essay Concerning Human Understanding': 'TITLE',
 'A Treatise Of Human Nature': 'TITLE',
 'Dialogues Concerning Na

In [8]:
all_options = sorted([x.title() for x in list(classifier_dict.keys())])

In [9]:
dropdown_menu = []
for source in all_options:
    dropdown_menu.append({'label': source, 'value': source})

dropdown_menu

[{'label': 'A General Theory Of Employment, Interest, And Money',
  'value': 'A General Theory Of Employment, Interest, And Money'},
 {'label': 'A Treatise Concerning The Principles Of Human Knowledge',
  'value': 'A Treatise Concerning The Principles Of Human Knowledge'},
 {'label': 'A Treatise Of Human Nature',
  'value': 'A Treatise Of Human Nature'},
 {'label': 'Analytic', 'value': 'Analytic'},
 {'label': 'Anti-Oedipus', 'value': 'Anti-Oedipus'},
 {'label': 'Aristotle', 'value': 'Aristotle'},
 {'label': 'Aristotle - Complete Works',
  'value': 'Aristotle - Complete Works'},
 {'label': 'Being And Time', 'value': 'Being And Time'},
 {'label': 'Berkeley', 'value': 'Berkeley'},
 {'label': 'Beyond Good And Evil', 'value': 'Beyond Good And Evil'},
 {'label': 'Capital', 'value': 'Capital'},
 {'label': 'Capitalism', 'value': 'Capitalism'},
 {'label': 'Communism', 'value': 'Communism'},
 {'label': 'Continental', 'value': 'Continental'},
 {'label': 'Critique Of Judgement', 'value': 'Critique

In [None]:
df = pd.DataFrame.from_dict(stats_dict_master, orient='index')

In [15]:
df

In [26]:
df.dtypes

title_list           object
ngram_chart          object
word_freq_chart      object
num_unique           object
mean_sent_length    float64
mean_word_length    float64
dtype: object

In [28]:
df = df.astype({'title_list': 'string', 'ngram_chart': 'string', 'word_freq_chart': 'string'})

In [29]:
import sqlalchemy
from sqlalchemy import create_engine 

engine = create_engine('url',  
                       echo = False)

df.to_sql('stats_database', con = engine, dtype={'ngram_chart': sqlalchemy.types.String(), 'word_freq_chart': sqlalchemy.types.String()}, if_exists='append')

print(engine.execute("SELECT * FROM stats_database").fetchone()) 

    

('Plato', 'Plato - Complete Works', "Figure({\n    'data': [{'alignmentgroup': 'True',\n              'hoverlabel': {'namelength': 0},\n              'hovertemplate': 'bigram=%{x}<br>fre ... (920 characters truncated) ... 'domain': [0.0, 0.98], 'title': {'text': 'Phrases'}},\n               'yaxis': {'anchor': 'x', 'domain': [0.0, 1.0], 'title': {'text': 'Count'}}}\n})", "Figure({\n    'data': [{'alignmentgroup': 'True',\n              'hoverlabel': {'namelength': 0},\n              'hovertemplate': 'words=%{x}<br>freq ... (862 characters truncated) ... , 'domain': [0.0, 0.98], 'title': {'text': 'Words'}},\n               'yaxis': {'anchor': 'x', 'domain': [0.0, 1.0], 'title': {'text': 'Count'}}}\n})", '(25533,795612)', '20.72', '4.46')


In [34]:
query = """select * from stats_database where index = 'Plato'"""

In [37]:
connection = engine.raw_connection('url')
c = connection.cursor()

In [40]:
print(c.execute(query).fetchall())

AttributeError: 'NoneType' object has no attribute 'fetchall'

In [50]:
results = pd.read_sql(query, connection)

In [51]:
results

Unnamed: 0,index,title_list,ngram_chart,word_freq_chart,num_unique,mean_sent_length,mean_word_length
0,Plato,Plato - Complete Works,Figure({\n 'data': [{'alignmentgroup': 'Tru...,Figure({\n 'data': [{'alignmentgroup': 'Tru...,"(25533,795612)",20.72,4.46


In [55]:
print(c.execute("""select ngram_chart from stats_database where index = 'Plato'"""))

None


In [59]:
import matplotlib as plt 

plt.fig(c.fetchall())

AttributeError: module 'matplotlib' has no attribute 'fig'

In [65]:
chart = results['ngram_chart'].astype('object')

In [71]:
from matplotlib.controllers import Controller

c = Controller(chart)

fig = c.figure
# plt.plot(chart)

ModuleNotFoundError: No module named 'matplotlib.controllers'

Now we get the updated dropdown menu