<a href="https://colab.research.google.com/github/kcalizadeh/phil_nlp/blob/master/dash_text_analysis_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports and Mounting Drive

In [None]:
# this cell mounts drive, sets the correct directory, then imports all functions
# and relevant libraries via the functions.py file
from google.colab import drive
import sys

# install relevent libraries not included with colab
!pip install lime
!pip install symspellpy

drive.mount('/gdrive',force_remount=True)

drive_path = '/gdrive/MyDrive/Colab_Projects/Phil_NLP'

sys.path.append(drive_path)

Mounted at /gdrive


In [None]:
%load_ext autoreload
%autoreload 2
import plotly.express as px
from functions import *

np.random_seed=17

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [None]:
df = pd.read_csv('/gdrive/MyDrive/Colab_Projects/Phil_NLP/phil_nlp.csv')

df.sample(5)

Unnamed: 0,title,author,school,sentence_spacy,sentence_str,sentence_length,sentence_lowered,lemmatized_str,tokenized_txt
131428,analysis of mind,Russell,analytic,"We may believe that this is the case, but if w...","We may believe that this is the case, but if w...",166,"we may believe that this is the case, but if w...","-PRON- may believe that this be the case , bu...","['We', 'may', 'believe', 'that', 'this', 'is',..."
42265,complete works vol 1,Aristotle,aristotle,And it is easier to define the particular than...,And it is easier to define the particular than...,129,and it is easier to define the particular than...,and -PRON- be easy to define the particular t...,"['And', 'it', 'is', 'easier', 'to', 'define', ..."
6832,complete works,Plato,plato,Given a completely adequate response to the ma...,Given a completely adequate response to the ma...,61,given a completely adequate response to the ma...,give a completely adequate response to the ma...,"['Given', 'a', 'completely', 'adequate', 'resp..."
320691,"a general theory of employment, interest, and ...",Keynes,capitalism,"That is to say, the real wage of an employed p...","That is to say, the real wage of an employed p...",427,"that is to say, the real wage of an employed p...","that be to say , the real wage of an employed...","['That', 'is', 'to', 'say', ',', 'the', 'real'..."
186948,madness and civilization,Foucault,continental,"Through time, they connect with a kind of madn...","Through time, they connect with a kind of madn...",146,"through time, they connect with a kind of madn...","through time , -PRON- connect with a kind of ...","['Through', 'time', ',', 'they', 'connect', 'w..."


In [None]:
stopwords_list = stopwords.words('english') + list(string.punctuation) 
stopwords_list += ['“','”','...',"''",'’','``', "'", "‘"]
custom_stopwords = ['–', 'also', 'something', 'cf', 'thus', 'two', 'now', 'would', 
                    'make', 'eb', 'u', 'well', 'even', 'said', 'eg', 'us',
                    'n', 'sein', 'e', 'da', 'therefore', 'however', 'would', 
                    'thing', 'must', 'merely', 'way', 'since', 'latter', 'first',
                    'B', 'mean', 'upon', 'yet', 'cannot', 'c', 'C', 'let', 'may', 
                    'might', "'s", 'b', 'ofthe', 'p.', '_', '-', 'eg', 'e.g.',
                    'ie', 'i.e.', 'f', 'l', "n't", 'e.g', 'i.e', '—', '--', 
                    'hyl', 'phil', 'one'] + stopwords_list

In [None]:
classifier_dict = {}
for author in df['author'].unique():
  classifier_dict[author] = 'author'
for title in df['title'].unique():
  classifier_dict[title] = 'title'
for school in df['school'].unique():
  classifier_dict[school] = 'school'

In [None]:
classifier_dict

### Word Cloud Function

In [None]:
# this function takes as inputs an author, title, or school, and returns 
# a word cloud for that group's texts

def make_word_cloud(input, stopwords=stopwords.words('english')):
    text = ''
    for sentence in df[df[classifier_dict[input]]==input]['sentence_str']:
      text += sentence + ' '
    cloud = wordcloud.WordCloud(width=1100, 
                            height=800, 
                            background_color='#D1D1D1', 
                            max_words=30, 
                            stopwords=stopwords, 
                            color_func=lambda *args, **kwargs: (95,95,95)).generate(text)
    return cloud

In [None]:
def make_word_cloud(input, df, classifier, stopwords=stopwords.words('english')):
    text = ''
    for sentence in df[df[classifier[input]]==input]['sentence_str']:
      text += sentence
    cloud = wordcloud.WordCloud(width=500, 
                            height=400, 
                            background_color='#D1D1D1', 
                            max_words=30, 
                            stopwords=stopwords, 
                            color_func=lambda *args, **kwargs: (95,95,95)).generate(text)
    return cloud

In [None]:
# testing it out
cloud = make_word_cloud('Plato', df, classifier_dict, stopwords=custom_stopwords)

In [None]:
px.imshow(cloud.to_image())

### Average Word Length Function

In [None]:
def get_average_word_length(input):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', '[']
  num_words = 0
  sum_word_lengths = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    sentence_list = [re.sub("[',]", '', word) for word in sentence_list]
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    for word in no_punctuation_tokens:
      num_words += 1
      sum_word_lengths += len(word)
  return sum_word_lengths / num_words

In [None]:
# test it out
for school in df['school'].unique():
  print(school)
  print(get_average_word_length(school))
  print()

plato
4.219800084945763

aristotle
4.316441579485012

empiricism
4.500572602099245

rationalism
4.439002183955361

analytic
4.544839803887312

continental
4.8112526414942645

phenomenology
4.7367242048362455

german_idealism
4.725714199829411

communism
4.769301039274698

capitalism
4.620634322859528



### Average Sentence Length Function

In [None]:
def get_average_sentence_length(input):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', ']']
  num_sentences = 0
  sum_sentence_lengths = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    num_sentences += 1
    sum_sentence_lengths += len(no_punctuation_tokens)
  return sum_sentence_lengths / num_sentences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# testing
for school in df['school'].unique():
  print(school)
  print(get_average_sentence_length(school))
  print()

plato
23.9527197852321

aristotle
30.391855346532697

empiricism
36.65442944785276

rationalism
32.76701012999435

analytic
23.665816848726205

continental
32.26270636792453

phenomenology
27.58667644286163

german_idealism
34.10015605050362

communism
29.05610799400033

capitalism
36.516657534246576



### Median Sentence Length Function

In [None]:
def get_median_sentence_length(input):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', ']']
  sentence_lengths = []
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    sentence_lengths.append(len(no_punctuation_tokens))
  return np.median(sentence_lengths)

In [None]:
for author in df['author'].unique():
  print(author)
  print(get_median_sentence_length(author))
  print()

plato
20.0

aristotle
26.0

locke
34.0

hume
33.0

berkeley
23.0

spinoza
25.0

leibniz
28.0

descartes
42.0

malebranche
29.0

russell
26.0

moore
30.0

wittgenstein
16.0

lewis
19.0

quine
21.0

popper
23.0

kripke
21.0

foucault
30.0

derrida
23.0

deleuze
26.0

merleau-ponty
29.0

husserl
29.0

heidegger
20.0

kant
33.0

fichte
27.0

hegel
29.0

marx
24.0

lenin
28.0

smith
33.0

ricardo
33.0

keynes
32.0



### N-gram Frequency Function

### Word Frequency Function

### Number of Unique Words Function

In [None]:
def get_num_unique_words(input):
  punctuations = list(string.punctuation) + ['“','”','...',"''",'’','``', "'", "‘", '[', ']']
  word_list = []
  num_words = 0
  for sentence in df[df[classifier_dict[input]]==input]['tokenized_txt']:
    sentence_list = sentence.split()
    no_punctuation_tokens = [word for word in sentence_list if word not in punctuations]
    no_punctuation_tokens = [word for word in no_punctuation_tokens if len(word) > 0]
    num_words += len(no_punctuation_tokens)
    for word in no_punctuation_tokens:
      word_list.append(word)
  num_unique_words = len(set(word_list))
  return num_unique_words, num_words

In [None]:
get_num_unique_words('hegel')

(24952, 759122)