# Lexical Richness Index (LRI)

## John Locke

In [140]:
with open("LockeComplete.txt", "r", encoding="utf-8") as file:
    Locke = file.read()

#For accurate comparison, Jockers recommends comparing random 10,000 word chunks of each corpus

len(Locke)

2095552

In [141]:
# Import Lexical Richness module
# Documentation: https://pypi.org/project/lexicalrichness/

from lexicalrichness import LexicalRichness

locke = LexicalRichness(Locke)

# Return word count
print("Lexical Richness of John Locke")

# Return (unique) word count
print("Unique Word Count: %s" % locke.terms)

# Return Type Token Ratio (TTR) of text
print("Type Token Ratio: %s" % locke.ttr)

# Return Root Type Token Ratio (RTTR) of text
print("Root Type Token Ratio: %s" % locke.rttr)

# Return Corrected Type Token Ratio (CTTR) of text
print("Corrected Type Token Ratio: %s" % locke.cttr)

# Return Mean Segmental Type Token Ratio (MSTTR) of text
print("Mean Segmental Type Token Ratio: %s" % locke.msttr(segment_window=25))

# Return Moving Average Type Token Ratio (MATTR) of text
print("Moving Average Type Token Ratio: %s" % locke.mattr(window_size=25))

# Return Measure of Textual Lexical Diversity (MTLD)
print("Measure of Textual Lexical Diversity: %s" % locke.mtld(threshold=0.72))

# Return hypergeometric distribution diversity (HD-D) measure.
print("Hypergeometric Distribution Diversity: %s" % locke.hdd(draws=42))

Lexical Richness of John Locke
Unique Word Count: 11102
Type Token Ratio: 0.028750411108607772
Root Type Token Ratio: 17.865807122203112
Corrected Type Token Ratio: 12.63303336748074
Mean Segmental Type Token Ratio: 0.8731943545253649
Moving Average Type Token Ratio: 0.8739445830012871
Measure of Textual Lexical Diversity: 66.38223299845599
Hypergeometric Distribution Diversity: 0.8476594498588272


### Word Frequency and Count

In [142]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# Write out new file with stopwords removed for each text file.

stop_words = set(stopwords.words('english')) 

print(stop_words)

{'against', 'when', "hadn't", 'out', "it's", 'should', "don't", 'haven', 'himself', 'that', "weren't", 'isn', 'doesn', 'he', "you'd", 'both', "shan't", 'an', 'theirs', 'hadn', "that'll", 'down', "you'll", 'your', 'is', 'why', 'myself', 'during', 'each', 'will', 'shan', 'who', 'for', 'over', 'once', 'ourselves', 'before', 'be', 'can', "mustn't", 'only', "mightn't", 'the', "haven't", 'being', 'because', 'mustn', 'you', "isn't", 'while', 'll', "you're", "she's", "should've", 'couldn', 'its', 'through', 'yourselves', 'of', 'don', 'am', 'been', 'a', 'her', 'did', 'herself', 'won', 'most', 'too', 'which', 'so', 'from', 'further', "couldn't", 'hasn', 'doing', 'again', 'nor', 'here', 'o', 'very', 'these', 'by', "aren't", 'where', 'some', 'my', 'own', 'hers', 'or', 'shouldn', 'aren', 'all', 't', 'we', 'now', "you've", 'until', 'not', 'in', 'than', 'on', 'are', "hasn't", 'this', 'has', 'me', "shouldn't", 'they', 'but', 'more', 'itself', 'them', 'whom', 'below', 'have', 'off', 'had', 'our', 'up',

In [143]:
file = open("Locke_HumanUnderstandingCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Locke_HumanUnderstandingSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Locke_TwoTreatisesCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Locke_TwoTreatisesSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("LockeComplete.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('LockeCompleteSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 

In [144]:
with open("Locke_HumanUnderstandingSTOPWORDS.txt", "r", encoding="utf-8") as file:
    LockeHum = file.read()
    
with open("Locke_TwoTreatisesSTOPWORDS.txt", "r", encoding="utf-8") as file:
    LockeTwo = file.read()
    
with open("LockeCompleteSTOPWORDS.txt", "r", encoding="utf-8") as file:
    LockeCom = file.read()

In [145]:
LockeHum = nltk.word_tokenize(LockeHum)
LockeHumWord = nltk.Text(LockeHum)

LockeTwo = nltk.word_tokenize(LockeTwo)
LockeTwoWord = nltk.Text(LockeTwo)

LockeCom = nltk.word_tokenize(LockeCom)
LockeComWord = nltk.Text(LockeCom)

In [146]:
LockeHumFreq = nltk.FreqDist(LockeHumWord)
LockeTwoFreq = nltk.FreqDist(LockeTwoWord)
LockeComFreq = nltk.FreqDist(LockeComWord)

In [147]:
LockeHumFreq.tabulate(10)
LockeTwoFreq.tabulate(10)
LockeComFreq.tabulate(10)

    ideas       one      idea      mind       may knowledge       man        us    things       men 
    15798      9930      8556      5802      5778      5214      5178      5046      4962      4902 
     power      right        one government        men        may        man        god       adam     nature 
      3865       2845       2670       1695       1650       1620       1595       1430       1395       1385 
 ideas    one   idea    may    man  power    men     us   mind things 
 13170  10945   7130   6435   5910   5905   5735   5065   4890   4490 


In [148]:
from collections import Counter

LockeHumCount = Counter(LockeHumFreq)
LockeTwoCount = Counter(LockeTwoFreq)
LockeComCount = Counter(LockeComFreq)

print(Counter(LockeHumCount).most_common(10))
print(Counter(LockeTwoCount).most_common(10))
print(Counter(LockeComCount).most_common(10))

[('ideas', 15798), ('one', 9930), ('idea', 8556), ('mind', 5802), ('may', 5778), ('knowledge', 5214), ('man', 5178), ('us', 5046), ('things', 4962), ('men', 4902)]
[('power', 3865), ('right', 2845), ('one', 2670), ('government', 1695), ('men', 1650), ('may', 1620), ('man', 1595), ('god', 1430), ('adam', 1395), ('nature', 1385)]
[('ideas', 13170), ('one', 10945), ('idea', 7130), ('may', 6435), ('man', 5910), ('power', 5905), ('men', 5735), ('us', 5065), ('mind', 4890), ('things', 4490)]


## George Berkeley

In [149]:
with open("BerkeleyComplete.txt", "r", encoding="utf-8") as file:
    Berkeley = file.read()

#For accurate comparison, Jockers recommends comparing random 10,000 word chunks of each corpus

len(Berkeley)

1021784

In [150]:
berkeley = LexicalRichness(Berkeley)

# Return word count
print("Lexical Richness of George Berkeley")

# Return (unique) word count
print("Unique Word Count: %s" % berkeley.terms)

# Return Type Token Ratio (TTR) of text
print("Type Token Ratio: %s" % berkeley.ttr)

# Return Root Type Token Ratio (RTTR) of text
print("Root Type Token Ratio: %s" % berkeley.rttr)

# Return Corrected Type Token Ratio (CTTR) of text
print("Corrected Type Token Ratio: %s" % berkeley.cttr)

# Return Mean Segmental Type Token Ratio (MSTTR) of text
print("Mean Segmental Type Token Ratio: %s" % berkeley.msttr(segment_window=25))

# Return Moving Average Type Token Ratio (MATTR) of text
print("Moving Average Type Token Ratio: %s" % berkeley.mattr(window_size=25))

# Return Measure of Textual Lexical Diversity (MTLD)
print("Measure of Textual Lexical Diversity: %s" % berkeley.mtld(threshold=0.72))

# Return hypergeometric distribution diversity (HD-D) measure.
print("Hypergeometric Distribution Diversity: %s" % berkeley.hdd(draws=42))

Lexical Richness of George Berkeley
Unique Word Count: 9531
Type Token Ratio: 0.051520589856968335
Root Type Token Ratio: 22.1594842432482
Corrected Type Token Ratio: 15.66912157599725
Mean Segmental Type Token Ratio: 0.8864549263414692
Moving Average Type Token Ratio: 0.885777369303587
Measure of Textual Lexical Diversity: 80.28268012005267
Hypergeometric Distribution Diversity: 0.8592101099541077


### Word Frequency and Count

In [151]:
file = open("Berkeley_AlciphronCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_AlciphronSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Berkeley_HumanKnowledgeCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_HumanKnowledgeSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Berkeley_TheoryOfVisionCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_TheoryOfVisionSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Berkeley_ThreeDialoguesCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_ThreeDialoguesSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("BerkeleyComplete.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('BerkeleyCompleteSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 

In [152]:
with open("Berkeley_AlciphronSTOPWORDS.txt", "r") as file:
    BerkeleyAlc = file.read()
    
with open("Berkeley_HumanKnowledgeSTOPWORDS.txt", "r") as file:
    BerkeleyHum = file.read()
    
with open("Berkeley_TheoryOfVisionSTOPWORDS.txt", "r") as file:
    BerkeleyThe = file.read()
    
with open("Berkeley_ThreeDialoguesSTOPWORDS.txt", "r") as file:
    BerkeleyThr = file.read()
    
with open("BerkeleyCompleteSTOPWORDS.txt", "r") as file:
    BerkeleyCom = file.read()

In [160]:
BerkeleyAlcToken = nltk.word_tokenize(BerkeleyAlc)
BerkeleyAlcWord = nltk.Text(BerkeleyAlcToken)

BerkeleyHumToken = nltk.word_tokenize(BerkeleyHum)
BerkeleyHumWord = nltk.Text(BerkeleyHumToken)

BerkeleyTheToken = nltk.word_tokenize(BerkeleyThe)
BerkeleyTheWord = nltk.Text(BerkeleyTheToken)

BerkeleyThrToken = nltk.word_tokenize(BerkeleyThr)
BerkeleyThrWord = nltk.Text(BerkeleyThrToken)

BerkeleyComToken = nltk.word_tokenize(BerkeleyCom)
BerkeleyComWord = nltk.Text(BerkeleyComToken)

print(BerkeleyAlcWord[:10])
print(BerkeleyHumWord[:10])
print(BerkeleyTheWord[:10])
print(BerkeleyThrWord[:10])
print(BerkeleyComWord[:10])

['alciphron', 'minute', 'philosopher', 'defence', 'christian', 'religion', 'socalled', 'freethinkers', 'george', 'berkeley']
['treatise', 'concerning', 'principles', 'human', 'knowledge', 'wherein', 'chief', 'causes', 'error', 'difficulty']
['essay', 'towards', 'new', 'theory', 'vision', 'george', 'berkeley', 'contents', 'sect', 'design']
['three', 'dialogues', 'hylas', 'philonous', 'opposition', 'sceptics', 'atheists', 'george', 'berkeley', 'contents']
['treatise', 'concerning', 'principles', 'human', 'knowledge', 'wherein', 'chief', 'causes', 'error', 'difficulty']


In [154]:
BerkeleyAlcFreq = nltk.FreqDist(BerkeleyAlcWord)
BerkeleyHumFreq = nltk.FreqDist(BerkeleyHumWord)
BerkeleyTheFreq = nltk.FreqDist(BerkeleyTheWord)
BerkeleyThrFreq = nltk.FreqDist(BerkeleyThrWord)
BerkeleyComFreq = nltk.FreqDist(BerkeleyComWord)

In [155]:
BerkeleyAlcFreq.tabulate(10)
BerkeleyHumFreq.tabulate(10)
BerkeleyTheFreq.tabulate(10)
BerkeleyThrFreq.tabulate(10)
BerkeleyComFreq.tabulate(10)

     men      man   things      one      god religion    would     good     dont     cant 
    1137      888      780      690      666      621      612      585      573      573 
  ideas    mind     may  things    idea without   sense     one  motion   thing 
   2096    1504    1496    1224    1184    1000     936     880     816     800 
 distance   visible     sight    object   objects     ideas  tangible       one       eye magnitude 
     1592      1392      1272      1200      1104      1040      1040       976       920       816 
   things      mind     ideas      dont    matter     exist existence perceived  sensible     think 
     2096      1736      1432      1344      1200      1128      1088      1064      1056      1048 
 ideas things   mind    one    may   idea  sense  think    see    men 
  2517   2500   2285   1771   1664   1476   1388   1308   1301   1284 


In [156]:
BerkeleyAlcCount = Counter(BerkeleyAlcFreq)
print(Counter(BerkeleyAlcCount).most_common(50))

BerkeleyHumCount = Counter(BerkeleyHumFreq)
print(Counter(BerkeleyHumCount).most_common(50))

BerkeleyTheCount = Counter(BerkeleyTheFreq)
print(Counter(BerkeleyTheCount).most_common(50))

BerkeleyThrCount = Counter(BerkeleyThrFreq)
print(Counter(BerkeleyThrCount).most_common(50))

BerkeleyComCount = Counter(BerkeleyComFreq)
print(Counter(BerkeleyComCount).most_common(50))

[('men', 1137), ('man', 888), ('things', 780), ('one', 690), ('god', 666), ('religion', 621), ('would', 612), ('good', 585), ('dont', 573), ('cant', 573), ('think', 558), ('see', 519), ('may', 516), ('alciphron', 495), ('sense', 492), ('said', 465), ('us', 432), ('world', 429), ('make', 423), ('know', 417), ('way', 414), ('well', 393), ('much', 378), ('mind', 378), ('reason', 375), ('human', 369), ('true', 357), ('time', 348), ('virtue', 342), ('people', 342), ('philosophers', 342), ('minute', 339), ('great', 339), ('natural', 336), ('without', 336), ('even', 333), ('nature', 333), ('isnt', 333), ('something', 330), ('faith', 324), ('euphranor', 321), ('many', 321), ('like', 318), ('truth', 315), ('seems', 315), ('thought', 303), ('knowledge', 303), ('say', 297), ('doesnt', 294), ('ideas', 282)]
[('ideas', 2096), ('mind', 1504), ('may', 1496), ('things', 1224), ('idea', 1184), ('without', 1000), ('sense', 936), ('one', 880), ('motion', 816), ('thing', 800), ('exist', 752), ('nature', 7

## David Hume

In [157]:
with open("HumeComplete.txt", "r", encoding="utf-8") as file:
    Hume = file.read()

#For accurate comparison, Jockers recommends comparing random 10,000 word chunks of each corpus

len(Hume)

2151680

In [158]:
hume = LexicalRichness(Hume)

# Return word count
print("Lexical Richness of David Hume")

# Return (unique) word count
print("Unique Word Count: %s" % hume.terms)

# Return Type Token Ratio (TTR) of text
print("Type Token Ratio: %s" % hume.ttr)

# Return Root Type Token Ratio (RTTR) of text
print("Root Type Token Ratio: %s" % hume.rttr)

# Return Corrected Type Token Ratio (CTTR) of text
print("Corrected Type Token Ratio: %s" % hume.cttr)

# Return Mean Segmental Type Token Ratio (MSTTR) of text
print("Mean Segmental Type Token Ratio: %s" % hume.msttr(segment_window=25))

# Return Moving Average Type Token Ratio (MATTR) of text
print("Moving Average Type Token Ratio: %s" % hume.mattr(window_size=25))

# Return Measure of Textual Lexical Diversity (MTLD)
print("Measure of Textual Lexical Diversity: %s" % hume.mtld(threshold=0.72))

# Return hypergeometric distribution diversity (HD-D) measure.
print("Hypergeometric Distribution Diversity: %s" % hume.hdd(draws=42))

Lexical Richness of David Hume
Unique Word Count: 18158
Type Token Ratio: 0.04855599529361429
Root Type Token Ratio: 29.693092842300015
Corrected Type Token Ratio: 20.996187303192077
Mean Segmental Type Token Ratio: 0.8842465570262564
Moving Average Type Token Ratio: 0.8841193145365478
Measure of Textual Lexical Diversity: 86.84633597280066
Hypergeometric Distribution Diversity: 0.8555445898281581


### Word Frequency and Count

In [163]:
file = open("Hume_EssaysMoralPoliticalLiteraryCLEAN.txt", encoding="utf-8") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_EssaysMoralPoliticalLiterarySTOPWORDS.txt','a', encoding="utf-8") 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Hume_HumanUnderstandingCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_HumanUnderstandingSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Hume_NaturalReligionCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_NaturalReligionSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Hume_SourcesofMoralsCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_SourcesofMoralsSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("HumeComplete.txt", encoding="utf-8") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('HumeCompleteSTOPWORDS.txt','a', encoding="utf-8") 
        appendFile.write(" "+r) 
        appendFile.close() 

In [165]:
with open("Hume_EssaysMoralPoliticalLiterarySTOPWORDS.txt", "r", encoding="utf-8") as file:
    HumeEss = file.read()
    
with open("Hume_HumanUnderstandingSTOPWORDS.txt", "r") as file:
    HumeHum = file.read()
    
with open("Hume_NaturalReligionSTOPWORDS.txt", "r") as file:
    HumeNat = file.read()
    
with open("Hume_SourcesofMoralsSTOPWORDS.txt", "r") as file:
    HumeSou = file.read()
    
with open("HumeCompleteSTOPWORDS.txt", "r", encoding="utf-8") as file:
    HumeCom = file.read()

In [166]:
HumeEssToken = nltk.word_tokenize(HumeEss)
HumeEssWord = nltk.Text(HumeEssToken)
HumeHumToken = nltk.word_tokenize(HumeHum)
HumeHumWord = nltk.Text(HumeHumToken)
HumeNatToken = nltk.word_tokenize(HumeNat)
HumeNatWord = nltk.Text(HumeNatToken)
HumeSouToken = nltk.word_tokenize(HumeSou)
HumeSouWord = nltk.Text(HumeSouToken)
HumeComToken = nltk.word_tokenize(HumeCom)
HumeComWord = nltk.Text(HumeComToken)

print(HumeEssWord[:10])
print(HumeHumWord[:10])
print(HumeNatWord[:10])
print(HumeSouWord[:10])
print(HumeComWord[:10])

['david', 'hume', 'essays', 'moral', 'political', 'literary', 'lf', 'ed', 'part', 'essays']
['enquiry', 'concerning', 'human', 'understanding', 'david', 'hume', 'contents', 'sect', 'different', 'species']
['dialogues', 'concerning', 'natural', 'religion', 'david', 'hume', 'contents', 'letter', 'pamphilus', 'hermippus']
['enquiry', 'sources', 'morals', 'david', 'hume', 'sources', 'morals', 'david', 'hume', 'contents']
['david', 'hume', 'essays', 'moral', 'political', 'literary', 'lf', 'ed', 'part', 'essays']


In [167]:
HumeEssFreq = nltk.FreqDist(HumeEssWord)
HumeHumFreq = nltk.FreqDist(HumeHumWord)
HumeNatFreq = nltk.FreqDist(HumeNatWord)
HumeSouFreq = nltk.FreqDist(HumeSouWord)
HumeComFreq = nltk.FreqDist(HumeComWord)

In [168]:
HumeEssFreq.tabulate(10)
HumeHumFreq.tabulate(10)
HumeNatFreq.tabulate(10)
HumeSouFreq.tabulate(10)
HumeComFreq.tabulate(10)

       may        one      every      great       must      would        men       much government     people 
      1646       1298       1152       1126       1056       1018        904        882        874        824 
       may        one     nature         us       must experience       mind      cause      human     effect 
       548        380        368        322        312        294        282        276        256        236 
    human       one     world    nature     would       god    reason     cause        us cleanthes 
      276       256       230       228       212       206       200       198       196       196 
       us     would       one   society       man     human   justice sentiment      even   general 
      330       304       302       262       262       240       232       226       214       208 
   may    one  every  would   must     us  great nature   even    men 
  1241   1118    875    860    838    744    705    691    663    645 


In [169]:
HumeEssCount = Counter(HumeEssFreq)
HumeHumCount = Counter(HumeHumFreq)
HumeNatCount = Counter(HumeNatFreq)
HumeSouCount = Counter(HumeSouFreq)
HumeComCount = Counter(HumeComFreq)

print(Counter(HumeEssCount).most_common(50))
print(Counter(HumeHumCount).most_common(50))
print(Counter(HumeNatCount).most_common(50))
print(Counter(HumeSouCount).most_common(50))
print(Counter(HumeComCount).most_common(50))

[('may', 1646), ('one', 1298), ('every', 1152), ('great', 1126), ('must', 1056), ('would', 1018), ('men', 904), ('much', 882), ('government', 874), ('people', 824), ('even', 770), ('man', 724), ('though', 714), ('upon', 704), ('us', 640), ('without', 624), ('nature', 594), ('life', 574), ('time', 570), ('lib', 568), ('public', 562), ('first', 548), ('never', 530), ('state', 520), ('power', 514), ('human', 500), ('yet', 500), ('present', 498), ('among', 496), ('could', 482), ('many', 482), ('money', 478), ('ancient', 470), ('ever', 466), ('reason', 464), ('part', 454), ('hume', 452), ('general', 446), ('well', 442), ('liberty', 440), ('see', 440), ('interest', 438), ('two', 434), ('authority', 430), ('always', 426), ('whole', 420), ('make', 416), ('essay', 408), ('therefore', 408), ('still', 400)]
[('may', 548), ('one', 380), ('nature', 368), ('us', 322), ('must', 312), ('experience', 294), ('mind', 282), ('cause', 276), ('human', 256), ('effect', 236), ('every', 234), ('never', 234), (