# Lexical Richness Index (LRI)

In [213]:
# Import required modules

# Lexical Richness module
# Documentation: https://pypi.org/project/lexicalrichness/
from lexicalrichness import LexicalRichness

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

from collections import Counter

import random

## John Locke

In [214]:
with open("LockeComplete.txt", "r", encoding="utf-8") as file:
    Locke = file.read()

len(Locke)

2095552

In [215]:
# LRI of full text
locke = LexicalRichness(Locke)

print("Lexical Richness of John Locke")

# Return (unique) word count
print("Unique Word Count: %s" % locke.terms)

# Return Type Token Ratio (TTR) of text
print("Type Token Ratio: %s" % locke.ttr)

# Return Root Type Token Ratio (RTTR) of text
print("Root Type Token Ratio: %s" % locke.rttr)

# Return Corrected Type Token Ratio (CTTR) of text
print("Corrected Type Token Ratio: %s" % locke.cttr)

# Return Mean Segmental Type Token Ratio (MSTTR) of text
print("Mean Segmental Type Token Ratio: %s" % locke.msttr(segment_window=25))

# Return Moving Average Type Token Ratio (MATTR) of text
print("Moving Average Type Token Ratio: %s" % locke.mattr(window_size=25))

# Return Measure of Textual Lexical Diversity (MTLD)
print("Measure of Textual Lexical Diversity: %s" % locke.mtld(threshold=0.72))

# Return hypergeometric distribution diversity (HD-D) measure.
print("Hypergeometric Distribution Diversity: %s" % locke.hdd(draws=42))

Lexical Richness of John Locke
Unique Word Count: 11102
Type Token Ratio: 0.028750411108607772
Root Type Token Ratio: 17.865807122203112
Corrected Type Token Ratio: 12.63303336748074
Mean Segmental Type Token Ratio: 0.8731943545253649
Moving Average Type Token Ratio: 0.8739445830012871
Measure of Textual Lexical Diversity: 66.38223299845599
Hypergeometric Distribution Diversity: 0.8476594498588272


### LRI Mean Average Function

In [216]:
# For accurate comparison, Jockers recommends comparing random 10,000 word chunks of each corpus
# Use without stopwords as all vocabulary matters here
# Build a function to select 10,000 random words and find mean average of multiple LRIs
def LRI (times, text):
    
    # Empty variables for LRI mean averages
    UWQavg = []
    TTRavg = []
    RTTRavg = []
    CTTRavg = []
    MSTTRavg = []
    MATTRavg = []
    MTLDavg = []
    HDDavg = []
    
    # Tokenize text for randomization with NLTK
    textToke = nltk.word_tokenize(text)
    
    # Iterate through the function multiple times
    for i in range(times):
        
        # Pick 10000 random words via RANDOM
        textRand = random.sample(textToke, 10000)
    
        # Convert back to string
        textStr = ' '.join(textRand)
    
        # Perform LRI with LexicalRichness
        textLRI = LexicalRichness(textStr)
    
        # Unique Word Count
        UWQ = textLRI.terms
        # Type Token Ratio 
        TTR = textLRI.ttr
        # Root Type Token Ratio 
        RTTR = textLRI.rttr
        # Corrected Type Token Ratio 
        CTTR = textLRI.cttr
        # Mean Segmental Type Token Ratio 
        MSTTR = textLRI.msttr(segment_window=25)
        # Return Moving Average Type Token Ratio (MATTR) of text
        MATTR = textLRI.mattr(window_size=25)
        # Measure of Textual Lexical Diversity 
        MTLD = textLRI.mtld(threshold=0.72)
        # Hypergeometric Distribution Diversity measure
        HDD = textLRI.hdd(draws=42)
        
        # Append results for mean average
        UWQavg.append(UWQ)
        TTRavg.append(TTR)
        RTTRavg.append(RTTR)
        CTTRavg.append(CTTR)
        MSTTRavg.append(MSTTR)
        MATTRavg.append(MATTR)
        MTLDavg.append(MTLD)
        HDDavg.append(HDD)
                                         # Average results, round to 2 decimal places
    print("Unique Word Count: %s" % round(sum(UWQavg)/len(UWQavg), 2))
    print("Type Token Ratio: %s" % round(sum(TTRavg)/len(TTRavg), 2))
    print("Root Type Token Ratio: %s" % round(sum(RTTRavg)/len(RTTRavg), 2))
    print("Corrected Type Token Ratio: %s" % round(sum(CTTRavg)/len(CTTRavg), 2))
    print("Mean Segmental Type Token Ratio: %s" % round(sum(MSTTRavg)/len(MSTTRavg), 2))
    print("Moving Average Type Token Ratio: %s" % round(sum(MATTRavg)/len(MATTRavg), 2))
    print("Measure of Textual Lexical Diversity: %s" % round(sum(MTLDavg)/len(MTLDavg), 2))
    print("Hypergeometric Distribution Diversity: %s" % round(sum(HDDavg)/len(HDDavg), 2))
    return;

# Iterate through function multiple times and average results
LRI(10, Locke)

Unique Word Count: 1886.8
Type Token Ratio: 0.19
Root Type Token Ratio: 18.87
Corrected Type Token Ratio: 13.34
Mean Segmental Type Token Ratio: 0.9
Moving Average Type Token Ratio: 0.9
Measure of Textual Lexical Diversity: 106.85
Hypergeometric Distribution Diversity: 0.85


### Word Frequency and Count

#Write out new file with stopwords removed for each text file.

stop_words = set(stopwords.words('english')) 

print(stop_words)

#Build stopword files

file = open("Locke_HumanUnderstandingCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Locke_HumanUnderstandingSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Locke_TwoTreatisesCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Locke_TwoTreatisesSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("LockeComplete.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('LockeCompleteSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 

In [217]:
# Open .txt files with stopwords removed
with open("Locke_HumanUnderstandingSTOPWORDS.txt", "r", encoding="utf-8") as file:
    LockeHum = file.read()
    
with open("Locke_TwoTreatisesSTOPWORDS.txt", "r", encoding="utf-8") as file:
    LockeTwo = file.read()
    
with open("LockeCompleteSTOPWORDS.txt", "r", encoding="utf-8") as file:
    LockeCom = file.read()

In [218]:
# Tokenize, Word Frequency and Count function
def WordFreq (text):
    textToke = nltk.word_tokenize(text)
    textWord = nltk.Text(textToke)
    textFreq = nltk.FreqDist(textWord)
    textCount = Counter(textFreq)
    print(Counter(textCount).most_common(10))
    return;

WordFreq(LockeHum)
WordFreq(LockeTwo)
WordFreq(LockeCom)

[('ideas', 21319), ('one', 13407), ('idea', 11485), ('mind', 7894), ('may', 7831), ('knowledge', 7064), ('man', 7027), ('us', 6812), ('men', 6740), ('things', 6683)]
[('power', 5411), ('right', 3983), ('one', 3738), ('government', 2373), ('men', 2310), ('may', 2268), ('man', 2233), ('god', 2002), ('adam', 1953), ('nature', 1939)]
[('ideas', 18438), ('one', 15323), ('idea', 9982), ('may', 9009), ('man', 8274), ('power', 8267), ('men', 8029), ('us', 7091), ('mind', 6846), ('things', 6286)]


## George Berkeley

In [219]:
with open("BerkeleyComplete.txt", "r", encoding="utf-8") as file:
    Berkeley = file.read()

len(Berkeley)

1021784

In [220]:
berkeley = LexicalRichness(Berkeley)

# Return word count
print("Lexical Richness of George Berkeley")

# Return (unique) word count
print("Unique Word Count: %s" % berkeley.terms)

# Return Type Token Ratio (TTR) of text
print("Type Token Ratio: %s" % berkeley.ttr)

# Return Root Type Token Ratio (RTTR) of text
print("Root Type Token Ratio: %s" % berkeley.rttr)

# Return Corrected Type Token Ratio (CTTR) of text
print("Corrected Type Token Ratio: %s" % berkeley.cttr)

# Return Mean Segmental Type Token Ratio (MSTTR) of text
print("Mean Segmental Type Token Ratio: %s" % berkeley.msttr(segment_window=25))

# Return Moving Average Type Token Ratio (MATTR) of text
print("Moving Average Type Token Ratio: %s" % berkeley.mattr(window_size=25))

# Return Measure of Textual Lexical Diversity (MTLD)
print("Measure of Textual Lexical Diversity: %s" % berkeley.mtld(threshold=0.72))

# Return hypergeometric distribution diversity (HD-D) measure.
print("Hypergeometric Distribution Diversity: %s" % berkeley.hdd(draws=42))

Lexical Richness of George Berkeley
Unique Word Count: 9531
Type Token Ratio: 0.051520589856968335
Root Type Token Ratio: 22.1594842432482
Corrected Type Token Ratio: 15.66912157599725
Mean Segmental Type Token Ratio: 0.8864549263414692
Moving Average Type Token Ratio: 0.885777369303587
Measure of Textual Lexical Diversity: 80.28268012005267
Hypergeometric Distribution Diversity: 0.8592101099541077


In [221]:
# LRI Mean Average Function
LRI(10, Berkeley)

Unique Word Count: 2133.7
Type Token Ratio: 0.21
Root Type Token Ratio: 21.34
Corrected Type Token Ratio: 15.09
Mean Segmental Type Token Ratio: 0.91
Moving Average Type Token Ratio: 0.91
Measure of Textual Lexical Diversity: 125.97
Hypergeometric Distribution Diversity: 0.86


### Word Frequency and Count

file = open("Berkeley_AlciphronCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_AlciphronSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Berkeley_HumanKnowledgeCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_HumanKnowledgeSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Berkeley_TheoryOfVisionCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_TheoryOfVisionSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Berkeley_ThreeDialoguesCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Berkeley_ThreeDialoguesSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("BerkeleyComplete.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('BerkeleyCompleteSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 

In [222]:
# Open .txt files with stopwords removed
with open("Berkeley_AlciphronSTOPWORDS.txt", "r") as file:
    BerkeleyAlc = file.read()
    
with open("Berkeley_HumanKnowledgeSTOPWORDS.txt", "r") as file:
    BerkeleyHum = file.read()
    
with open("Berkeley_TheoryOfVisionSTOPWORDS.txt", "r") as file:
    BerkeleyThe = file.read()
    
with open("Berkeley_ThreeDialoguesSTOPWORDS.txt", "r") as file:
    BerkeleyThr = file.read()
    
with open("BerkeleyCompleteSTOPWORDS.txt", "r") as file:
    BerkeleyCom = file.read()

In [223]:
# Tokenize, Word Frequency and Count 
WordFreq(BerkeleyAlc)
WordFreq(BerkeleyHum)
WordFreq(BerkeleyThe)
WordFreq(BerkeleyThr)
WordFreq(BerkeleyCom)

[('men', 1895), ('man', 1480), ('things', 1300), ('one', 1150), ('god', 1110), ('religion', 1035), ('would', 1020), ('good', 975), ('dont', 955), ('cant', 955)]
[('ideas', 2620), ('mind', 1880), ('may', 1870), ('things', 1530), ('idea', 1480), ('without', 1250), ('sense', 1170), ('one', 1100), ('motion', 1020), ('thing', 1000)]
[('distance', 1990), ('visible', 1740), ('sight', 1590), ('object', 1500), ('objects', 1380), ('ideas', 1300), ('tangible', 1300), ('one', 1220), ('eye', 1150), ('magnitude', 1020)]
[('things', 2620), ('mind', 2170), ('ideas', 1790), ('dont', 1680), ('matter', 1500), ('exist', 1410), ('existence', 1360), ('perceived', 1330), ('sensible', 1320), ('think', 1310)]
[('things', 4004), ('ideas', 3847), ('mind', 3543), ('one', 2877), ('may', 2658), ('men', 2294), ('idea', 2268), ('sense', 2240), ('think', 2144), ('see', 2123)]


## David Hume

In [224]:
with open("HumeComplete.txt", "r", encoding="utf-8") as file:
    Hume = file.read()

len(Hume)

2151680

In [225]:
hume = LexicalRichness(Hume)

# Return word count
print("Lexical Richness of David Hume")

# Return (unique) word count
print("Unique Word Count: %s" % hume.terms)

# Return Type Token Ratio (TTR) of text
print("Type Token Ratio: %s" % hume.ttr)

# Return Root Type Token Ratio (RTTR) of text
print("Root Type Token Ratio: %s" % hume.rttr)

# Return Corrected Type Token Ratio (CTTR) of text
print("Corrected Type Token Ratio: %s" % hume.cttr)

# Return Mean Segmental Type Token Ratio (MSTTR) of text
print("Mean Segmental Type Token Ratio: %s" % hume.msttr(segment_window=25))

# Return Moving Average Type Token Ratio (MATTR) of text
print("Moving Average Type Token Ratio: %s" % hume.mattr(window_size=25))

# Return Measure of Textual Lexical Diversity (MTLD)
print("Measure of Textual Lexical Diversity: %s" % hume.mtld(threshold=0.72))

# Return hypergeometric distribution diversity (HD-D) measure.
print("Hypergeometric Distribution Diversity: %s" % hume.hdd(draws=42))

Lexical Richness of David Hume
Unique Word Count: 18158
Type Token Ratio: 0.04855599529361429
Root Type Token Ratio: 29.693092842300015
Corrected Type Token Ratio: 20.996187303192077
Mean Segmental Type Token Ratio: 0.8842465570262564
Moving Average Type Token Ratio: 0.8841193145365478
Measure of Textual Lexical Diversity: 86.84633597280066
Hypergeometric Distribution Diversity: 0.8555445898281581


In [226]:
# LRI Mean Average Function
LRI(10, Hume)

Unique Word Count: 2706.5
Type Token Ratio: 0.27
Root Type Token Ratio: 27.07
Corrected Type Token Ratio: 19.14
Mean Segmental Type Token Ratio: 0.9
Moving Average Type Token Ratio: 0.9
Measure of Textual Lexical Diversity: 135.67
Hypergeometric Distribution Diversity: 0.86


### Word Frequency and Count

file = open("Hume_EssaysMoralPoliticalLiteraryCLEAN.txt", encoding="utf-8") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_EssaysMoralPoliticalLiterarySTOPWORDS.txt','a', encoding="utf-8") 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Hume_HumanUnderstandingCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_HumanUnderstandingSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Hume_NaturalReligionCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_NaturalReligionSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("Hume_SourcesofMoralsCLEAN.txt") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('Hume_SourcesofMoralsSTOPWORDS.txt','a') 
        appendFile.write(" "+r) 
        appendFile.close() 
        
file = open("HumeComplete.txt", encoding="utf-8") 
line = file.read() # Use this to read file content as a stream: 
words = line.split() 
for r in words: 
    if not r in stop_words: 
        appendFile = open('HumeCompleteSTOPWORDS.txt','a', encoding="utf-8") 
        appendFile.write(" "+r) 
        appendFile.close() 

In [227]:
# Open .txt files with stopwords removed
with open("Hume_EssaysMoralPoliticalLiterarySTOPWORDS.txt", "r", encoding="utf-8") as file:
    HumeEss = file.read()
    
with open("Hume_HumanUnderstandingSTOPWORDS.txt", "r") as file:
    HumeHum = file.read()
    
with open("Hume_NaturalReligionSTOPWORDS.txt", "r") as file:
    HumeNat = file.read()
    
with open("Hume_SourcesofMoralsSTOPWORDS.txt", "r") as file:
    HumeSou = file.read()
    
with open("HumeCompleteSTOPWORDS.txt", "r", encoding="utf-8") as file:
    HumeCom = file.read()

In [228]:
# Tokenize, Word Frequency and Count 
WordFreq(HumeEss)
WordFreq(HumeHum)
WordFreq(HumeNat)
WordFreq(HumeSou)
WordFreq(HumeCom)

[('may', 3292), ('one', 2596), ('every', 2304), ('great', 2252), ('must', 2112), ('would', 2036), ('men', 1808), ('much', 1764), ('government', 1748), ('people', 1648)]
[('may', 1096), ('one', 760), ('nature', 736), ('us', 644), ('must', 624), ('experience', 588), ('mind', 564), ('cause', 552), ('human', 512), ('effect', 472)]
[('human', 552), ('one', 512), ('world', 460), ('nature', 456), ('would', 424), ('god', 412), ('reason', 400), ('cause', 396), ('us', 392), ('cleanthes', 392)]
[('us', 660), ('would', 608), ('one', 604), ('society', 524), ('man', 524), ('human', 480), ('justice', 464), ('sentiment', 452), ('even', 428), ('general', 416)]
[('may', 3723), ('one', 3354), ('every', 2625), ('would', 2580), ('must', 2514), ('us', 2232), ('great', 2115), ('nature', 2073), ('even', 1989), ('men', 1935)]
