In [2]:
import nltk , re, string
from nltk import FreqDist
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.stem import *
from nltk.corpus import stopwords
from nltk import FreqDist

In [3]:
# Load the word tokens of Jane Austen's "Persuasion" from NLTK's Gutenberg corpus.
persuation=nltk.corpus.gutenberg.words('austen-persuasion.txt')

In [4]:
def preparedata(data):
    """Clean raw text strings into lists of lower-cased, stopword-free words.

    For every string in ``data``:

    1. remove each character that is not an ASCII letter, underscore or space;
    2. split what remains on whitespace into word tokens;
    3. drop tokens made only of punctuation (after step 1 only runs of
       underscores can qualify) and lower-case the rest;
    4. drop English stopwords.

    Parameters
    ----------
    data : iterable of str
        Raw text strings, one entry per utterance/line.

    Returns
    -------
    list of list of str
        One list of cleaned word tokens per input string, in input order.
    """
    # Load the stopword list once and keep it as a set; looking it up per
    # token re-read the list for every single membership test.
    english_stopwords = set(stopwords.words("english"))
    non_word_chars = re.compile(r'[^a-zA-Z_ ]')
    # Escape the punctuation so characters such as ']' and '\' are literal
    # members of the character class.
    punctuation_only = re.compile('[' + re.escape(string.punctuation) + ']+')

    processed_data = []
    for text in data:
        speaker = non_word_chars.sub('', text)
        # Split into words first: iterating over the string itself yields
        # single characters, not tokens.
        speaker_tokenized = [
            w.lower() for w in speaker.split()
            if not punctuation_only.fullmatch(w)
        ]
        processed_data.append(
            [w for w in speaker_tokenized if w not in english_stopwords]
        )
    return processed_data

In [5]:
def preprocess_data(text):
    """Return the purely alphabetic tokens of ``text``, lower-cased.

    Parameters
    ----------
    text : iterable of str
        Word tokens (for example the output of a corpus reader's ``words()``).

    Returns
    -------
    list of str
        Every token for which ``str.isalpha()`` is true, converted to lower
        case, in the original order. Numbers, punctuation, empty strings and
        mixed tokens such as ``"don't"`` are dropped.
    """
    # str.isalpha() is only true when every character is a letter, so a token
    # that passes it can never consist of punctuation; no extra regex check
    # is needed.
    return [w.lower() for w in text if w.isalpha()]

In [6]:
def stem(data, stemtype):
    """Stem every token in ``data`` with the selected NLTK stemmer.

    Parameters
    ----------
    data : iterable of str
        Tokens to stem.
    stemtype : str
        Which stemmer to use: ``"porter"``, ``"snowball"`` (English) or
        ``"lancaster"``.

    Returns
    -------
    list of str
        The stem of each token, in input order.

    Raises
    ------
    ValueError
        If ``stemtype`` is not one of the supported names.
    """
    if stemtype == "porter":
        stemmer = PorterStemmer()
    elif stemtype == "snowball":
        stemmer = SnowballStemmer('english')
    elif stemtype == "lancaster":
        stemmer = LancasterStemmer()
    else:
        # Without this branch an unknown name left `stemmer` unbound and the
        # loop failed with an UnboundLocalError (or, for empty input,
        # silently returned an empty list).
        raise ValueError(
            "unknown stemtype %r; expected 'porter', 'snowball' or 'lancaster'"
            % (stemtype,)
        )

    return [stemmer.stem(token) for token in data]

In [7]:
def retwords(words):
    """Flatten one level of nesting: a collection of token lists -> one list.

    Parameters
    ----------
    words : iterable of iterable
        For example a list of per-sentence token lists.

    Returns
    -------
    list
        All inner items in order, concatenated into a single flat list.
    """
    # Iterate directly rather than by index so any iterable (not only
    # sequences supporting len() and integer indexing) can be flattened.
    return [token for group in words for token in group]

In [8]:
# Keep only the alphabetic tokens of the novel, lower-cased.
cleanedPersuation=preprocess_data(persuation)
# Preview the first 100 tokens instead of echoing the whole token list.
cleanedPersuation[:100]

['persuasion',
 'by',
 'jane',
 'austen',
 'chapter',
 'sir',
 'walter',
 'elliot',
 'of',
 'kellynch',
 'hall',
 'in',
 'somersetshire',
 'was',
 'a',
 'man',
 'who',
 'for',
 'his',
 'own',
 'amusement',
 'never',
 'took',
 'up',
 'any',
 'book',
 'but',
 'the',
 'baronetage',
 'there',
 'he',
 'found',
 'occupation',
 'for',
 'an',
 'idle',
 'hour',
 'and',
 'consolation',
 'in',
 'a',
 'distressed',
 'one',
 'there',
 'his',
 'faculties',
 'were',
 'roused',
 'into',
 'admiration',
 'and',
 'respect',
 'by',
 'contemplating',
 'the',
 'limited',
 'remnant',
 'of',
 'the',
 'earliest',
 'patents',
 'there',
 'any',
 'unwelcome',
 'sensations',
 'arising',
 'from',
 'domestic',
 'affairs',
 'changed',
 'naturally',
 'into',
 'pity',
 'and',
 'contempt',
 'as',
 'he',
 'turned',
 'over',
 'the',
 'almost',
 'endless',
 'creations',
 'of',
 'the',
 'last',
 'century',
 'and',
 'there',
 'if',
 'every',
 'other',
 'leaf',
 'were',
 'powerless',
 'he',
 'could',
 'read',
 'his',
 'own',


In [9]:
# Reduce every cleaned token to its Porter stem (e.g. 'persuasion' -> 'persuas').
persuationStem=stem(cleanedPersuation,'porter')
# Preview the first ten stems.
persuationStem[:10]

['persuas',
 'by',
 'jane',
 'austen',
 'chapter',
 'sir',
 'walter',
 'elliot',
 'of',
 'kellynch']

In [10]:
# Count how often each stem occurs in "Persuasion".
persuationFdist = nltk.FreqDist(persuationStem)

In [11]:
# List (stem, count) pairs from most to least frequent; called without an
# argument, so every stem in the distribution is included.
persuationFdist.most_common()

[('the', 3329),
 ('to', 2808),
 ('and', 2801),
 ('of', 2570),
 ('a', 1595),
 ('in', 1389),
 ('wa', 1337),
 ('her', 1215),
 ('had', 1186),
 ('be', 1171),
 ('she', 1146),
 ('i', 1124),
 ('it', 1110),
 ('he', 961),
 ('not', 934),
 ('that', 882),
 ('as', 809),
 ('for', 708),
 ('have', 681),
 ('but', 664),
 ('hi', 659),
 ('with', 654),
 ('you', 628),
 ('mr', 547),
 ('at', 533),
 ('all', 530),
 ('ann', 497),
 ('been', 496),
 ('s', 485),
 ('him', 467),
 ('could', 451),
 ('veri', 434),
 ('they', 433),
 ('were', 426),
 ('by', 418),
 ('which', 416),
 ('is', 398),
 ('on', 396),
 ('so', 359),
 ('no', 356),
 ('would', 355),
 ('captain', 305),
 ('their', 301),
 ('elliot', 295),
 ('from', 295),
 ('there', 286),
 ('or', 274),
 ('more', 273),
 ('them', 270),
 ('thi', 250),
 ('an', 245),
 ('than', 243),
 ('ladi', 242),
 ('one', 240),
 ('do', 238),
 ('must', 228),
 ('when', 228),
 ('my', 223),
 ('onli', 219),
 ('wentworth', 218),
 ('such', 211),
 ('much', 205),
 ('if', 202),
 ('ani', 199),
 ('other', 197

In [12]:
# Modal verbs to look up in the "Persuasion" frequency distribution.
# persuationFdist was built from Porter stems, so each lookup returns the
# number of tokens whose stem equals the given string.
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', persuationFdist[m])
    
print(" ")

can: 107
could: 451
may: 87
might: 166
must: 228
will: 172
 


In [13]:
# Masculine and feminine pronouns to look up in the "Persuasion" counts.
# As above, the distribution is keyed by Porter stem.
pronouns = ['he', 'him', 'himself', 'she', 'her', 'herself']

for p in pronouns:
    print(p + ':', persuationFdist[p])

he: 961
him: 467
himself: 95
she: 1146
her: 1215
herself: 159


In [14]:
# Same pipeline as for "Persuasion", applied to "Moby Dick":
# load tokens -> keep lower-cased alphabetic tokens -> Porter-stem -> count.
moby = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
cleanedMoby=preprocess_data(moby)
mobyStem=stem(cleanedMoby,'porter')
mobyFdist = nltk.FreqDist(mobyStem)
# Prints every (stem, count) pair, most frequent first.
print(mobyFdist.most_common())



In [15]:
# Look up the same modal verbs and pronouns in the "Moby Dick" counts so the
# two texts can be compared. mobyFdist is keyed by Porter stem.
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', mobyFdist[m])
    
print(" ")

pronouns = ['he', 'him', 'himself', 'she', 'her', 'herself']

for p in pronouns:
    print(p + ':', mobyFdist[p])

can: 236
could: 216
may: 240
might: 183
must: 283
will: 398
 
he: 1896
him: 1067
himself: 205
she: 120
her: 338
herself: 7


The two corpora differ considerably in length: after stemming, Austen's corpus has 84121 words and the Moby Dick corpus has 218361 words.

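As we can see, the modal verb counts for Austen's corpus are lower than those for the Moby Dick corpus (with the exception of 'could': 451 vs. 216), but here we also have to consider the length of each corpus: Moby Dick is roughly 2.6 times longer, so raw counts are not directly comparable.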

We can also see that the pronoun frequency distribution for Austen's corpus has larger counts for 'her', 'herself' and 'she' than the Moby Dick corpus, so we can describe Austen's corpus as a female-oriented text.

On the other hand, the frequency distribution for the Moby Dick corpus has larger counts for 'he', 'himself' and 'him' than Austen's corpus, so we can describe the Moby Dick corpus as a male-oriented text.





In [21]:
# Start the combined list with one ('persuation', stem) pair per stem of
# "Persuasion"; the first element labels which text the stem came from.
merged = [('persuation', stemmed) for stemmed in persuationStem]
    

In [22]:
# Append one ('moby', stem) pair per stem of "Moby Dick" to the combined list.
merged.extend(('moby', stemmed) for stemmed in mobyStem)

In [23]:
# Labels used as the first element of each pair in `merged`; they become the
# conditions (rows) of the table built below.
auths=['persuation','moby']
# Modal verbs to tabulate (columns of the table).
modals = ['can', 'could', 'may', 'might', 'must', 'will']

In [24]:
# Preview the first ten (label, stem) pairs.
merged[:10]

[('persuation', 'persuas'),
 ('persuation', 'by'),
 ('persuation', 'jane'),
 ('persuation', 'austen'),
 ('persuation', 'chapter'),
 ('persuation', 'sir'),
 ('persuation', 'walter'),
 ('persuation', 'elliot'),
 ('persuation', 'of'),
 ('persuation', 'kellynch')]

In [25]:
# Count stems per text. `merged` already holds (label, stem) pairs, so a
# single pass that keeps the pairs whose label is in `auths` is enough;
# looping over `auths` on the outside rescanned all of `merged` once per label.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre, word in merged
    if genre in auths)
# Rows: the two texts; columns: how often each modal verb's stem occurs.
cfd.tabulate(conditions=auths, samples=modals)

             can could   may might  must  will 
persuation   107   451    87   166   228   172 
      moby   236   216   240   183   283   398 
