In [1]:
# Setup and imports
# Import module with custom functions
from lex_processing import * 

# Import Numpy and Pandas to work with dataframes
import pandas as pd
import numpy as np

# Import os for access to files
import os

# Import stemmer
from nltk.stem.snowball import SnowballStemmer

# Import Counter
from collections import Counter

In [2]:
# Directories holding the raw .txt files to process, one directory per author.
# Each path ends with a trailing '/' because later cells build file names by
# plain string concatenation (path + filename).
# NOTE(review): these are machine-specific absolute paths; keep exactly one
# platform block uncommented.

# For Mac
# pathAC = '/Users/Joe/dropbox/Data/Original Data/Three Authors/Agatha Christie/'
# pathIM = '/Users/Joe/dropbox/Data/Original Data/Three Authors/Iris Murdoch/'
# pathPDJ = '/Users/Joe/dropbox/Data/Original Data/Three Authors/P James/'

# For Linux (the active configuration)
pathAC = '/home/CAMPUS/alcantaj/Dropbox/Data/Original Data/Three Authors/Agatha Christie/'
pathIM = '/home/CAMPUS/alcantaj/Dropbox/Data/Original Data/Three Authors/Iris Murdoch/'
pathPDJ = '/home/CAMPUS/alcantaj/Dropbox/Data/Original Data/Three Authors/P James/'

# For Windows
# NOTE(review): the names below (pathReagan/pathBush/pathTrump) are not used
# anywhere in the visible notebook, which reads pathAC/pathIM/pathPDJ;
# uncommenting them alone would not configure this notebook -- TODO confirm
# whether a Windows variant of the three author paths is needed.
# pathReagan = '/Users/jomar/Dropbox/Data/Edited Data/Presidents Data/ReaganSpeeches/'
# pathBush = '/Users/jomar/Dropbox/Data/Edited Data/Presidents Data/BushSpeeches/'
# pathTrump = '/Users/jomar/Dropbox/Data/Edited Data/Presidents Data/TrumpSpeeches/'

In [3]:
# Start with one empty DataFrame per author: Agatha Christie (dfAC),
# P James (dfPDJ) and Iris Murdoch (dfIM). Later cells fill them with
# one row per text file.
dfAC, dfPDJ, dfIM = (pd.DataFrame() for _ in range(3))

In [4]:
# Shared English Snowball stemmer, reused by every per-author processing cell.
stemmer = SnowballStemmer(language="english")

In [5]:
# Build one row of lexical features per text file found in pathAC and store
# the result in dfAC.
# strip_punctuation, word_tokenize, preprocess, lexical_diversity,
# meanLengthSentence and wordCount are not defined in this notebook
# (presumably supplied by the lex_processing star-import -- TODO confirm).
rowsAC = []
for filename in os.listdir(pathAC):
    if filename.endswith('txt'):
        # Context manager so the file handle is closed as soon as it is read.
        with open(pathAC + filename) as f:
            raw = f.read()
        # Clear raw of punctuation and tokenize for word counts.
        wordsNoPunct = strip_punctuation(raw)
        #hesitations = wordsNoPunct.count('—')
        # str.replace returns a new string; the result has to be re-assigned,
        # otherwise the em dashes are never actually removed.
        wordsNoPunct = wordsNoPunct.replace("—", ' ')
        wordsNoPunct = word_tokenize(wordsNoPunct)
        # NOTE(review): stems are taken from the tokens of the raw text, not
        # from the punctuation-stripped tokens used for WordCount/UniqueWords.
        words = word_tokenize(raw)
        tokens_stemmed = [stemmer.stem(x) for x in words]

        # Word Counts for certain words (exact, case-sensitive token matches)
        c = Counter(words)
        Fillers = (c['well'] + c['so'] + c['basically'] + c['actually']
                   + c['literally'] + c['um'] + c['ah'])
        NSNouns = c['something'] + c['anything'] + c['thing'] + c['everything']
        # Parenthesised so the second line is part of the same sum; without
        # the parentheses the continuation line is a separate, discarded
        # expression and those verbs are never counted.
        LIVerbs = (c['be'] + c['come'] + c['do'] + c['get'] + c['give'] + c['go']
                   + c['have'] + c['know'] + c['look'] + c['make'] + c['see']
                   + c['tell'] + c['think'] + c['want'])

        processed = preprocess(raw)
        lex = lexical_diversity(wordsNoPunct)
        mls = meanLengthSentence(processed)
        # wordCount returns a mapping; its keys become extra columns below.
        wordDict = wordCount(processed)
        thetuple = {'Filename': filename, 'TTR': lex,
                    'WordCount': len(wordsNoPunct),
                    'UniqueWords': len(set(wordsNoPunct)),
                    'UniqueStems': len(set(tokens_stemmed)),
                    'MLU': mls, 'Fillers': Fillers,
                    'NSNouns': NSNouns, 'LIVerbs': LIVerbs}
        # Keys in wordDict win over the fixed keys if they ever collide.
        rowsAC.append({**thetuple, **wordDict})

# Build the frame once from all rows: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic. Re-running this
# cell now rebuilds dfAC instead of appending duplicate rows.
dfAC = pd.DataFrame(rowsAC)

In [6]:
# Build one row of lexical features per text file found in pathIM and store
# the result in dfIM.
# strip_punctuation, word_tokenize, preprocess, lexical_diversity,
# meanLengthSentence and wordCount are not defined in this notebook
# (presumably supplied by the lex_processing star-import -- TODO confirm).
rowsIM = []
for filename in os.listdir(pathIM):
    if filename.endswith('txt'):
        # Context manager so the file handle is closed as soon as it is read.
        with open(pathIM + filename) as f:
            raw = f.read()
        # Clear raw of punctuation and tokenize for word counts.
        wordsNoPunct = strip_punctuation(raw)
        #hesitations = wordsNoPunct.count('—')
        # str.replace returns a new string; the result has to be re-assigned,
        # otherwise the em dashes are never actually removed.
        wordsNoPunct = wordsNoPunct.replace("—", ' ')
        wordsNoPunct = word_tokenize(wordsNoPunct)
        # NOTE(review): stems are taken from the tokens of the raw text, not
        # from the punctuation-stripped tokens used for WordCount/UniqueWords.
        words = word_tokenize(raw)
        tokens_stemmed = [stemmer.stem(x) for x in words]

        # Word Counts for certain words (exact, case-sensitive token matches)
        c = Counter(words)
        Fillers = (c['well'] + c['so'] + c['basically'] + c['actually']
                   + c['literally'] + c['um'] + c['ah'])
        NSNouns = c['something'] + c['anything'] + c['thing'] + c['everything']
        # Parenthesised so the second line is part of the same sum; without
        # the parentheses the continuation line is a separate, discarded
        # expression and those verbs are never counted.
        LIVerbs = (c['be'] + c['come'] + c['do'] + c['get'] + c['give'] + c['go']
                   + c['have'] + c['know'] + c['look'] + c['make'] + c['see']
                   + c['tell'] + c['think'] + c['want'])

        processed = preprocess(raw)
        lex = lexical_diversity(wordsNoPunct)
        mls = meanLengthSentence(processed)
        # wordCount returns a mapping; its keys become extra columns below.
        wordDict = wordCount(processed)
        thetuple = {'Filename': filename, 'TTR': lex,
                    'WordCount': len(wordsNoPunct),
                    'UniqueWords': len(set(wordsNoPunct)),
                    'UniqueStems': len(set(tokens_stemmed)),
                    'MLU': mls, 'Fillers': Fillers,
                    'NSNouns': NSNouns, 'LIVerbs': LIVerbs}
        # Keys in wordDict win over the fixed keys if they ever collide.
        rowsIM.append({**thetuple, **wordDict})

# Build the frame once from all rows: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic. Re-running this
# cell now rebuilds dfIM instead of appending duplicate rows.
dfIM = pd.DataFrame(rowsIM)

In [7]:
# Build one row of lexical features per text file found in pathPDJ and store
# the result in dfPDJ.
# strip_punctuation, word_tokenize, preprocess, lexical_diversity,
# meanLengthSentence and wordCount are not defined in this notebook
# (presumably supplied by the lex_processing star-import -- TODO confirm).
rowsPDJ = []
for filename in os.listdir(pathPDJ):
    if filename.endswith('txt'):
        # Context manager so the file handle is closed as soon as it is read.
        with open(pathPDJ + filename) as f:
            raw = f.read()
        # Clear raw of punctuation and tokenize for word counts.
        wordsNoPunct = strip_punctuation(raw)
        #hesitations = wordsNoPunct.count('—')
        # str.replace returns a new string; the result has to be re-assigned,
        # otherwise the em dashes are never actually removed.
        wordsNoPunct = wordsNoPunct.replace("—", ' ')
        wordsNoPunct = word_tokenize(wordsNoPunct)
        # NOTE(review): stems are taken from the tokens of the raw text, not
        # from the punctuation-stripped tokens used for WordCount/UniqueWords.
        words = word_tokenize(raw)
        tokens_stemmed = [stemmer.stem(x) for x in words]

        # Word Counts for certain words (exact, case-sensitive token matches)
        c = Counter(words)
        Fillers = (c['well'] + c['so'] + c['basically'] + c['actually']
                   + c['literally'] + c['um'] + c['ah'])
        NSNouns = c['something'] + c['anything'] + c['thing'] + c['everything']
        # Parenthesised so the second line is part of the same sum; without
        # the parentheses the continuation line is a separate, discarded
        # expression and those verbs are never counted.
        LIVerbs = (c['be'] + c['come'] + c['do'] + c['get'] + c['give'] + c['go']
                   + c['have'] + c['know'] + c['look'] + c['make'] + c['see']
                   + c['tell'] + c['think'] + c['want'])

        processed = preprocess(raw)
        lex = lexical_diversity(wordsNoPunct)
        mls = meanLengthSentence(processed)
        # wordCount returns a mapping; its keys become extra columns below.
        wordDict = wordCount(processed)
        thetuple = {'Filename': filename, 'TTR': lex,
                    'WordCount': len(wordsNoPunct),
                    'UniqueWords': len(set(wordsNoPunct)),
                    'UniqueStems': len(set(tokens_stemmed)),
                    'MLU': mls, 'Fillers': Fillers,
                    'NSNouns': NSNouns, 'LIVerbs': LIVerbs}
        # Keys in wordDict win over the fixed keys if they ever collide.
        rowsPDJ.append({**thetuple, **wordDict})

# Build the frame once from all rows: DataFrame.append was removed in
# pandas 2.0 and growing a frame row by row is quadratic. Re-running this
# cell now rebuilds dfPDJ instead of appending duplicate rows.
dfPDJ = pd.DataFrame(rowsPDJ)

In [8]:
# Load the two side tables that are later inner-merged onto the per-author
# frames via their shared 'Filename' column.
# The long paths are split with implicit string-literal concatenation only;
# the resulting path strings are unchanged.
names = pd.read_csv(
    '/home/CAMPUS/alcantaj/Dropbox/Data/Original Data/Three Authors/'
    'novels - author age and year published.csv'
)
LIWC = pd.read_csv(
    '/home/CAMPUS/alcantaj/Dropbox/Data/Original Data/Three Authors/'
    'LIWC2015 Results.csv'
)

In [9]:
# Attach the LIWC columns to each author frame. The inner join keeps only
# files that appear in both the author frame and the LIWC table.
dfPDJ = dfPDJ.merge(LIWC, on='Filename', how='inner')
dfIM = dfIM.merge(LIWC, on='Filename', how='inner')
dfAC = dfAC.merge(LIWC, on='Filename', how='inner')

In [10]:
# Attach the columns of the `names` table to each author frame. The inner
# join drops any file that has no matching 'Filename' row in `names`.
dfPDJ = dfPDJ.merge(names, on='Filename', how='inner')
dfIM = dfIM.merge(names, on='Filename', how='inner')
dfAC = dfAC.merge(names, on='Filename', how='inner')

In [11]:
# Replace NaNs with 0: in this dataset a NaN means the feature did NOT occur
# in that particular document.
dfPDJ, dfIM, dfAC = (frame.fillna(0) for frame in (dfPDJ, dfIM, dfAC))

In [12]:
# Order each author's rows by the 'Year of Publication' column (ascending).
# The original row index labels are kept, not reset.
dfPDJ = dfPDJ.sort_values('Year of Publication')
dfIM = dfIM.sort_values('Year of Publication')
dfAC = dfAC.sort_values('Year of Publication')

In [14]:
# Derive aggregate columns for dfPDJ from its per-tag count columns.
# For each group below: <Group> is the sum of the listed tag columns,
# <Group>/100 is that sum divided by 100, and <Group>Normalised is the sum
# divided by the document's WordCount.
# NOTE(review): '<Group>/100' divides the raw count by the constant 100, not
# by a per-100-words rate -- confirm this is the intended measure.
pos_groups = (
    ('Nouns', ('NN', 'NNS', 'NNP', 'NNPS')),
    ('Adjectives', ('JJ', 'JJR', 'JJS')),
    ('Adverbs', ('RB', 'RBR', 'RBS')),
    ('Verbs', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
)
for pos_label, pos_tags in pos_groups:
    # Add the tag columns left to right, exactly as a chained `+` would.
    pos_total = dfPDJ[pos_tags[0]]
    for pos_tag in pos_tags[1:]:
        pos_total = pos_total + dfPDJ[pos_tag]
    dfPDJ[pos_label] = pos_total
    dfPDJ[pos_label + '/100'] = dfPDJ[pos_label] / 100
    dfPDJ[pos_label + 'Normalised'] = dfPDJ[pos_label] / dfPDJ['WordCount']

# Pronouns get a total and a normalised column, but no '/100' column.
dfPDJ['Pronouns'] = dfPDJ['PRP'] + dfPDJ['PRP$']
dfPDJ['PronounsNormalised'] = dfPDJ['Pronouns'] / dfPDJ['WordCount']

# Vocabulary-size measures relative to document length.
for size_col in ('UniqueWords', 'UniqueStems'):
    dfPDJ[size_col + 'Normalised'] = dfPDJ[size_col] / dfPDJ['WordCount']

In [15]:
# Derive aggregate columns for dfIM from its per-tag count columns.
# For each group below: <Group> is the sum of the listed tag columns,
# <Group>/100 is that sum divided by 100, and <Group>Normalised is the sum
# divided by the document's WordCount.
# NOTE(review): '<Group>/100' divides the raw count by the constant 100, not
# by a per-100-words rate -- confirm this is the intended measure.
pos_groups = (
    ('Nouns', ('NN', 'NNS', 'NNP', 'NNPS')),
    ('Adjectives', ('JJ', 'JJR', 'JJS')),
    ('Adverbs', ('RB', 'RBR', 'RBS')),
    ('Verbs', ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')),
)
for pos_label, pos_tags in pos_groups:
    # Add the tag columns left to right, exactly as a chained `+` would.
    pos_total = dfIM[pos_tags[0]]
    for pos_tag in pos_tags[1:]:
        pos_total = pos_total + dfIM[pos_tag]
    dfIM[pos_label] = pos_total
    dfIM[pos_label + '/100'] = dfIM[pos_label] / 100
    dfIM[pos_label + 'Normalised'] = dfIM[pos_label] / dfIM['WordCount']

# Pronouns get a total and a normalised column, but no '/100' column.
dfIM['Pronouns'] = dfIM['PRP'] + dfIM['PRP$']
dfIM['PronounsNormalised'] = dfIM['Pronouns'] / dfIM['WordCount']

# Vocabulary-size measures relative to document length.
for size_col in ('UniqueWords', 'UniqueStems'):
    dfIM[size_col + 'Normalised'] = dfIM[size_col] / dfIM['WordCount']

In [18]:
# Derive aggregate columns for dfAC from its per-tag count columns.
# <Group> is the sum of the group's tag columns, <Group>/100 is that sum
# divided by 100, and <Group>Normalised is the sum divided by WordCount.
# NOTE(review): '<Group>/100' divides the raw count by the constant 100, not
# by a per-100-words rate -- confirm this is the intended measure.
dfAC['Nouns'] = dfAC['NN'] + dfAC['NNS'] + dfAC['NNP'] + dfAC['NNPS']
dfAC['Nouns/100'] = dfAC['Nouns'] / 100
dfAC['NounsNormalised'] = dfAC['Nouns'] / dfAC['WordCount']
dfAC['Adjectives'] = dfAC['JJ'] + dfAC['JJR'] + dfAC['JJS']
dfAC['Adjectives/100'] = dfAC['Adjectives'] / 100
dfAC['AdjectivesNormalised'] = dfAC['Adjectives'] / dfAC['WordCount']
# Every term must come from dfAC. Adding dfIM['RBS'] here would mix in the
# Iris Murdoch counts, aligned by row label, giving wrong or NaN values.
dfAC['Adverbs'] = dfAC['RB'] + dfAC['RBR'] + dfAC['RBS']
dfAC['Adverbs/100'] = dfAC['Adverbs'] / 100
dfAC['AdverbsNormalised'] = dfAC['Adverbs'] / dfAC['WordCount']
dfAC['Verbs'] = dfAC['VB'] + dfAC['VBD'] + dfAC['VBG'] + dfAC['VBN'] + dfAC['VBP'] + dfAC['VBZ']
dfAC['Verbs/100'] = dfAC['Verbs'] / 100
dfAC['VerbsNormalised'] = dfAC['Verbs'] / dfAC['WordCount']
# Pronouns get a total and a normalised column, but no '/100' column.
dfAC['Pronouns'] = dfAC['PRP'] + dfAC['PRP$']
dfAC['PronounsNormalised'] = dfAC['Pronouns'] / dfAC['WordCount']
# Vocabulary-size measures relative to document length.
dfAC['UniqueWordsNormalised'] = dfAC['UniqueWords'] / dfAC['WordCount']
dfAC['UniqueStemsNormalised'] = dfAC['UniqueStems'] / dfAC['WordCount']

In [19]:
# Write each author's final frame to a CSV file in the current working
# directory (the row index is written too, as with a plain to_csv call).
for author_frame, csv_name in ((dfPDJ, 'PDJ.csv'), (dfIM, 'IM.csv'), (dfAC, 'AC.csv')):
    author_frame.to_csv(csv_name)