# Cleaning the data and feature engineering

In this notebook, I compile all the text from all the books into one dataframe, then do some cleaning and do some feature engineering.

### Import things

In [1]:
import os
import gc
import string
import pandas as pd

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import WordPunctTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
try:
    os.mkdir('./CSVs')
except:
    pass

In [3]:
#look at the names of some of the files in the txt folder
txt_path = './TXT'
books = os.listdir(txt_path)
books[0:5]

['450 From Paddington - Agatha Christie.txt',
 'A Caribbean Mystery_ A Miss Mar - Agatha Christie.txt',
 'A Murder Is Announced_ A Miss M - Agatha Christie.txt',
 'A Pocket Full of Rye - Agatha Christie.txt',
 'After the Funeral_ A Hercule Po - Agatha Christie.txt']

### Concatenating all the text from all the books:

In [4]:
books_list = []
for file in books:  
    d={}
    book = os.path.join('./TXT', file)
    text = pd.read_csv(book, sep='\n', header=None)[0].str.cat(sep=' ')
    d['title'] = file[:-22].split('_')[0]
    d['text'] = text
    books_list.append(d)
df = pd.DataFrame(books_list)
df.head()

Unnamed: 0,title,text
0,450 From Paddington,Agatha Christie 4:50 from Paddington A Miss Ma...
1,A Caribbean Mystery,Agatha Christie A Caribbean Mystery A Miss Mar...
2,A Murder Is Announced,Agatha Christie A Murder Is Announced A Miss M...
3,A Pocket Full of Rye,A Pocket Full of Rye A Miss Marple Mystery Aga...
4,After the Funeral,Agatha Christie After the Funeral A Hercule Po...


### Extracting the copyright dates for the book:

In [5]:
def get_copyright(df):
    copyright_list = []
    for i in range(len(df['text'])):
        for word in range(len(df['text'][i])):
            d = {}
            if df['text'][i][word] == '©':
                d['title'] = df['title'][i] 
                d['copyright'] = df['text'][i][word: word + 14]
                copyright_list.append(d)
    
    return pd.DataFrame(copyright_list)

In [6]:
copyright_df = get_copyright(df)
copyright_df.head(9)

Unnamed: 0,title,copyright
0,450 From Paddington,© 2011 Agatha
1,450 From Paddington,© 1957. Publis
2,450 From Paddington,© MAY 2011 ISB
3,A Caribbean Mystery,© 2011 Agatha
4,A Caribbean Mystery,© 1965. Publis
5,A Caribbean Mystery,© MAY 2011 ISB
6,A Murder Is Announced,© 2011 Agatha
7,A Murder Is Announced,© 1950. Publis
8,A Murder Is Announced,© MAY 2011 ISB


#### It looks like each book has 3 copy right dates. I only want the original copyright date
I assume the original copyright date is the earliest year

In [7]:
copyright_df['year'] = copyright_df['copyright'].map(lambda x: ''.join([y for y in list(x) if y in '0123456789']))

#drop blank years
copyright_df = copyright_df[copyright_df['year'] != '']

#convert year strings into integers
copyright_df['year'] = copyright_df['year'].astype(int)

#also get rid of any years that are like, 201:
copyright_df = copyright_df[copyright_df['year'] > 1000]

#sort by name, then sort by year
copyright_df = copyright_df.sort_values(['title', 'year'], ascending = [True, True])
copyright_df.head(20)

#drop duplicates, keep the earliest year
copyright_df = copyright_df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

In [8]:
df = pd.merge(df, copyright_df, left_on='title', right_on = 'title')
df = df.drop('copyright', axis=1)

### Cleaning the df
Get rid of text that wasn't written by Agatha Christie

In [9]:
#split by 'about the publisher' if the book has one:
for row in range(len(df)):
     if int(len(df.loc[row]['text'].lower().split('about the publisher'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('about the publisher')[1]

In [10]:
for row in range(len(df)):
    if 3 > int(len(df.loc[row]['text'].lower().split('about the author'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('about the author')[0]
    elif int(len(df.loc[row]['text'].lower().split('about the author'))) == 3:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('about the author')[1]

In [11]:
for row in range(len(df)):
     if int(len(df.loc[row]['text'].lower().split('* * *'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('* * *')[0]

In [12]:
for row in range(len(df)):
     if int(len(df.loc[row]['text'].lower().split('about the publisher'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('e-book')[0]

### Remove forewords/introductions by different people

In [13]:
#get rid of everything before prologue if the book has a prologue
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('prologue'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('prologue')[1]

In [14]:
#the mysterious affairs at styles didn't have a prologue, so I'm splitting by the last sentence before the book starts
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('contents in my two books Agatha Christies Secret Notebooks and Murder in the Making published by HarperCollins'.lower()))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('contents in my two books Agatha Christies Secret Notebooks and Murder in the Making published by HarperCollins'.lower())[1]

In [15]:
#getting rid of foreword in The ABC Murders
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('real genius in the way he tackled a problem entirely unlike any which had previously come his way'.lower()))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('real genius in the way he tackled a problem entirely unlike any which had previously come his way'.lower())[1]

In [16]:
#getting rid of introductin on the seven dials mystery
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('the quickness of christies hand still continues to deceive our eyes all those years later'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('the quickness of christies hand still continues to deceive our eyes all those years later')[1]

In [17]:
for row in range(len(df)):
    if df.loc[row]['text'][:1] == ' ':
        df.loc[row,'text'] = df.loc[row]['text'][1:]
        
for row in range(len(df)):
    if df.loc[row]['text'][:2] == 'i ':
        df.loc[row,'text'] = df.loc[row]['text'][2:]
        
for row in range(len(df)):
    if df.loc[row]['text'][:3] == 'one':
        df.loc[row,'text'] = df.loc[row]['text'][3:]
        
for row in range(len(df)):
    if df.loc[row]['text'][:11] == 'chapter one':
        df.loc[row,'text'] = df.loc[row]['text'][11:]

### Drop the short stories:
Dropany books with fewer than 45000 words

In [18]:
df = df[df['text'].map(lambda x: len(x.split(' '))) >  45000].reset_index(drop=True)

### Look through the books and fix years/drop the short stories
Research what these books are/when they were published and decide whether or not to keep them

In [19]:
pd.set_option('display.max_rows', 95)
df

Unnamed: 0,title,text,year
0,450 From Paddington,mrs. mcgillicuddy panted along the platform i...,1957
1,A Caribbean Mystery,major palgrave tells a story “take all this b...,1965
2,A Murder Is Announced,a murder is announced i between 7:30 and 8:30...,1950
3,A Pocket Full of Rye,it was miss somers’s turn to make the tea. mi...,1954
4,After the Funeral,i old lanscombe moved totteringly from room t...,1953
5,An Autobiography,s’ society and i had read their periodical. i ...,1977
6,And Then There Were None,author’s note i had written this book because ...,2009
7,Appointment With Death,"part i chapter 1 ‘you do see, don’t you, that ...",1938
8,At Bertram's Hotel,"in the heart of the west end, there are many ...",1966
9,Black Coffee,foreword it was almost certainly because of he...,1997


In [20]:
#Original copyright of And Then There Were None is 1939, 
#this book changed names a lot because they were inappropriate
df.loc[6, 'year'] = 1939
df[df['title'] == 'And Then There Were None']

Unnamed: 0,title,text,year
6,And Then There Were None,author’s note i had written this book because ...,1939


In [21]:
#original copyright of The Mirror Crack'd From Side to Side is 1962
df.loc[63, 'year'] = 1962
df[df['title'] == "The Mirror Crack'd From Side to"]

Unnamed: 0,title,text,year
63,The Mirror Crack'd From Side to,i miss jane marple was sitting by her window....,1962


### Sleeping Murder and Curtain were written in 1940, but published after she passed away. 
I need to fix the date in the dataframe  
source: https://en.wikipedia.org/wiki/Sleeping_Murder
source: https://en.wikipedia.org/wiki/Curtain_(novel)

In [22]:
df.loc[50]

title                                      Sleeping Murder
text      a house gwenda reed stood, shivering a little...
year                                                  1976
Name: 50, dtype: object

In [23]:
df.loc[14]

title                                              Curtain
text      i who is there who has not felt a sudden star...
year                                                  1975
Name: 14, dtype: object

In [24]:
df.loc[14, 'year'] = 1940
df.loc[50, 'year'] = 1940

In [25]:
# Drop books that were published after she passed away (1976)
# I will use her autobiography later, and all the books that were published posthumously 
# were short stories or novelizations of her plays written by another author
df[df['year'] > 1976]

Unnamed: 0,title,text,year
5,An Autobiography,s’ society and i had read their periodical. i ...,1977
9,Black Coffee,foreword it was almost certainly because of he...,1997
31,Miss Marple's Final Cases,sanctuary i the vicar’s wife came round the co...,1979
32,Miss Marple,author’s foreword to the thirteen problems the...,2011
59,The Harlequin Tea Set and Other,the edge “the edge” was first published in pe...,1997


In [26]:
df = df[df['year'] <= 1976].reset_index(drop=True)

In [27]:
#drop the short stories collections:
stories_list = df[df['title'].str.lower().str.contains('and other s|poirot|miss marple')]['title'].to_list()
stories_list

['Double Sin and Other Stories',
 "Hercule Poirot's Christmas",
 'Hercule Poirot',
 'Poirot Investigates',
 "Poirot's Early Cases",
 'The Golden Ball and Other Stori',
 'The Under Dog and Other Stories',
 'Three Blind Mice and Other Stor']

In [28]:
for row in range(len(df)):
    if df.loc[row]['title'] in stories_list:
        df.drop(row, inplace=True)

In [29]:
df = df.reset_index(drop=True)

In [30]:
more_stories = ['The Adventure of the Christmas', 
                'The Hound of Death', 
                'The Listerdale Mystery',
                'The Mysterious Mr. Quin',
                'Parker Pyne Investigates',
                'Partners in Crime',
                'Murder in the Mews',
                'The Labours of Hercules',
                'The Witness for the Prosecution',
                'The Thirteen Problems',
                'Problem at Pollensa Bay']

In [31]:
for row in range(len(df)):
    if df.loc[row]['title'] in more_stories:
        df.drop(row, inplace=True)
df = df.reset_index(drop=True)

In [32]:
df['sentence_count'] = df['text'].map(lambda x: len(x.split('.')))

In [33]:
#get rid of periods and other punctuation
df['text'] = df['text'].map(lambda x: ''.join([y for y in list(x.lower()) if y in string.ascii_lowercase + ' -—']))
df['text'] = df['text'].map(lambda x: x.replace('-', ' '))
df['text'] = df['text'].map(lambda x: x.replace('—', ' '))

In [34]:
#save the df for later in case things go awry
df.to_csv('./CSVs/books.csv', index=False)

### Feature engineering for readability index

In [35]:
df['word_count'] = df['text'].map(lambda x: len(x.split(' ')))

In [36]:
#source: https://stackoverflow.com/questions/46759492/syllable-count-in-python

def syllables_count(sentence):
    count = 0
    for word in sentence:
        word = word.lower()
        vowels = "aeiouy"
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith("e"):
            count -= 1
        if count == 0:
            count += 1
    return count

df['syllable_count'] = df['text'].map(lambda x: syllables_count(x.split()))

### Calculating the Flesch Reading Ease:

![](./assets/flesch_reading_ease.png)

In [37]:
df['flesch_reading_ease'] = 206.835 - (1.015* (df['word_count']/df['sentence_count'])) -(84.6*(df['syllable_count']/df['word_count']))

In [38]:
df['avg_words_per_sentence'] = df['word_count']/df['sentence_count']

In [39]:
df['avg_syllables_per_word'] = df['syllable_count']/df['word_count']

### Feature engineering for parts of speech tagging
The authors didn't mention stemming, but I'm going to do it

In [40]:
tok = WordPunctTokenizer()
stemmer = PorterStemmer()

In [41]:
df['stemmed_text'] = df['text'].map(lambda x: stemmer.stem(x))

In [42]:
df['unique_word_count'] = df['stemmed_text'].map(lambda x: len(set(x.split(' '))))

In [43]:
df['unique_words_%'] = 100*(df['unique_word_count']/df['word_count'])

In [44]:
def get_adj_words(df):
    #making a column with a list of all the adjectives in the text
    adj_word_list = []
    for i in range(len(df)):
        d = {}
        adj_words = '' 
        adj_counter = 0
        tagged_words = pos_tag(tok.tokenize(df.loc[i]['stemmed_text']))
        for j in range(len(tagged_words)):
            if tagged_words[j][1] in ['JJ', 'JJR', 'JJS']:
                adj_words = adj_words + tagged_words[j][0] + ' '
                adj_counter += 1
        d['adj_word_list'] = adj_words
        d['total_adj_count'] = adj_counter
        adj_word_list.append(d)
        gc.collect()
        
    adj_words_df = pd.DataFrame(adj_word_list)
    df = pd.concat([df, adj_words_df], axis=1)
    df['unique_adj_count'] = df['adj_word_list'].map(lambda x: len(set(x.split())))
    df = df.drop(['adj_word_list'], axis=1)

    return df

In [45]:
def get_noun_words(df):
    #making a column with a list of all the nouns in the text
    nouns_word_list = []
    for i in range(len(df)):
        d = {}
        noun_counter = 0
        noun_words = ''    
        tagged_words = pos_tag(tok.tokenize(df.loc[i]['stemmed_text']))
        for j in range(len(tagged_words)):
            if tagged_words[j][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
                noun_words = noun_words + tagged_words[j][0] + ' '
                noun_counter += 1
        d['noun_word_list'] = noun_words
        d['total_noun_count'] = noun_counter
        nouns_word_list.append(d)
        gc.collect()
        
    noun_word_df = pd.DataFrame(nouns_word_list)
    df = pd.concat([df, noun_word_df], axis=1)
    df['unique_noun_count'] = df['noun_word_list'].map(lambda x: len(set(x.split())))
    df = df.drop(['noun_word_list'], axis=1)
    return df

In [46]:
df = get_adj_words(df)
df = get_noun_words(df)

### Feature engineering for trigrams and indefinite words

In [47]:
#I need to rename the columns because count vectorizer end up making columns with the same names
#since some of my column names are one word long
new_column_names = {'title' : 'book_title', 'text': 'book_text', 'year': 'year_written'}
df = df.rename(columns = new_column_names)

### Indefinite word usage and other vague words:

In [48]:
def word_counter(df, word_interest):
    cv = CountVectorizer()
    df2 = pd.DataFrame(cv.fit_transform(df['book_text']).todense(), columns=cv.get_feature_names())

    text_and_counts = df2[word_interest]
    text_and_counts.columns = [f'{word_interest}_count']
    
    df = pd.concat([df, text_and_counts], axis=1)
    df[f'%_{word_interest}'] = df[f'{word_interest}']/df['word_count']
    gc.collect()
    
    return df

In [49]:
#indefinite words from the paper
df = word_counter(df, 'thing')
df = word_counter(df, 'something')
df = word_counter(df, 'anything')

In [50]:
#other vague words
df = word_counter(df, 'stuff')
df = word_counter(df, 'lot')
df = word_counter(df, 'very')

In [51]:
def count_trigrams(df):
    cv_3 = CountVectorizer(ngram_range=(3, 3))
    trigrams = pd.DataFrame(cv_3.fit_transform(df['book_text']).todense(), columns=cv_3.get_feature_names())
    
    tri = []
    for x in range(len(trigrams)):  
        t = {}
        t['repeated_trigrams'] = sum(trigrams.iloc[x] > 1)
        tri.append(t)
        gc.collect()
    
    trigram_df = pd.DataFrame(tri)
    df = pd.concat([df, trigram_df], axis=1 )
    return df

In [52]:
df = count_trigrams(df)

### Sentiment analysis

I get the sentiment of each sentence in a book, and then add the number of positive, neutral, negative sentences up.  
I determine sentence length of each text by dividing the book text by the average words per sentence.

In [53]:
sentiment_analyzer = SentimentIntensityAnalyzer()

In [54]:
def get_sentiment(df):
    overall_scores = []
    for i in range(len(df)): 
        #this part splits the long string of text into words, 
        #then it divides the text length (in words) by the average word per sentence
        split_strings = []
        for index in range(0, len(df.loc[i]['book_text'].split(' ')), int(df.loc[i]['avg_words_per_sentence'])):
            split_strings.append(' '.join(df.loc[i]['book_text'].split(' ')[index : index + int(df.loc[i]['avg_words_per_sentence'])]))

        pos = 0
        neu = 0
        neg = 0

        #this part iterates through the split up sentences and does sentiment analysis on each split up sentence
        sentiment_dict = {}
        for n in split_strings:
            score = sentiment_analyzer.polarity_scores(n)
            if score['compound'] > 0:
                pos +=1
            elif score['compound'] < 0:
                neg +=1
            else:
                neu +=1

            sentiment_dict['total_pos_sentences'] = pos
            sentiment_dict['total_neu_sentences'] = neu
            sentiment_dict['total_neg_sentences'] = neg

        overall_scores.append(sentiment_dict)
    text_sentiment = pd.DataFrame(overall_scores)
    df = pd.concat([df, text_sentiment], axis=1)
    return df

In [55]:
df = get_sentiment(df)

In [56]:
df['%_pos_sentences'] = df['total_pos_sentences']/df['sentence_count']
df['%_neu_sentences'] = df['total_neu_sentences']/df['sentence_count']
df['%_neg_sentences'] = df['total_neg_sentences']/df['sentence_count']

### See if her usage of  stopwords in general increased

In [57]:
sw = stopwords.words("english")

In [58]:
df['stemmed_removed_stopwords'] = df['stemmed_text'].map(lambda x: ' '.join([y for y in x.split() if y not in sw]))
df['only_stopwords'] = df['stemmed_text'].map(lambda x: ' '.join([y for y in x.split() if y in sw]))

In [59]:
df['stopwords_count'] = df['only_stopwords'].map(lambda x: len(x.split()))
df['%_stop_words'] = (df['stopwords_count'])/df['word_count']

In [60]:
df.columns

Index(['book_title', 'book_text', 'year_written', 'sentence_count',
       'word_count', 'syllable_count', 'flesch_reading_ease',
       'avg_words_per_sentence', 'avg_syllables_per_word', 'stemmed_text',
       'unique_word_count', 'unique_words_%', 'total_adj_count',
       'unique_adj_count', 'total_noun_count', 'unique_noun_count', 'thing',
       '%_thing', 'something', '%_something', 'anything', '%_anything',
       'stuff', '%_stuff', 'lot', '%_lot', 'very', '%_very',
       'repeated_trigrams', 'total_pos_sentences', 'total_neu_sentences',
       'total_neg_sentences', '%_pos_sentences', '%_neu_sentences',
       '%_neg_sentences', 'stemmed_removed_stopwords', 'only_stopwords',
       'stopwords_count', '%_stop_words'],
      dtype='object')

### Make sure the features that I am going to use are normalized

In [61]:
df['%_repeated_trigrams'] = df['repeated_trigrams']/(df['word_count']/3)
df['%_unique_adj'] = df['unique_adj_count']/df['total_adj_count']
df['%_unique_noun'] = df['unique_noun_count']/df['total_noun_count']
df['%_adj_in_text'] = df['total_adj_count']/df['word_count']
df['%_noun_in_text'] = df['total_noun_count']/df['word_count']

### I think it's mostly clean now!

In [62]:
df['year_written'] = df['year_written'].astype(int)
df = df.sort_values(by='year_written').reset_index(drop=True)

In [63]:
df.to_csv('./CSVs/all_clean.csv', index=False)

### Clean up one of the short story collections to test my model on

In [64]:
harlequin = pd.read_csv('./TXT/The Harlequin Tea Set and Other - Agatha Christie.txt', sep='\n', header=None)[0].str.cat(sep=' ')

In [65]:
#list of short stories in the harlequin tea set
short_stories = ['The Edge', 
                 'The Actress', 
                 'While the Light Lasts', 
                 'The House of Dreams', 
                 'The Lonely God',
                 'Manx Gold',
                 'Within a Wall', 
                 'The Mystery of the Spanish Chest', 
                 'The Harlequin Tea Set']

#make them all caps
short_stories = [x.upper() for x in short_stories]

In [66]:
short_stories

['THE EDGE',
 'THE ACTRESS',
 'WHILE THE LIGHT LASTS',
 'THE HOUSE OF DREAMS',
 'THE LONELY GOD',
 'MANX GOLD',
 'WITHIN A WALL',
 'THE MYSTERY OF THE SPANISH CHEST',
 'THE HARLEQUIN TEA SET']

In [67]:
stories = []
for story in range(len(short_stories)-1):
    s = {}
    s['title'] = short_stories[story]
    s['text'] = harlequin.split(short_stories[story])[1].split(short_stories[story+1])[0]
    stories.append(s)

In [68]:
last = harlequin.split('THE HARLEQUIN TEA SET')[1].split('About the Author')[0]

s = {}
s['title'] = 'THE HARLEQUIN TEA SET'
s['text'] = last

stories.append(s)

stories_df = pd.DataFrame(stories)
stories_df

Unnamed: 0,title,text
0,THE EDGE,“The Edge” was first published in Pearson’s M...
1,THE ACTRESS,“The Actress” was first published as “A Trap ...
2,WHILE THE LIGHT LASTS,“While the Light Lasts” was first published i...
3,THE HOUSE OF DREAMS,“The House of Dreams” was first published in ...
4,THE LONELY GOD,“The Lonely God” was first published in Royal...
5,MANX GOLD,“Manx Gold” was first published in The Daily ...
6,WITHIN A WALL,“Within a Wall” was first published in Royal ...
7,THE MYSTERY OF THE SPANISH CHEST,“The Mystery of the Spanish Chest” is an expa...
8,THE HARLEQUIN TEA SET,“The Harlequin Tea Set” was first published i...


In [69]:
#split out first sentence, then only save the numbers
stories_df['year'] = stories_df['text'].map(lambda x: x.split('.')[0]).map(lambda x: ''.join([y for y in list(x) if y in '0123456789']))
stories_df

Unnamed: 0,title,text,year
0,THE EDGE,“The Edge” was first published in Pearson’s M...,1927
1,THE ACTRESS,“The Actress” was first published as “A Trap ...,1923
2,WHILE THE LIGHT LASTS,“While the Light Lasts” was first published i...,1924
3,THE HOUSE OF DREAMS,“The House of Dreams” was first published in ...,1926
4,THE LONELY GOD,“The Lonely God” was first published in Royal...,1926
5,MANX GOLD,“Manx Gold” was first published in The Daily ...,23281930
6,WITHIN A WALL,“Within a Wall” was first published in Royal ...,1925
7,THE MYSTERY OF THE SPANISH CHEST,“The Mystery of the Spanish Chest” is an expa...,1932
8,THE HARLEQUIN TEA SET,“The Harlequin Tea Set” was first published i...,1971


In [70]:
#fix manx gold:
stories_df.loc[5, 'year'] = 1930
stories_df

Unnamed: 0,title,text,year
0,THE EDGE,“The Edge” was first published in Pearson’s M...,1927
1,THE ACTRESS,“The Actress” was first published as “A Trap ...,1923
2,WHILE THE LIGHT LASTS,“While the Light Lasts” was first published i...,1924
3,THE HOUSE OF DREAMS,“The House of Dreams” was first published in ...,1926
4,THE LONELY GOD,“The Lonely God” was first published in Royal...,1926
5,MANX GOLD,“Manx Gold” was first published in The Daily ...,1930
6,WITHIN A WALL,“Within a Wall” was first published in Royal ...,1925
7,THE MYSTERY OF THE SPANISH CHEST,“The Mystery of the Spanish Chest” is an expa...,1932
8,THE HARLEQUIN TEA SET,“The Harlequin Tea Set” was first published i...,1971


In [71]:
#get rid of the first sentence of the text:
stories_df['text'] = stories_df['text'].map(lambda x: '.'.join(x.split('.')[1:]))

#the last story had 2 sentences:
stories_df.loc[8, 'text'] = '.'.join((stories_df['text'][8].split('.')[1:]))

In [72]:
stories_df.to_csv('./CSVs/harlequin_df.csv', index=False)

### Feature engineering for short stories

In [73]:
stories_df['sentence_count'] = stories_df['text'].map(lambda x: len(x.split('.')))

#get rid of periods and other punctuation
stories_df['text'] = stories_df['text'].map(lambda x: ''.join([y for y in list(x.lower()) if y in string.ascii_lowercase + ' -—']))
stories_df['text'] = stories_df['text'].map(lambda x: x.replace('-', ' '))
stories_df['text'] = stories_df['text'].map(lambda x: x.replace('—', ' '))

In [74]:
stories_df['syllable_count'] = stories_df['text'].map(lambda x: syllables_count(x.split()))

In [75]:
stories_df['word_count'] = stories_df['text'].map(lambda x: len(x.split(' ')))

In [76]:
stories_df['flesch_reading_ease'] = 206.835 - (1.015* (stories_df['word_count']/stories_df['sentence_count'])) -(84.6*(stories_df['syllable_count']/stories_df['word_count']))

In [77]:
stories_df['avg_words_per_sentence'] = stories_df['word_count']/stories_df['sentence_count']

In [78]:
stories_df['avg_syllables_per_word'] = stories_df['syllable_count']/stories_df['word_count']

In [79]:
stories_df['stemmed_text'] = stories_df['text'].map(lambda x: stemmer.stem(x))

In [80]:
stories_df['unique_word_count'] = stories_df['stemmed_text'].map(lambda x: len(set(x.split(' '))))

In [81]:
stories_df['unique_words_%'] = 100*(stories_df['unique_word_count']/stories_df['word_count'])

In [82]:
stories_df = get_adj_words(stories_df)
stories_df = get_noun_words(stories_df)

In [83]:
#I need to rename the columns because count vectorizer end up making columns with the same names
#since some of my column names are one word long
new_column_names = {'title' : 'book_title', 'text': 'book_text', 'year': 'year_written'}
stories_df = stories_df.rename(columns = new_column_names)

In [84]:
#indefinite words from the paper
stories_df = word_counter(stories_df, 'thing')
stories_df = word_counter(stories_df, 'something')
stories_df = word_counter(stories_df, 'anything')

In [85]:
#other vague words
stories_df = word_counter(stories_df, 'stuff')
stories_df = word_counter(stories_df, 'lot')
stories_df = word_counter(stories_df, 'very')

In [86]:
stories_df = count_trigrams(stories_df)

In [87]:
stories_df = get_sentiment(stories_df)

In [88]:
stories_df['%_pos_sentences'] = stories_df['total_pos_sentences']/stories_df['sentence_count']
stories_df['%_neu_sentences'] = stories_df['total_neu_sentences']/stories_df['sentence_count']
stories_df['%_neg_sentences'] = stories_df['total_neg_sentences']/stories_df['sentence_count']

### See if her usage of  stopwords in general increased

In [89]:
sw = stopwords.words("english")

In [90]:
stories_df['stemmed_removed_stopwords'] = stories_df['stemmed_text'].map(lambda x: ' '.join([y for y in x.split() if y not in sw]))
stories_df['only_stopwords'] = stories_df['stemmed_text'].map(lambda x: ' '.join([y for y in x.split() if y in sw]))

In [91]:
stories_df['stopwords_count'] = stories_df['only_stopwords'].map(lambda x: len(x.split()))
stories_df['%_stop_words'] = (stories_df['stopwords_count'])/stories_df['word_count']

In [92]:
stories_df.columns

Index(['book_title', 'book_text', 'year_written', 'sentence_count',
       'syllable_count', 'word_count', 'flesch_reading_ease',
       'avg_words_per_sentence', 'avg_syllables_per_word', 'stemmed_text',
       'unique_word_count', 'unique_words_%', 'total_adj_count',
       'unique_adj_count', 'total_noun_count', 'unique_noun_count', 'thing',
       '%_thing', 'something', '%_something', 'anything', '%_anything',
       'stuff', '%_stuff', 'lot', '%_lot', 'very', '%_very',
       'repeated_trigrams', 'total_pos_sentences', 'total_neu_sentences',
       'total_neg_sentences', '%_pos_sentences', '%_neu_sentences',
       '%_neg_sentences', 'stemmed_removed_stopwords', 'only_stopwords',
       'stopwords_count', '%_stop_words'],
      dtype='object')

### Make sure the features that I am going to use are normalized

In [93]:
stories_df['%_repeated_trigrams'] = stories_df['repeated_trigrams']/(stories_df['word_count']/3)
stories_df['%_unique_adj'] = stories_df['unique_adj_count']/stories_df['total_adj_count']
stories_df['%_unique_noun'] = stories_df['unique_noun_count']/stories_df['total_noun_count']
stories_df['%_adj_in_text'] = stories_df['total_adj_count']/stories_df['word_count']
stories_df['%_noun_in_text'] = stories_df['total_noun_count']/stories_df['word_count']

In [94]:
stories_df.to_csv('./CSVs/harlequin_clean.csv', index=False)