# Cleaning the data and feature engineering

In this notebook, I compile all the text from all the books into one dataframe, then do some cleaning and do some feature engineering.

### Import things

In [1]:
import os
import string
import pandas as pd

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import WordPunctTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
try:
    os.mkdir('./CSVs')
except:
    pass

In [3]:
#look at the names of some of the files in the txt folder
txt_path = './TXT_files'
books = os.listdir(txt_path)
books[0:5]

['450 From Paddington - Agatha Christie.txt',
 'A Caribbean Mystery_ A Miss Mar - Agatha Christie.txt',
 'A Murder Is Announced_ A Miss M - Agatha Christie.txt',
 'A Pocket Full of Rye - Agatha Christie.txt',
 'After the Funeral_ A Hercule Po - Agatha Christie.txt']

### Concatenating all the text from all the books:

In [4]:
books_list = []
for file in books:  
    d={}
    book = os.path.join('./TXT_files', file)
    text = pd.read_csv(book, sep='\n', header=None)[0].str.cat(sep=' ')
    d['title'] = file[:-22].split('_')[0]
    d['text'] = text
    books_list.append(d)
df = pd.DataFrame(books_list)
df.head()

Unnamed: 0,title,text
0,450 From Paddington,Agatha Christie 4:50 from Paddington A Miss Ma...
1,A Caribbean Mystery,Agatha Christie A Caribbean Mystery A Miss Mar...
2,A Murder Is Announced,Agatha Christie A Murder Is Announced A Miss M...
3,A Pocket Full of Rye,A Pocket Full of Rye A Miss Marple Mystery Aga...
4,After the Funeral,Agatha Christie After the Funeral A Hercule Po...


### Extracting the copyright dates for the book:

In [5]:
def get_copyright(df):
    copyright_list = []
    for i in range(len(df['text'])):
        for word in range(len(df['text'][i])):
            d = {}
            if df['text'][i][word] == '©':
                d['title'] = df['title'][i] 
                d['copyright'] = df['text'][i][word: word + 14]
                copyright_list.append(d)
    return pd.DataFrame(copyright_list)

In [6]:
copyright_df = get_copyright(df)
copyright_df.head(9)

Unnamed: 0,title,copyright
0,450 From Paddington,© 2011 Agatha
1,450 From Paddington,© 1957. Publis
2,450 From Paddington,© MAY 2011 ISB
3,A Caribbean Mystery,© 2011 Agatha
4,A Caribbean Mystery,© 1965. Publis
5,A Caribbean Mystery,© MAY 2011 ISB
6,A Murder Is Announced,© 2011 Agatha
7,A Murder Is Announced,© 1950. Publis
8,A Murder Is Announced,© MAY 2011 ISB


#### It looks like each book has 3 copy right dates. I only want the original copyright date
I assume the original copyright date is the earliest year

In [7]:
copyright_df['year'] = copyright_df['copyright'].map(lambda x: ''.join([y for y in list(x) if y in '0123456789']))

#drop blank years
copyright_df = copyright_df[copyright_df['year'] != '']

#convert year strings into integers
copyright_df['year'] = copyright_df['year'].astype(int)

#also get rid of any years that are like, 201:
copyright_df = copyright_df[copyright_df['year'] > 1000]

#sort by name, then sort by year
copyright_df = copyright_df.sort_values(['title', 'year'], ascending = [True, True])
copyright_df.head(20)

#drop duplicates, keep the earliest year
copyright_df = copyright_df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

In [8]:
df = pd.merge(df, copyright_df, left_on='title', right_on = 'title')

### Look through the books and fix years/drop the short stories
Research what these books are/when they were published and decide whether or not to keep them

In [9]:
pd.set_option('display.max_rows', 95)
df

Unnamed: 0,title,text,copyright,year
0,450 From Paddington,Agatha Christie 4:50 from Paddington A Miss Ma...,© 1957. Publis,1957
1,A Caribbean Mystery,Agatha Christie A Caribbean Mystery A Miss Mar...,© 1965. Publis,1965
2,A Murder Is Announced,Agatha Christie A Murder Is Announced A Miss M...,© 1950. Publis,1950
3,A Pocket Full of Rye,A Pocket Full of Rye A Miss Marple Mystery Aga...,© 1954. Publis,1954
4,After the Funeral,Agatha Christie After the Funeral A Hercule Po...,© 1953 Agatha,1953
5,An Autobiography,Agatha Christie An Autobiography Contents Cove...,© 1977 Agatha,1977
6,And Then There Were None,Agatha Christie And Then There Were None Dedic...,© 2009 Agatha,2009
7,Appointment With Death,Appointment with Death To Richard and Myra Mal...,© 1938 Agatha,1938
8,At Bertram's Hotel,Agatha Christie At Bertram’s Hotel A Miss Marp...,© 1966. Publis,1966
9,Black Coffee,Agatha Christie Black Coffee adapted as a nove...,© 1997 Agatha,1997


In [10]:
#Original copyright of And Then There Were None is 1939, 
#this book changed names a lot because they were inappropriate
df.loc[6, 'year'] = 1939
df[df['title'] == 'And Then There Were None']

Unnamed: 0,title,text,copyright,year
6,And Then There Were None,Agatha Christie And Then There Were None Dedic...,© 2009 Agatha,1939


In [11]:
#original copyright of The Mirror Crack'd From Side to Side is 1962
df.loc[67, 'year'] = 1962
df[df['title'] == "The Mirror Crack'd From Side to"]

Unnamed: 0,title,text,copyright,year
67,The Mirror Crack'd From Side to,Agatha Christie The Mirror Crack’d from Side t...,© 2011 Agatha,1962


### Sleeping Murder and Curtain were written in 1940, but published after she passed away. 
I need to fix the date in the dataframe  
source: https://en.wikipedia.org/wiki/Sleeping_Murder
source: https://en.wikipedia.org/wiki/Curtain_(novel)

In [12]:
df.loc[14]

title                                                  Curtain
text         Curtain: Poirot’s Last Case A Hercule Poirot M...
copyright                                       © 1975 Agatha 
year                                                      1975
Name: 14, dtype: object

In [13]:
df.loc[51]

title                                          Sleeping Murder
text         Agatha Christie Sleeping Murder A Miss Marple ...
copyright                                       © 1976. Publis
year                                                      1976
Name: 51, dtype: object

In [14]:
df.loc[14, 'year'] = 1940
df.loc[51, 'year'] = 1940

In [15]:
# Drop books that were published after she passed away (1976)
# I will use her autobiography later, and all the books that were published posthumously 
# were short stories or novelizations of her plays written by another author
df[df['year'] > 1976]

Unnamed: 0,title,text,copyright,year
5,An Autobiography,Agatha Christie An Autobiography Contents Cove...,© 1977 Agatha,1977
9,Black Coffee,Agatha Christie Black Coffee adapted as a nove...,© 1997 Agatha,1997
31,Miss Marple's Final Cases,Miss Marple’s Final Cases Contents About Agath...,© 1979 Agatha,1979
32,Miss Marple,Contents Author’s Foreword From The Thirteen P...,© 2011 Agatha,2011
53,Spider's Web,Agatha Christie Spider’s Web Novelised by Char...,© 2000 Agatha,2000
61,The Harlequin Tea Set and Other,The Harlequin Tea Set and Other Stories Conten...,© 1997 Agatha,1997
82,The Unexpected Guest,Agatha Christie The Unexpected Guest Novelised...,© 1999 Agatha,1999
90,While the Light Lasts,Contents While the Light Lasts About the Autho...,© 1997 Agatha,1997


In [16]:
df = df[df['year'] <= 1976].reset_index(drop=True)

In [17]:
#drop the short stories:
stories_list = df[df['title'].str.lower().str.contains('and other s|poirot|miss marple')]['title'].to_list()
stories_list

['Double Sin and Other Stories',
 "Hercule Poirot's Christmas",
 'Hercule Poirot',
 'Poirot Investigates',
 "Poirot's Early Cases",
 'The Golden Ball and Other Stori',
 'The Regatta Mystery and Other S',
 'The Under Dog and Other Stories',
 'Three Blind Mice and Other Stor']

In [18]:
for row in range(len(df)):
    if df.loc[row]['title'] in stories_list:
        df.drop(row, inplace=True)

In [19]:
df = df.reset_index(drop=True)

In [20]:
more_stories = ['The Adventure of the Christmas', 
                'The Hound of Death', 
                'The Listerdale Mystery',
                'The Mysterious Mr. Quin',
                'Parker Pyne Investigates',
                'Partners in Crime',
                'Murder in the Mews',
                'The Labours of Hercules',
                'The Witness for the Prosecution',
                'The Thirteen Problems',
                'Problem at Pollensa Bay']

In [21]:
for row in range(len(df)):
    if df.loc[row]['title'] in more_stories:
        df.drop(row, inplace=True)
df = df.reset_index(drop=True)

In [22]:
#save the df for later in case things go awry
df.to_csv('./CSVs/books.csv', index=False)

In [23]:
df = pd.read_csv('./CSVs/books.csv')

### Cleaning the df with the books

In [24]:
#split by 'about the publisher' if the book has one:
for row in range(len(df)):
     if int(len(df.loc[row]['text'].lower().split('about the publisher'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('about the publisher')[1]

In [25]:
for row in range(len(df)):
    if 3 > int(len(df.loc[row]['text'].lower().split('about the author'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('about the author')[0]
    elif int(len(df.loc[row]['text'].lower().split('about the author'))) == 3:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('about the author')[1]

In [26]:
for row in range(len(df)):
     if int(len(df.loc[row]['text'].lower().split('* * *'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('* * *')[0]

In [27]:
for row in range(len(df)):
     if int(len(df.loc[row]['text'].lower().split('about the publisher'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].lower().split('e-book')[0]

### More cleaning:
Going to perform the cleaning that they did in the paper:
> After all punctuation, apostrophes, and hyphens were deleted, each text was divided into 10,000-word segments.

In [28]:
df['sentence_count'] = df['text'].map(lambda x: len(x.split('.')))

In [29]:
#get rid of periods and other punctuation
df['text'] = df['text'].map(lambda x: ''.join([y for y in list(x.lower()) if y in string.ascii_lowercase + ' -—']))
df['text'] = df['text'].map(lambda x: x.replace('-', ' '))
df['text'] = df['text'].map(lambda x: x.replace('—', ' '))

### Feature engineering for readability index

In [30]:
df['word_count'] = df['text'].map(lambda x: len(x.split(' ')))

In [31]:
#source: https://stackoverflow.com/questions/46759492/syllable-count-in-python

def syllables_count(sentence):
    count = 0
    for word in sentence:
        word = word.lower()
        vowels = "aeiouy"
        if word[0] in vowels:
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if word.endswith("e"):
            count -= 1
        if count == 0:
            count += 1
    return count

df['syllable_count'] = df['text'].map(lambda x: syllables_count(x.split()))

### Remove forewords/introductions by different people just in novels

In [32]:
#get rid of everything before prologue if the book has a prologue
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('prologue'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('prologue')[1]

In [33]:
#the mysterious affairs at styles didn't have a prologue, so I'm splitting by the last sentence before the book starts
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('contents in my two books Agatha Christies Secret Notebooks and Murder in the Making published by HarperCollins'.lower()))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('contents in my two books Agatha Christies Secret Notebooks and Murder in the Making published by HarperCollins'.lower())[1]

In [34]:
#getting rid of foreword in The ABC Murders
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('real genius in the way he tackled a problem entirely unlike any which had previously come his way'.lower()))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('real genius in the way he tackled a problem entirely unlike any which had previously come his way'.lower())[1]

In [35]:
#getting rid of introductin on the seven dials mystery
for row in range(len(df)):
     if int(len(df.loc[row]['text'].split('the quickness of christies hand still continues to deceive our eyes all those years later'))) > 1:
        df.loc[row, 'text'] = df.loc[row]['text'].split('the quickness of christies hand still continues to deceive our eyes all those years later')[1]

In [36]:
for row in range(len(df)):
    if df.loc[row]['text'][:1] == ' ':
        df.loc[row,'text'] = df.loc[row]['text'][1:]
        
for row in range(len(df)):
    if df.loc[row]['text'][:2] == 'i ':
        df.loc[row,'text'] = df.loc[row]['text'][2:]
        
for row in range(len(df)):
    if df.loc[row]['text'][:3] == 'one':
        df.loc[row,'text'] = df.loc[row]['text'][3:]
        
for row in range(len(df)):
    if df.loc[row]['text'][:11] == 'chapter one':
        df.loc[row,'text'] = df.loc[row]['text'][11:]

### Calculating the Flesch Reading Ease:

![](./assets/flesch_reading_ease.png)

In [37]:
df['flesch_reading_ease'] = 206.835 - (1.015* (df['word_count']/df['sentence_count'])) -(84.6*(df['syllable_count']/df['word_count']))

In [38]:
df['avg_words_per_sentence'] = df['word_count']/df['sentence_count']

In [39]:
df['avg_syllables_per_word'] = df['syllable_count']/df['word_count']

### Feature engineering for parts of speech tagging
The authors didn't mention stemming, but I'm going to do it

In [40]:
tok = WordPunctTokenizer()
stemmer = PorterStemmer()

In [41]:
df['stemmed_text'] = df['text'].map(lambda x: stemmer.stem(x))

In [42]:
df['unique_word_count'] = df['stemmed_text'].map(lambda x: len(set(x.split(' '))))

In [43]:
df['unique_words_%'] = 100*(df['unique_word_count']/df['word_count'])

In [44]:
#count total number of adjectives (not unique)
adjectives_list = []

for i in range(len(df)):
    d = {}
    adj_counter = 0
    tagged_words = pos_tag(tok.tokenize(df.loc[i]['stemmed_text']))
    for j in range(len(tagged_words)):
        if tagged_words[j][1] in ['JJ', 'JJR', 'JJS']:
            adj_counter += 1
    d['title']  = df.loc[i]['title']
    d['total_adj_count'] = adj_counter
    adjectives_list.append(d)
    
adj_df = pd.DataFrame(adjectives_list)
df = pd.merge(df, adj_df)

In [45]:
#count total number of nouns (not unique)
nouns_list = []

for i in range(len(df)):
    d = {}
    noun_counter = 0
    tagged_words = pos_tag(tok.tokenize(df.loc[i]['stemmed_text']))
    for j in range(len(tagged_words)):
        if tagged_words[j][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            noun_counter += 1
    d['title']  = df.loc[i]['title']
    d['total_noun_count'] = noun_counter
    nouns_list.append(d)
    
noun_df = pd.DataFrame(nouns_list)
df = pd.merge(df, noun_df)

In [46]:
#making a column with a list of all the adjectives in the text
adj_word_list = []
for i in range(len(df)):
    d = {}
    adj_words = ''    
    tagged_words = pos_tag(tok.tokenize(df.loc[i]['stemmed_text']))
    for j in range(len(tagged_words)):
        if tagged_words[j][1] in ['JJ', 'JJR', 'JJS']:
            adj_words = adj_words + tagged_words[j][0] + ' '
    d['title']  = df.loc[i]['title']
    d['adj_word_list'] = adj_words
    adj_word_list.append(d)

adj_words_df = pd.DataFrame(adj_word_list)
df = pd.merge(df, adj_words_df)

In [47]:
#making a column with a list of all the nouns in the text
nouns_word_list = []
for i in range(len(df)):
    d = {}
    noun_words = ''    
    tagged_words = pos_tag(tok.tokenize(df.loc[i]['stemmed_text']))
    for j in range(len(tagged_words)):
        if tagged_words[j][1] in ['NN', 'NNS', 'NNP', 'NNPS']:
            noun_words = noun_words + tagged_words[j][0] + ' '
    d['title']  = df.loc[i]['title']
    d['noun_word_list'] = noun_words
    nouns_word_list.append(d)

noun_word_df = pd.DataFrame(nouns_word_list)
df = pd.merge(df, noun_word_df)

In [48]:
df['unique_adj_count'] = df['adj_word_list'].map(lambda x: len(set(x.split())))
df['unique_noun_count'] = df['noun_word_list'].map(lambda x: len(set(x.split())))

### Feature engineering for indefinite words and trigrams

In [49]:
#I need to rename the columns because count vectorizer end up making columns with the same names
#since some of my column names are one word long
new_column_names = {'title' : 'book_title', 'text': 'book_text', 'year': 'year_written'}
df = df.rename(columns = new_column_names)

In [50]:
#no stop words because the authors of the paper look at some stop words
cv = CountVectorizer()
df2 = pd.DataFrame(cv.fit_transform(df['book_text']).todense(), columns=cv.get_feature_names())

text_and_counts = pd.concat([df, df2], axis=1)
indefinite_words = text_and_counts[['book_text','year_written','something', 'thing', 'anything']].set_index('year_written')

df = pd.merge(df, indefinite_words)

In [51]:
cv_3 = CountVectorizer(ngram_range=(3, 3))
trigrams = pd.DataFrame(cv_3.fit_transform(df['book_text']).todense(), columns=cv_3.get_feature_names())

text_and_trigrams = pd.concat([df, trigrams], axis=1)

temp3 = text_and_trigrams.drop(['year_written', 'word_count','book_text'], axis=1)

tri = []
for x in range(len(temp3)):  
    t = {}
    t['book_title'] = temp3.loc[x]['book_title']
    t['repeated_trigrams'] = sum(temp3.iloc[x][18:] > 1)
    tri.append(t)

In [52]:
trigram_count  = pd.DataFrame(tri)
df = pd.merge(df, trigram_count)

### Sentiment analysis

I get the sentiment of each sentence in a book, and then add the number of positive, neutral, negative sentences up.  
I determine sentence length of each text by dividing the book text by the average words per sentence.

In [53]:
sentiment_analyzer = SentimentIntensityAnalyzer()

In [54]:
overall_scores = []
for i in range(len(df)): 
    #this part splits the long string of text into words, 
    #then it divides the text length (in words) by the average word per sentence
    split_strings = []
    for index in range(0, len(df.loc[i]['book_text'].split(' ')), int(df.loc[i]['avg_words_per_sentence'])):
        split_strings.append(' '.join(df.loc[i]['book_text'].split(' ')[index : index + int(df.loc[i]['avg_words_per_sentence'])]))
    
    pos = 0
    neu = 0
    neg = 0
    
    #this part iterates through the split up sentences and does sentiment analysis on each split up sentence
    sentiment_dict = {}
    for n in split_strings:
        score = sentiment_analyzer.polarity_scores(n)
        if score['compound'] > 0:
            pos +=1
        elif score['compound'] < 0:
            neg +=1
        else:
            neu +=1
            
        sentiment_dict['book_title'] = df.loc[i]['book_title']
        sentiment_dict['total_pos_sentences'] = pos
        sentiment_dict['total_neu_sentences'] = neu
        sentiment_dict['total_neg_sentences'] = neg
    
    overall_scores.append(sentiment_dict)

In [55]:
text_sentiment = pd.DataFrame(overall_scores)

In [56]:
df = pd.merge(df, text_sentiment)

In [57]:
df['%_pos_sentences'] = df['total_pos_sentences']/df['sentence_count']
df['%_neu_sentences'] = df['total_neu_sentences']/df['sentence_count']
df['%_neg_sentences'] = df['total_neg_sentences']/df['sentence_count']

### Make sure the features that I am going to use are normalized

In [58]:
df['%_repeated_trigrams'] = df['repeated_trigrams']/(df['word_count']/3)
df['%_unique_adj'] = df['unique_adj_count']/df['total_adj_count']
df['%_unique_noun'] = df['unique_noun_count']/df['total_noun_count']
df['%_adj_in_text'] = df['total_adj_count']/df['word_count']
df['%_noun_in_text'] = df['total_noun_count']/df['word_count']
df['%_something_in_text'] = df['something']/df['word_count']
df['%_thing_in_text'] = df['thing']/df['word_count']
df['%_anything_in_text'] = df['anything']/df['word_count']

### See if her usage of  stopwords in general increased

In [59]:
sw = stopwords.words("english")

In [60]:
df['stemmed_removed_stopwords'] = df['stemmed_text'].map(lambda x: ' '.join([y for y in x.split() if y not in sw]))
df['only_stopwords'] = df['stemmed_text'].map(lambda x: ' '.join([y for y in x.split() if y in sw]))

In [61]:
df['stopwords_count'] = df['only_stopwords'].map(lambda x: len(x.split()))
df['%_stop_words'] = (df['stopwords_count'])/df['word_count']

### Other vague words:

In [72]:
def word_counter(df, word_interest):
    word_list = []
    for row in (range(len(df))):
        d = {}
        word_counter = 0
        for word in df['stemmed_text'][row].split():
            if word in word_interest:
                word_counter += 1

        d['book_title']  = df.loc[row]['book_title']
        d[f'{word_interest}_count'] = word_counter
        word_list.append(d)
        
    word_df = pd.DataFrame(word_list).reset_index(drop=True)
    df = pd.merge(df, word_df)
    df[f'%_{word_interest}'] = df[f'{word_interest}_count']/df['word_count']
    return df

In [73]:
df = word_counter(df, 'stuff')

In [74]:
df = word_counter(df, 'lot')

In [75]:
df = word_counter(df, 'very')

### I think it's mostly clean now!

In [76]:
df['year_written'] = df['year_written'].astype(int)
df = df.sort_values(by='year_written').reset_index(drop=True)

In [77]:
df.to_csv('./CSVs/all_clean.csv', index=False)