# Explore the Data

In [13]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats

import nltk
import nltk.sentiment
from wordcloud import WordCloud

### Load the Data

In [2]:
blurbs = pd.read_csv('cleaned_book_blurbs.csv')

In [3]:
blurbs.head()

Unnamed: 0,genre,sub-genre,original,clean,stemmed,lemmatized
0,Horror,ghost-stories,"Designed to appeal to the book lover, the Macm...",designed appeal book lover macmillan collector...,design appeal book lover macmillan collector '...,designed appeal book lover macmillan collector...
1,Horror,ghost-stories,"Part of the Penguin Orange Collection, a limit...",part penguin orange collection limitedrun seri...,part penguin orang collect limitedrun seri twe...,part penguin orange collection limitedrun seri...
2,Horror,ghost-stories,Part of a new six-volume series of the best in...,part new sixvolume series best classic horror ...,part new sixvolum seri best classic horror sel...,part new sixvolume series best classic horror ...
3,Horror,ghost-stories,A USA TODAY BESTSELLER!An Indie Next Pick!An O...,usa today bestselleran indie next pickan octob...,usa today bestselleran indi next pickan octob ...,usa today bestselleran indie next pickan octob...
4,Horror,ghost-stories,From the New York Times best-selling author of...,new york times bestselling author southern boo...,new york time bestsel author southern book clu...,new york time bestselling author southern book...


In [4]:
blurbs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21415 entries, 0 to 21414
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   genre       21415 non-null  object
 1   sub-genre   21415 non-null  object
 2   original    21414 non-null  object
 3   clean       21415 non-null  object
 4   stemmed     21415 non-null  object
 5   lemmatized  21415 non-null  object
dtypes: object(6)
memory usage: 1004.0+ KB


### Split the Data

Exploration should only be done on the training data set.

In [23]:
train, test = train_test_split(blurbs, stratify = blurbs.genre, test_size = .25, random_state = 123)
train.shape, test.shape

((16061, 6), (5354, 6))

### Combine Blurbs

For exploration, I will combine all of the lemmatized blurbs into one mass of text for each genre. Don't forget to only use training data.

In [24]:
#Create mass of text for horror genre
horror_blurbs = train[train.genre == 'Horror']
horror_words = ' '.join(horror_blurbs.lemmatized)

In [25]:
#Create mass of text for romance genre
romance_blurbs = train[train.genre == 'Romance']
romance_words = ' '.join(romance_blurbs.lemmatized)

In [26]:
#Create mass of text for Mystery and Crime genre
mystery_blurbs = train[train.genre == 'Mystery and Crime']
mystery_words = ' '.join(mystery_blurbs.lemmatized)

In [27]:
#Create mass of text for Sci-Fi and Fantasy genre
fantasy_blurbs = train[train.genre == 'Sci-Fi and Fantasy']
fantasy_words = ' '.join(fantasy_blurbs.lemmatized)

### Begin Exploring

What are the top 5 most common words in the horror blurbs?

In [29]:
horror_freq = pd.Series(horror_words.split()).value_counts()

In [34]:
horror_freq.sort_values(ascending = False).head(6)

'         9794
ha        3190
story     3104
one       2649
new       2501
horror    2200
dtype: int64