<a href="https://colab.research.google.com/github/jjiahao/business-analytics/blob/master/scrapeHermingway.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The Sun Also Rises (1926) http://gutenberg.ca/ebooks/hemingwaye-sunalsorises/hemingwaye-sunalsorises-00-h.html

Men Without Women (1927) http://gutenberg.ca/ebooks/hemingwaye-menwithoutwomen/hemingwaye-menwithoutwomen-00-h.html

Winner Take Nothing (1933) http://gutenberg.ca/ebooks/hemingwaye-winnertakenothing/hemingwaye-winnertakenothing-00-h.html

Green Hills of Africa (1935) http://gutenberg.ca/ebooks/hemingwaye-greenhillsofafrica/hemingwaye-greenhillsofafrica-00-h.html

Across the River and Into the Trees (1950) http://gutenberg.ca/ebooks/hemingwaye-acrosstheriver/hemingwaye-acrosstheriver-00-h.html

The Old Man and the Sea (1952) http://gutenberg.ca/ebooks/hemingwaye-oldmanandthesea/hemingwaye-oldmanandthesea-00-h.html

In [None]:
# Web Scraping
import requests
from bs4 import BeautifulSoup

# Preprocess
import spacy
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch every NLTK resource this notebook reads *before* it is used, so a
# fresh kernel (Restart -> Run All) does not fail with LookupError.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; requesting both keeps word_tokenize working on either version.
nltk.download('punkt_tab')
# stopwords.words('english') below needs the 'stopwords' corpus on disk.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd

# EDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import homogeneity_score
from sklearn.metrics import silhouette_score

# Viz
import matplotlib.pyplot as plt
import seaborn as sns

## Web Scraping using BeautifulSoup

In [None]:
# Exploratory fetch of the first book: download the page, parse it, and look
# at which HTML tags directly contain text so we know what to keep.
url = 'http://gutenberg.ca/ebooks/hemingwaye-sunalsorises/hemingwaye-sunalsorises-00-h.html'
res = requests.get(url)
html_page = res.content

soup = BeautifulSoup(html_page, 'html.parser')

# Every text node in the document, regardless of the enclosing tag.
text = soup.find_all(text=True)

# Distinct names of the tags that directly wrap those text nodes.
{node.parent.name for node in text}

In [None]:

# Keep only the text nodes that sit directly inside a <p> tag and glue them
# together, each followed by a single space.
output = ''.join(
    '{} '.format(node) for node in text if node.parent.name == 'p'
)

In [None]:
def Web_Scraping(url, timeout=30):
    """Download an HTML page and return its paragraph text as one string.

    Parameters
    ----------
    url : str
        Address of the HTML page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    str
        The text nodes whose direct parent is a ``<p>`` tag, joined with
        single spaces, with every run of whitespace (newlines, carriage
        returns, repeated spaces) collapsed to one space and no leading or
        trailing whitespace.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of silently
        scraping the error page.
    requests.RequestException
        On connection problems or when ``timeout`` expires.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    # `string=True` is the current spelling of the deprecated `text=True`.
    # NOTE(review): only text whose *direct* parent is <p> is kept, so text
    # nested in inline tags inside a paragraph (e.g. <i>, <b>) is skipped,
    # exactly as before -- confirm that this is intended.
    paragraph_text = ' '.join(
        node for node in soup.find_all(string=True) if node.parent.name == 'p'
    )

    # split()/join collapses *all* whitespace runs; the previous chain of
    # replace() calls only halved double spaces once and left longer runs.
    return ' '.join(paragraph_text.split())

In [None]:
# Source pages of the six books, in the order they are numbered below.
url_1, url_2, url_3, url_4, url_5, url_6 = (
    "http://gutenberg.ca/ebooks/hemingwaye-sunalsorises/hemingwaye-sunalsorises-00-h.html",
    "http://gutenberg.ca/ebooks/hemingwaye-menwithoutwomen/hemingwaye-menwithoutwomen-00-h.html",
    "http://gutenberg.ca/ebooks/hemingwaye-winnertakenothing/hemingwaye-winnertakenothing-00-h.html",
    "http://gutenberg.ca/ebooks/hemingwaye-greenhillsofafrica/hemingwaye-greenhillsofafrica-00-h.html",
    "http://gutenberg.ca/ebooks/hemingwaye-acrosstheriver/hemingwaye-acrosstheriver-00-h.html",
    "http://gutenberg.ca/ebooks/hemingwaye-oldmanandthesea/hemingwaye-oldmanandthesea-00-h.html",
)

# Scrape each page in turn; book_N holds the paragraph text of url_N.
book_1, book_2, book_3, book_4, book_5, book_6 = [
    Web_Scraping(page_url)
    for page_url in (url_1, url_2, url_3, url_4, url_5, url_6)
]

## Text Processing

## Ideas

- Number of distinct words (vocabulary)
- Lexical diversity = vocabulary/token ratio
- Distribution of word lengths
- Most frequent words
- Number of word n-grams (bigrams, trigrams, 4-grams, etc.)
- Usage of passive and active voice
- Usage of parts of speech (nouns, verbs, adverbs, adjectives, etc.)
- Sentiment (positive, negative)

###  Book Level

In [None]:
# One row per book: a short identifier plus the full scraped text.
all_books = pd.DataFrame(
    {
        'book': ['book_{}'.format(number) for number in range(1, 7)],
        'text': [book_1, book_2, book_3, book_4, book_5, book_6],
    },
    columns=['book', 'text'],
)
all_books

In [None]:

# Vocabulary statistics per book.
# Tokenise the full text and keep only purely alphabetic tokens.
all_books['token'] = all_books['text'].apply(
    lambda raw_text: [word for word in word_tokenize(raw_text) if word.isalpha()]
)

# Total tokens, distinct tokens, and their ratio (lexical diversity).
all_books['token count'] = all_books['token'].map(len)
all_books['vocab count'] = all_books['token'].map(lambda words: len(set(words)))
all_books['lexical_diversity'] = all_books['vocab count'].div(all_books['token count'])

In [None]:
# Score each book's full text with VADER; every cell of the new column holds
# the value returned by polarity_scores for that book.
sid = SentimentIntensityAnalyzer()
all_books['scores sentiment'] = all_books['text'].apply(sid.polarity_scores)
all_books

### Sentence Level

In [None]:
# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once per kernel instead of once per book.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for `model_name`, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def get_sents(book):
    """Split a text into sentences with spaCy's ``en_core_web_sm`` pipeline.

    Parameters
    ----------
    book : str
        The text to segment.

    Returns
    -------
    list of str
        The text of each sentence found by the pipeline, in document order.
    """
    doc = _get_pipeline()(book)
    return [sent.text for sent in doc.sents]

In [None]:
# Sentence-level frame for book 1: one row per sentence, tagged with its book.
sents_1 = get_sents(book_1)
df_1 = pd.DataFrame(
    {'sentence': sents_1, 'book': 'Book_1'},
    columns=['sentence', 'book'],
)
len(sents_1)

In [None]:
# Sentence-level frame for book 2: one row per sentence, tagged with its book.
sents_2 = get_sents(book_2)
df_2 = pd.DataFrame(
    {'sentence': sents_2, 'book': 'Book_2'},
    columns=['sentence', 'book'],
)
len(sents_2)

In [None]:
# Sentence-level frame for book 3. This cell used to repeat the book 2 cell,
# which left sents_3 / df_3 undefined and made the later concat and
# sentence-total cells fail with NameError.
sents_3 = get_sents(book_3)
df_3 = {'sentence': sents_3,
        'book': 'Book_3'}
df_3 = pd.DataFrame(df_3, columns = ['sentence', 'book'])
len(sents_3)

In [None]:
# Sentence-level frame for book 4: one row per sentence, tagged with its book.
sents_4 = get_sents(book_4)
df_4 = pd.DataFrame(
    {'sentence': sents_4, 'book': 'Book_4'},
    columns=['sentence', 'book'],
)
len(sents_4)

In [None]:
# Sentence-level frame for book 5: one row per sentence, tagged with its book.
sents_5 = get_sents(book_5)
df_5 = pd.DataFrame(
    {'sentence': sents_5, 'book': 'Book_5'},
    columns=['sentence', 'book'],
)
len(sents_5)

In [None]:
# Sentence-level frame for book 6: one row per sentence, tagged with its book.
sents_6 = get_sents(book_6)
df_6 = pd.DataFrame(
    {'sentence': sents_6, 'book': 'Book_6'},
    columns=['sentence', 'book'],
)
len(sents_6)

In [None]:
# Stack the per-book sentence frames into one frame with a fresh 0..n-1 index.
df_all = pd.concat(
    [df_1, df_2, df_3, df_4, df_5, df_6],
    ignore_index=True,
)
df_all.head(10)

In [None]:
# Total number of sentences across the six books.
sum(map(len, (sents_1, sents_2, sents_3, sents_4, sents_5, sents_6)))

In [None]:
# Tokenise every sentence and keep only purely alphabetic tokens.
df_all['token'] = df_all['sentence'].apply(word_tokenize)\
.apply(lambda x: [item for item in x if item.isalpha()])

# Remove stop words, then stem what is left.
# `stop` is not used by the cells shown here; it is kept in case later cells
# read it.
stop = stopwords.words('english')
porter_stemmer = PorterStemmer()
# The NLTK stop-word list is lowercase, so compare on the lowercased token;
# otherwise capitalised stop words ("The", "He", "I") slip through the filter.
df_all['clean token'] = df_all['token'].apply(
    lambda tokens: [
        porter_stemmer.stem(token)
        for token in tokens
        if token.lower() not in stop_words
    ]
)
df_all.head(15)

##  Advanced Methods

In [None]:
# Bag-of-words counts per sentence, using unigrams and sklearn's built-in
# English stop-word list. With the default token pattern, tokens are runs of
# two or more alphanumeric characters and punctuation only acts as a separator.
count_vect = CountVectorizer(stop_words='english')
X_counts = count_vect.fit_transform(df_all['sentence'])

# Re-weight the raw counts to TF-IDF.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
print(X_tfidf.shape)

In [None]:
# Fixed seed so the clustering is reproducible.
random_state = 666

# Six clusters, one per book, fitted on the TF-IDF sentence vectors.
cls = MiniBatchKMeans(n_clusters=6, random_state=random_state).fit(X_tfidf)

# Cluster index assigned to each sentence.
cls.predict(X_tfidf)

In [None]:
# Project the TF-IDF vectors onto their first two principal components.
# NOTE(review): .toarray() materialises the whole sparse matrix as a dense
# array, which can be very large for many sentences -- confirm it fits in memory.
pca = PCA(n_components=2, random_state=random_state)
reduced_features = pca.fit_transform(X=X_tfidf.toarray())

# Map the fitted cluster centres into the same 2D space.
reduced_cluster_centers = pca.transform(X=cls.cluster_centers_)

In [None]:
# Sentences in PCA space, coloured by their predicted cluster.
plt.scatter(
    x=reduced_features[:, 0],
    y=reduced_features[:, 1],
    c=cls.predict(X_tfidf),
)
# Cluster centres drawn on top as large blue crosses.
plt.scatter(
    x=reduced_cluster_centers[:, 0],
    y=reduced_cluster_centers[:, 1],
    marker='x',
    s=150,
    c='b',
)

In [None]:
# Display the 2D (PCA-projected) coordinates of the cluster centres.
reduced_cluster_centers

In [None]:
# Evaluation against the known labels: the book each sentence came from is
# the true class, the predicted cluster is the assignment being scored.
homogeneity_score(
    labels_true=df_all['book'],
    labels_pred=cls.predict(X_tfidf),
)

In [None]:
# Silhouette score ranges from -1 (worst) to 1 (best);
# values close to 0 indicate overlapping clusters.
silhouette_score(X=X_tfidf, labels=cls.predict(X_tfidf))

In [None]:
# Silhouette score ranges from -1 (worst) to 1 (best);
# values close to 0 indicate overlapping clusters.
# NOTE(review): this cell repeats the preceding silhouette cell verbatim and
# recomputes the same score; it can likely be deleted.
silhouette_score(X=X_tfidf, labels=cls.predict(X_tfidf))

In [None]:
# Add the cluster label of every sentence to the data frame.
# The fitted model is named `cls`; the previous reference to `kmeans` was
# never defined in this notebook and raised NameError.
df_all['cluster'] = cls.labels_

# Count sentences per (cluster, book) pair and show them as a heatmap with
# clusters as rows and books as columns.
clusters = df_all.groupby(['cluster', 'book']).size()
fig, ax1 = plt.subplots(figsize = (26, 15))
sns.heatmap(clusters.unstack(level = 'book'), ax = ax1, cmap = 'Reds')
ax1.set_xlabel('book').set_size(18)
ax1.set_ylabel('cluster').set_size(18)