In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# URL template for a Goodreads tag page: first slot is the tag, second is the page number.
url = 'https://www.goodreads.com/quotes/tag/{}?page={}'

In [3]:
# Tags to scrape; each tag is also used as the class label for its quotes.
emotions = ['love', 'religion']

In [4]:
# Build the URL of page 1 for the first tag, used below to try out the scraper.
complete = url.format(emotions[0], 1)

In [5]:
# Show the formatted URL to confirm both template slots were filled.
complete

'https://www.goodreads.com/quotes/tag/love?page=1'

In [6]:
def get_quotes(complete, timeout=10):
    """Download one quotes page and return the quote texts found on it.

    Parameters
    ----------
    complete : str
        Fully formatted page URL.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of str
        One entry per ``<div class="quoteText">`` element: the first line of
        the element's stripped text with its first and last characters removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    data = requests.get(complete, timeout=timeout)
    # Fail loudly on an error page instead of silently returning no quotes.
    data.raise_for_status()
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(data.text, 'html.parser')
    divs = soup.find_all('div', attrs={'class': 'quoteText'})
    # The first line of each div is the quote itself; [1:-1] drops its first
    # and last characters — presumably the surrounding quotation marks.
    quotes = [div.text.strip().split('\n')[0][1:-1] for div in divs]
    return quotes

In [7]:
# Trial run of the scraper on the single URL built above.
quotes = get_quotes(complete)

In [8]:
# Scrape pages 1-5 of every tag, collecting quote texts in X and the
# matching tag label for each quote in y (the two lists stay aligned).
X = []
y = []

for emotion in emotions:
    for i in range(1, 6):
        complete = url.format(emotion, i)
        quotes = get_quotes(complete)
        X += quotes
        # One label per quote found on this page.
        y += len(quotes) * [emotion]
        print(f'Processed page {i} for {emotion}')

Processed page 1 for love
Processed page 2 for love
Processed page 3 for love
Processed page 4 for love
Processed page 5 for love
Processed page 1 for religion
Processed page 2 for religion
Processed page 3 for religion
Processed page 4 for religion
Processed page 5 for religion


In [9]:
# Spot-check one scraped quote.
X[100]

'The very essence of romance is uncertainty.'

In [10]:
# Spot-check one label.
y[200]

'religion'

In [11]:
import pandas as pd

In [12]:
# Two-column frame: the tag label and the raw quote text, row-aligned.
df = pd.DataFrame({'emotion': y, 'quotes': X})

In [13]:
# Persist the scraped data (without the index column) to the working directory.
df.to_csv('emotions.csv', index=False)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Bag-of-words vectorizer whose vocabulary is capped at 500 features.
vect = CountVectorizer(max_features=500)

In [16]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [17]:
# Raw string: '\w' in a plain literal is an invalid escape sequence, which
# Python warns about; the regex itself is unchanged.
tokenizer = RegexpTokenizer(r'\w+')
# Kept as a set so the per-token membership test in getStemmedQuote is cheap.
sw = set(stopwords.words('english'))
ps = PorterStemmer()

In [18]:
def getStemmedQuote(quote):
    """Normalise one quote into a space-separated string of stemmed tokens.

    The quote is lower-cased, split with the module-level ``tokenizer``,
    filtered against the stop-word set ``sw``, and every remaining token is
    stemmed with ``ps``.

    Parameters
    ----------
    quote : str
        Text of a single quote.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    words = tokenizer.tokenize(quote.lower())
    # Drop stop words and stem the survivors in a single pass.
    return ' '.join(ps.stem(word) for word in words if word not in sw)

def getStemmedQuotes(quotes):
    """Return a new list with ``getStemmedQuote`` applied to each quote, in order."""
    return [getStemmedQuote(text) for text in quotes]

In [19]:
# Replace the raw quotes with their cleaned, stemmed form.
# NOTE: X is overwritten in place, so re-running this cell feeds already
# stemmed text back through the stemmer; re-run the scraping cell first.
X = getStemmedQuotes(X)

In [20]:
# Learn the vocabulary from the cleaned quotes.
# NOTE(review): this is fitted on all of X, before the train/test split
# below, so the vocabulary is also informed by the held-out rows.
vect.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=500, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [21]:
# Number of terms in the learned vocabulary.
len(vect.vocabulary_)

500

In [22]:
# Turn the cleaned quotes into a dense count matrix.
# .toarray() yields a plain ndarray; .todense() would yield numpy.matrix,
# which current scikit-learn estimators refuse as input.
X_mod = vect.transform(X).toarray()

In [23]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out a third of the rows for evaluation; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_mod, y, test_size=0.33, random_state=42)

In [25]:
from sklearn.naive_bayes import BernoulliNB

In [26]:
# Bernoulli naive Bayes classifier with its default settings.
model = BernoulliNB()

In [27]:
# Train on the training portion of the split.
model.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [28]:
# Evaluate on the held-out rows.
model.score(X_test, y_test)

0.8080808080808081

In [29]:
# Free-text sample (not from the scraped data) to classify with the trained model.
line = "You're just too good to be true can't take my eyes off you you'd be like heaven to touch I wanna hold you so much I love you baby"

In [30]:
# Run the sample through the same cleaning/stemming as the training quotes
# before vectorizing; otherwise unstemmed words miss the stemmed vocabulary.
# .toarray() gives a plain ndarray rather than the numpy.matrix from .todense().
X_vec = vect.transform([getStemmedQuote(line)]).toarray()

In [31]:
# Predicted tag for the sample text.
model.predict(X_vec)

array(['love'], dtype='<U8')