This file takes the blurbs scraped from a major book website and cleans them. The process is as follows:
1. Clean the URLs of the scraped blurbs so that they can be compared with the 'title' column in our Book-Crossing dataset.
2. Use a function that crawls through the URLs and matches them to titles, shifting the cells up and down throughout the process. This is necessary because the scrape skipped many titles and added nonsense data randomly.
3. Many of the blurbs have repeated sections, which I identified and deleted.
4. For each blurb, break it into sentences and generate BERT vectors using bert-as-service.
5. Add the vectors to the books dataframe and save the result as a CSV file.

In [1]:
import numpy as np
import pandas as pd
import string
import re
import time
import random

from bert_serving.client import BertClient

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans



In [13]:
### Data

# BookCrossing
# http://www2.informatik.uni-freiburg.de/~cziegler/BX/

# Malformed rows (wrong number of ';'-separated fields) are skipped rather than
# aborting the load. pandas renamed the switch for this: `on_bad_lines='skip'` from
# 1.3 onwards, `error_bad_lines=False` before that (and the old name is gone in 2.0),
# so pick whichever spelling the installed pandas understands.
_pd_major_minor = tuple(int(part) for part in pd.__version__.split('.')[:2])
if _pd_major_minor >= (1, 3):
    _skip_bad_rows = {'on_bad_lines': 'skip'}
else:
    _skip_bad_rows = {'error_bad_lines': False}

df_ratings = pd.read_csv('/DataScience/BX-CSV-Dump/BX-Book-Ratings.csv', sep=';', encoding="latin-1", **_skip_bad_rows)
df_users = pd.read_csv('/DataScience/BX-CSV-Dump/BX-Users.csv', sep=';', encoding='latin-1')
df_books = pd.read_csv('/DataScience/BX-CSV-Dump/BX-Books.csv', sep=';', encoding="latin-1", **_skip_bad_rows)

# Renaming columns for ease of use, and dropping image links I won't be using
df_ratings.rename(columns={'User-ID': 'User', 'Book-Rating': 'Rating'}, inplace=True)
df_users.rename(columns={'User-ID': 'User'}, inplace=True)
df_books.rename(columns={'Book-Title': 'Title', 'Year-Of-Publication': 'Year', 'Book-Author': 'Author'}, inplace=True)
# `columns=` instead of a positional axis argument: the positional form is rejected
# by recent pandas releases.
df_books.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'], inplace=True)

# Using only explicit ratings. 0 ratings are 'implicit' in documentation of data:
df_ratings = df_ratings[df_ratings['Rating'] > 0]

# Inner join df_ratings and df_books: keep only ratings whose book is known, then
# only books that still have at least one (explicit) rating.
df_ratings = df_ratings[df_ratings['ISBN'].isin(df_books.ISBN.unique())]
df_books = df_books[df_books['ISBN'].isin(df_ratings.ISBN.unique())]

df_books.reset_index(inplace=True, drop=True)
df_ratings.reset_index(inplace=True, drop=True)
df_users.reset_index(inplace=True, drop=True)

# 2 batches of scraped blurbs from Goodreads
first_scrape = pd.read_csv('/Users/jdobrow/Code/blurbs1.csv')
second_scrape = pd.read_csv('/Users/jdobrow/Code/blurbs2.csv')


b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


In [165]:
# We want to add our scraped blurbs to df_books, but the scraped data is messy and
# does not line up correctly. This cell builds three parallel lists -- title words
# recovered from each scraped URL, title words from df_books, and the blurb text --
# and puts them side by side in compare_df so they can be aligned afterwards.


def _url_to_title_words(url):
    """Return the title words embedded in a scraped URL, or -1 if none are found.

    Two URL shapes are recognised:
    * the title is everything after the third '.', with words joined by '_';
    * otherwise the title is everything after the first '-', with words joined by '-'.
    """
    dotted = url.split('.', 3)
    if len(dotted) == 4:
        # Punctuation becomes a space; '_' is a word character for `\w`, so it
        # survives the substitution and can still be used as the word separator.
        return re.sub(r'[^\w\s]', ' ', dotted[3].lower()).split('_')
    _, dash, after_dash = url.partition('-')
    if dash:
        return after_dash.lower().split('-')
    return -1


def _title_to_words(title):
    """Lower-case a df_books title, replace punctuation with spaces and split into words."""
    if not isinstance(title, str):
        # e.g. a missing (NaN) title: keep the slot so the rows stay lined up.
        return None
    return re.sub(r'[^\w\s]', ' ', title.lower()).split()


# Our titles from the scraped URLs. They almost, but not quite match the actual titles.
scrapes = [first_scrape, second_scrape]
# The first batch was scraped for the first 100000 books, the second for the rest.
title_rows = [slice(None, 100000), slice(100000, None)]

url_titles, book_titles, blurbs = [], [], []
for chunk, rows in zip(scrapes, title_rows):
    chunk_urls = [_url_to_title_words(url) for url in chunk.URL]
    # The actual text of the blurb
    chunk_blurbs = list(chunk.text)
    # The titles from our data, tidied up a bit to assist matching
    chunk_titles = [_title_to_words(title) for title in df_books['Title'][rows]]

    # Pad the shorter side with None so the three lists of this batch have the same
    # length and the next batch starts on the same row in all of them.
    width = max(len(chunk_urls), len(chunk_titles))
    chunk_urls += [None] * (width - len(chunk_urls))
    chunk_blurbs += [None] * (width - len(chunk_blurbs))
    chunk_titles += [None] * (width - len(chunk_titles))

    url_titles.extend(chunk_urls)
    blurbs.extend(chunk_blurbs)
    book_titles.extend(chunk_titles)


compare_df = pd.DataFrame()
compare_df['URLTitle'] = url_titles
compare_df['BookTitle'] = book_titles
compare_df['Blurb'] = blurbs

In [166]:
# Checks if the title and url match each other. Matches are determined by how many
# shared words there are
stop_words = ['the', 'of', 'if', 'and', 'it', 'as', 'or']
def check_matched_up(url_index, book_index, data):
    """Return True if the URL title at `url_index` matches the book title at `book_index`.

    `data` must expose `URLTitle` and `BookTitle`, each indexable by row and holding a
    list of words per row. The two titles match when the number of URL words that
    also occur in the book title (stop words excluded) reaches half the length of
    the shorter title, with a minimum of one shared word.

    Rows that are out of range or do not hold a word list (None, NaN, the -1
    placeholder) never match: the function returns False instead of raising.
    """
    try:
        url_words = data.URLTitle[url_index]
        title_words = data.BookTitle[book_index]
        shared = sum(
            1 for word in url_words
            if word in title_words and word not in stop_words
        )
        needed = max(min(len(url_words) // 2, len(title_words) // 2), 1)
    except (TypeError, KeyError, IndexError):
        # TypeError: the entry is not a word list; KeyError / IndexError: no such row.
        # Deliberately not a bare `except`, so Ctrl-C still interrupts the long
        # alignment loop that calls this function.
        return False
    return shared >= needed

In [None]:
# Iterates through the data and check if titles match up, and if not corrects them by
# shifting all of the data up or down or in some cases swapping entries.
#
# The work is done on the url_titles / book_titles / blurbs lists themselves and
# compare_df is rebuilt once at the end. Previously a swap was written only into
# compare_df, so it was lost as soon as a later step rebuilt compare_df from the
# lists, and every unmatched row triggered a rebuild of the whole frame.
from types import SimpleNamespace

# check_matched_up only does `data.URLTitle[i]` / `data.BookTitle[i]`, so it can read
# the lists directly; an out-of-range `row + 1` simply counts as "no match".
aligned = SimpleNamespace(URLTitle=url_titles, BookTitle=book_titles)

row = 0
# The bound is re-read every pass because cases 3 and 4 make the lists one longer.
while row < len(url_titles):
    if not check_matched_up(row, row, aligned):
        url_fits_next_book = check_matched_up(row, row + 1, aligned)
        next_url_fits_book = check_matched_up(row + 1, row, aligned)

        if url_fits_next_book and next_url_fits_book:
            # case 2: 2 adjacent rows need to be swapped (scraped side only)
            url_titles[row], url_titles[row + 1] = url_titles[row + 1], url_titles[row]
            blurbs[row], blurbs[row + 1] = blurbs[row + 1], blurbs[row]
        elif next_url_fits_book:
            # case 3: Insert an empty entry into book titles to match things up.
            # The other two lists get a trailing None to keep all lengths equal.
            book_titles.insert(row, None)
            url_titles.append(None)
            blurbs.append(None)
        elif url_fits_next_book:
            # case 4: Insert an empty entry into url titles to match things up
            url_titles.insert(row, None)
            blurbs.insert(row, None)
            book_titles.append(None)
        else:
            # case 5: nothing nearby fits; blank both titles so the row is dropped
            # by the dropna() that follows.
            book_titles[row] = None
            url_titles[row] = None
    # case 1 (already matched) needs no change. In every case the next row to look
    # at is the following one.
    row += 1

compare_df = pd.DataFrame()
compare_df['URLTitle'] = url_titles
compare_df['BookTitle'] = book_titles
compare_df['Blurb'] = blurbs

In [215]:
# Keep only the rows where every column has a value, then renumber them from 0
compare_df = compare_df.dropna().reset_index(drop=True)

In [247]:
# Move blurbs from compare_df to a new dataframe to work out of
#
# compare_df's titles were taken from df_books in order, so one forward pass with a
# cursor into compare_df is enough. The pass runs over *all* of df_books and rows
# with missing values are dropped only afterwards: dropping first could remove the
# very book the cursor is waiting for, after which no later book would ever match.

pending_titles = list(compare_df.BookTitle)
pending_blurbs = list(compare_df.Blurb)

blurbs = []
blurb_row = 0
for title in df_books['Title']:
    if isinstance(title, str):
        cleaned_title = re.sub(r'[^\w\s]', ' ', title.lower()).split()
    else:
        # missing title: cannot match anything
        cleaned_title = None
    # Bounds check comes first so the cursor can never index past the last blurb.
    if blurb_row < len(pending_titles) and cleaned_title == pending_titles[blurb_row]:
        blurbs.append(pending_blurbs[blurb_row])
        blurb_row += 1
    else:
        blurbs.append(None)

active = df_books.reset_index(drop=True)
active['Blurb'] = blurbs

# Keep only complete rows: books with a blurb and no other missing field.
active = active.dropna().reset_index(drop=True)


In [251]:
# Due to the nature of the scraped data, many of the blurbs have a repeated section.
# This block finds the repeat and deletes it.


def _strip_repeated_intro(blurb):
    """Return `blurb` cut at the point where its opening text shows up a second time.

    A short probe (up to 10 characters, starting at a letter) is picked from the
    beginning of the blurb and looked up in the text after character 200. If it is
    found there, everything before that second occurrence is dropped, keeping the
    same number of leading characters that preceded the probe the first time.
    If no probe can be picked or it does not repeat, the blurb is returned unchanged.
    """
    tail = blurb[200:]
    try:
        go = False
        last_i = -1
        first_i = 0
        # Try successive candidate probes until one is 10 characters long, the end
        # of the blurb is reached, or the scan has moved ~190 characters in.
        while (not go) and (first_i < 190):
            # Skip ahead to the next letter; first_i ends one past that letter.
            first_i = last_i + 1
            first_char = '!'
            while not first_char.isalpha():
                first_char = blurb[first_i]
                first_i += 1
            # Extend the candidate while the characters are letters or whitespace.
            last_char = 'a'
            last_i = first_i + 1
            str_len = 0
            while (last_char.isalpha() or last_char.isspace()) and (str_len < 10):
                str_len = last_i - first_i + 1
                last_char = blurb[last_i]
                last_i += 1
            if str_len == 10:
                go = True
            if last_i > len(blurb) - 2:
                go = True
    except IndexError:
        # Ran off the end of the blurb while scanning: go with the raw blurb since
        # it's close enough.
        return blurb

    probe = blurb[(first_i - 1):(last_i - 1)]
    # Plain substring search: the probe is literal text, and treating it as a regex
    # pattern breaks on characters such as '(' or '?'.
    hit = tail.find(probe)
    if hit == -1:
        return blurb
    return blurb[hit + 200 - first_i + 1:]


# Most of the very short blurbs appear to be nonsense, so those become None and are
# dropped below.
updated_blurbs = [
    _strip_repeated_intro(blurb) if len(blurb) > 100 else None
    for blurb in active.Blurb
]

active['Blurb'] = updated_blurbs
active = active.dropna().reset_index(drop=True)


In [258]:
# Get the BERT vectors for each sentence in a blurb, and then average them

bc = BertClient()

# A sentence ends at '.', '!' or '?' followed by a space. Written as a raw string so
# that `\.` and `\?` are regex escapes instead of invalid Python string escapes, and
# compiled once because it is reused for every blurb.
sentence_break = re.compile(r'\. |! |\? ')

vector_list = []
for blurb in active.Blurb:
    # Empty and whitespace-only pieces are not sent to the encoder.
    sentences = [
        piece for piece in sentence_break.split(blurb)
        if piece and not piece.isspace()
    ]
    vectors = bc.encode(sentences)

    # One vector per blurb: the element-wise mean over its sentence vectors.
    vector_list.append(np.mean(vectors, axis=0))

here is what you can do:
- or, start a new server with a larger "max_seq_len"
  '- or, start a new server with a larger "max_seq_len"' % self.length_limit)


1
501
1001
1501
2001
2501
3001
3501
4001
4501
5001
5501
6001
6501
7001
7501
8001
8501
9001
9501
10001
10501
11001
11501
12001
12501
13001
13501
14001
14501
15001
15501
16001
16501
17001
17501
18001
18501
19001
19501
20001
20501
21001
21501


In [270]:
# Put the per-blurb vectors next to the book columns (one vector component per
# extra column) and write the combined table to a local csv file

vectors = pd.DataFrame(vector_list)
active = pd.concat((active, vectors), axis='columns')
active.to_csv(
    '/DataScience/Final Capstone Files/books_with_blurbs_and_BERT_combined.csv'
)
