Input:
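Directory ./data containing one plain-text file per book, named "<title> - <author>.txt"
CSV file (./books.csv) with one header row followed by one row of metadata per book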

Output:
SQL script (./book_seed.sql) with one INSERT statement per book, used to seed the Book table

Book
- title
- author
- publicationDate
- genre
- subjects[]
- keywords[]
- similarManuscripts[]
- description
- isbn13
- isbn10
- amazonProductUrl
- plotStructure[]

insert into "Book" ("title", "author", "publicationDate", "genre", "subjects", "keywords", "similarManuscripts", "description", "cover", "isbn13", "isbn10", "amazonProductUrl", "plotStructure", "createdAt", "updatedAt") 

Flow:
Read the CSV.
For each book, generate analytics (plot structure derived from sentence-level sentiment).
Write one SQL INSERT line per book to the output script.



In [1]:
import nltk
import nltk.data
from nltk.sentiment import SentimentIntensityAnalyzer
import numpy as np
import csv
import math


In [2]:
# NLTK resources fetched by this notebook; packages that are already
# installed locally are reported as up-to-date and left untouched.
required_packages = [
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
]
nltk.download(required_packages)

[nltk_data] Downloading package names to /Users/idanhahn/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/idanhahn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     /Users/idanhahn/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/idanhahn/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/idanhahn/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/idanhahn/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/idanh

True

In [3]:
def moving_avarage(x, w):
  """Return the simple moving average of x over a sliding window of w samples.

  Args:
    x: one-dimensional sequence of numbers.
    w: window length (positive integer).

  Returns:
    A numpy array holding the mean of every full window; with
    mode='valid' that is len(x) - w + 1 values when w <= len(x).
  """
  # The kernel np.ones(w)/w already averages each window, so the result
  # must not be divided by w a second time.
  return np.convolve(x, np.ones(w) / w, mode='valid')

In [4]:
# Shared, module-level NLP helpers used by process_plot_structure:
# - tokenizer: the pickled English punkt model, used to split raw text into pieces
# - sia: sentiment analyzer whose polarity_scores() result is read for its 'compound' entry
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sia = SentimentIntensityAnalyzer()

In [5]:
def process_plot_structure(book_file, window=1000, points=100):
    """
    Processes the book file and returns plot structure chart points.

    The text is split with the module-level ``tokenizer``, each piece is
    scored with ``sia`` (compound score scaled by 100), the scores are
    smoothed with ``moving_avarage``, min-max normalized to the range
    -50..50 and finally thinned out by striding.

    Args:
        book_file: path of the UTF-8 encoded text file to analyze.
        window: size of the moving-average window, in tokenized pieces.
            Clamped to the number of pieces when the text is shorter.
        points: upper bound on the number of chart points returned.

    Returns:
        A list of ints between -50 and 50. It holds at most ``points``
        entries (striding can yield fewer), and is empty when the text
        produces no tokens. A completely flat curve yields zeros.
    """
    # Context manager guarantees the file handle is released.
    with open(book_file, "r", encoding="utf8") as fb:
        raw = fb.read()

    scores = [sia.polarity_scores(sentence)['compound'] * 100
              for sentence in tokenizer.tokenize(raw)]
    if not scores:
        # Nothing to chart; np.convolve would raise on an empty input.
        return []

    # With a window longer than the data, np.convolve silently swaps its
    # operands and returns a meaningless curve, so clamp the window.
    y_ma = moving_avarage(scores, min(window, len(scores)))

    lowest = y_ma.min()
    spread = y_ma.max() - lowest
    if spread == 0:
        # Flat curve: avoid 0/0 (NaN would make math.floor raise) and
        # place every point at the middle of the -50..50 range.
        y_norm = np.zeros(len(y_ma))
    else:
        y_norm = 100 * (y_ma - lowest) / spread - 50

    # Keep every step-th sample so that no more than `points` remain.
    step = math.ceil(len(y_norm) / points)
    y_floored = [math.floor(e) for e in y_norm[::step]]
    if len(y_floored) == points - 1:
        # One short of the target: top up with the final smoothed value.
        y_floored.append(math.ceil(y_norm[-1]))

    return y_floored

In [13]:
def _sql_literal(value):
    """Return value as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


# Columns written for every book, in the same order as the values below.
BOOK_COLUMNS = (
    "title", "author", "genre", "publicationDate", "description",
    "subjects", "keywords", "isbn13", "isbn10", "amazonProductUrl",
    "plotStructure", "createdAt", "updatedAt",
)

# Generate ./book_seed.sql: one INSERT statement per data row of ./books.csv.
# newline='' lets the csv module handle line breaks inside quoted fields;
# the context managers close both files even if a book fails to process.
with open('./books.csv', 'r', newline='', encoding='utf8') as input_file:
    with open('./book_seed.sql', 'w', encoding='utf8') as output_file:
        csvreader = csv.reader(input_file)
        next(csvreader, None)  # skip the header row (tolerates an empty file)
        column_list = ", ".join(f'"{name}"' for name in BOOK_COLUMNS)

        for row in csvreader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to insert.
                continue

            # Placeholder array literals, not yet derived from the book.
            subjects = "{subjects}"
            keywords = "{keywords}"

            # The book text is looked up by "<title> - <author>.txt".
            plot_structure_array = process_plot_structure(f'./data/{row[1]} - {row[2]}.txt')
            plot_structure = "{" + ",".join(str(i) for i in plot_structure_array) + "}"

            # Every text value goes through _sql_literal so that an
            # apostrophe in any field (e.g. an author name) cannot break
            # the generated statement.
            values = [
                _sql_literal(row[1]),                    # title
                _sql_literal(row[2]),                    # author
                _sql_literal(row[4]),                    # genre
                _sql_literal(row[3]),                    # publicationDate
                _sql_literal(row[5].replace('"', '')),   # description, double quotes stripped
                _sql_literal(subjects),
                _sql_literal(keywords),
                _sql_literal(row[8]),                    # isbn13
                _sql_literal(row[9]),                    # isbn10
                _sql_literal(row[10]),                   # amazonProductUrl
                # NOTE(review): previously computed but never written; assumes
                # the "plotStructure" column exists in the Book table — confirm.
                _sql_literal(plot_structure),
                'NOW()',                                 # createdAt
                'NOW()',                                 # updatedAt
            ]
            output_file.write(
                f'INSERT INTO "Book" ({column_list}) VALUES ({", ".join(values)});\n'
            )