In [None]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
import pandas as pd

import os
import sys
# Resolve the directory three levels above the notebook's working directory
# and make it importable so the `ai_lit` package can be loaded from it.
_levels_up = [os.pardir] * 3
ai_lit_path = os.path.abspath(os.path.join(os.getcwd(), *_levels_up))
print("Loading AI Lit system from path", ai_lit_path)
sys.path.append(ai_lit_path)
from ai_lit.input import input_util
from ai_lit.input.gutenberg_dataset import gb_input as gb
from ai_lit.input.gutenberg_dataset import gb_dataset_util

# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60) 

# include matplot inline in the workbook
%matplotlib inline
matplotlib.style.use('ggplot')

In [None]:
# Load the inputs from the `workspace/gb_input` directory under the repo root.
# NOTE(review): 5000 is presumably the vocabulary size -- confirm against
# gb.get_inputs.
train, test, sbjs, vocab = gb.get_inputs(
    os.path.join(ai_lit_path, 'workspace', 'gb_input'),
    5000,
)

In [None]:
# Report how many records each split contains.
for _label, _split in (("Train set length:", train), ("Test set length:", test)):
    print(_label, len(_split))

In [None]:
# Use the instance attributes of the first training record as column names,
# then build one row per record by reading those attributes in order.
variables = vars(train[0]).keys()
train_df = pd.DataFrame(
    [[getattr(record, name) for name in variables] for record in train],
    columns=variables,
)
test_df = pd.DataFrame(
    [[getattr(record, name) for name in variables] for record in test],
    columns=variables,
)

In [None]:
# Count the rows whose author or title is the literal string 'unknown',
# for the training frame first and then the test frame.
_unknown_checks = (
    ("[Train] Number of unknown authors:", train_df, 'author'),
    ("[Train] Number of unknown titles:", train_df, 'title'),
    ("[Test] Number of unknown authors:", test_df, 'author'),
    ("[Test] Number of unknown titles:", test_df, 'title'),
)
for _label, _frame, _column in _unknown_checks:
    print(_label, len(_frame[_frame[_column] == 'unknown']))

In [None]:
def _count_chapters(body):
    """Return the number of items input_util.parse_chapters yields for a body."""
    return len(input_util.parse_chapters(body))


# Add a per-book chapter count to both frames.
for _frame in (train_df, test_df):
    _frame['chapter_count'] = _frame['body'].apply(_count_chapters)

In [None]:
# Token count of every book body. The `length` column is aggregated by the
# per-subject length statistics later in this notebook, so it has to exist
# after a fresh "Restart & Run All". The computation is skipped when the
# frame already carries a `length` column (e.g. taken from the record
# attributes), so existing values are never overwritten.
# NOTE(review): tokenising every body is presumably slow, which may be why
# this cell was commented out -- confirm.
for _frame in (train_df, test_df):
    if 'length' not in _frame.columns:
        _frame['length'] = _frame['body'].apply(lambda text: len(nltk.word_tokenize(text)))

In [None]:
# Subject labels passed to get_atomic_subject together with each book's
# `subjects` value; the result is stored in a new `atomic_subject` column.
# NOTE(review): presumably this reduces a book's subject list to one of these
# labels -- confirm against gb_dataset_util.get_atomic_subject.
subjects = [
    "Science fiction",
    "Adventure stories",
    "Historical fiction",
    "Love stories",
    "Detective and mystery stories",
    "Western stories",
]
for _frame in (train_df, test_df):
    _frame['atomic_subject'] = _frame['subjects'].apply(
        lambda book_subjects: gb_dataset_util.get_atomic_subject(book_subjects, subjects)
    )

In [None]:
# Stack the training and test frames into one frame for whole-corpus
# statistics. ignore_index=True gives the result a fresh 0..n-1 index;
# without it the row labels of both inputs are kept and collide, which makes
# any label-based selection on full_df ambiguous.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
# Books per atomic subject: training split, test split, then both combined.
for _frame in (train_df, test_df, full_df):
    print(_frame['atomic_subject'].value_counts())

In [None]:
# Mean chapter count per atomic subject: training split, test split, combined.
for _frame in (train_df, test_df, full_df):
    print(_frame.groupby('atomic_subject')['chapter_count'].mean())

In [None]:
# Largest chapter count observed within each atomic subject.
full_df.groupby('atomic_subject')['chapter_count'].max()

In [None]:
# Smallest chapter count observed within each atomic subject.
full_df.groupby('atomic_subject')['chapter_count'].min()

In [None]:
# Standard deviation of the chapter count within each atomic subject.
full_df.groupby('atomic_subject')['chapter_count'].std()

In [None]:
# Per-subject share of books whose chapter_count is exactly 2: the count of
# such books in each subject divided by that subject's total book count.
_two_chapter_books = full_df[full_df['chapter_count'] == 2]
_two_chapter_counts = _two_chapter_books['atomic_subject'].value_counts()
_subject_totals = full_df['atomic_subject'].value_counts()
_two_chapter_counts.div(_subject_totals)

In [None]:
# Mean of the `length` column within each atomic subject.
full_df.groupby('atomic_subject')['length'].mean()

In [None]:
# Maximum of the `length` column within each atomic subject.
full_df.groupby('atomic_subject')['length'].max()

In [None]:
# Minimum of the `length` column within each atomic subject.
full_df.groupby('atomic_subject')['length'].min()

In [None]:
# Standard deviation of the `length` column within each atomic subject.
full_df.groupby('atomic_subject')['length'].std()

In [None]:
from ai_lit.input.gutenberg_dataset import gb_chapters_dataset

# Re-read the subjects and vocabulary from the workspace, keep the first 5001
# entries of the sorted vocabulary, and extract the chapter-level records.
# NOTE(review): the meaning of the trailing -1 argument is not visible here --
# confirm against gb_chapters_dataset.extract_dataset.
wkspc = os.path.join(ai_lit_path, 'workspace', 'gb_input')
subjects = gb.get_subjects(wkspc)
vocab = input_util.get_sorted_vocab(gb.get_vocabulary(wkspc))[:5001]
records, title_map = gb_chapters_dataset.extract_dataset(
    wkspc,
    len(subjects),
    vocab,
    -1,
)

In [None]:
# One row per key of `records`: the key, its title_map entry and its chapters.
_chapter_rows = [
    [book_idx, title_map[book_idx], records[book_idx]]
    for book_idx in records
]
chap_df = pd.DataFrame(_chapter_rows, columns=['id', 'title-author', 'chapters'])

In [None]:
def _mean_chapter_len(chapters):
    """Return the total length of all chapters divided by the chapter count."""
    return sum([len(chapter) for chapter in chapters]) / len(chapters)


# Average chapter length per book, followed by summary statistics over books.
chap_df['avg-len'] = chap_df['chapters'].apply(_mean_chapter_len)
_avg_len = chap_df['avg-len']
print("Mean:", _avg_len.mean())
print("Median:", _avg_len.median())
print("Min (avg):", _avg_len.min())
print("Max (avg):", _avg_len.max())

In [None]:
# Number of chapters per book, then the summary statistics of that column.
chap_df['chapter-count'] = chap_df['chapters'].apply(len)
chap_df['chapter-count'].describe()

In [None]:
# Same extraction as for the training records, but with train=False.
v_records, v_title_map = gb_chapters_dataset.extract_dataset(
    wkspc,
    len(subjects),
    vocab,
    -1,
    train=False,
)

In [None]:
# One row per key of `v_records` (extracted with train=False). Titles are
# looked up in `v_title_map`, the map returned together with `v_records`;
# the training `title_map` belongs to a different extraction and would give
# a KeyError or the wrong title for these keys.
# Note: this rebinds `chap_df`, replacing the frame built from `records`.
chap_df = pd.DataFrame(
    [[book_idx, v_title_map[book_idx], v_records[book_idx]] for book_idx in v_records],
    columns=['id', 'title-author', 'chapters'],
)

In [None]:
def _mean_chapter_len(chapters):
    """Return the total length of all chapters divided by the chapter count."""
    return sum([len(chapter) for chapter in chapters]) / len(chapters)


# Average chapter length per book for the current chap_df, then summary stats.
chap_df['avg-len'] = chap_df['chapters'].apply(_mean_chapter_len)
_avg_len = chap_df['avg-len']
print("Mean:", _avg_len.mean())
print("Median:", _avg_len.median())
print("Min (avg):", _avg_len.min())
print("Max (avg):", _avg_len.max())

In [None]:
# Number of chapters per book for the current chap_df, then its summary stats.
chap_df['chapter-count'] = chap_df['chapters'].apply(len)
chap_df['chapter-count'].describe()

In [None]:
# Total number of chapters across all books in the current chap_df.
_chapter_counts = chap_df['chapter-count']
_chapter_counts.sum()