In [1]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd

import os
import sys
# Resolve the directory three levels above the current working directory and
# append it to sys.path so the `ai_lit` imports that follow can be resolved.
parent_hops = [os.pardir] * 3
ai_lit_path = os.path.abspath(os.path.join(os.getcwd(), *parent_hops))
print("Loading AI Lit system from path", ai_lit_path)
sys.path.append(ai_lit_path)
from ai_lit.input import input_util
from ai_lit.input.gutenberg_dataset import gb_input as gb
from ai_lit.input.gutenberg_dataset import gb_dataset_util

# Widen pandas' text output so wide frames are not wrapped or truncated.
# NOTE: the former `display.mpl_style` option is intentionally not set here; it
# was removed from pandas and setting it raises an OptionError on current
# versions. Plot styling comes from matplotlib's 'ggplot' style sheet instead.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)

# include matplot inline in the workbook
%matplotlib inline
matplotlib.style.use('ggplot')

Loading AI Lit system from path C:\Users\joewo\workspaces\python\ai_lit




In [2]:
# Load the four values gb.get_inputs returns for the 'gb_input' workspace
# (unpacked as train, test, sbjs, vocab).
# NOTE(review): 5000 is presumably the vocabulary size limit -- confirm against gb.get_inputs.
gb_input_dir = os.path.join(ai_lit_path, 'workspace', 'gb_input')
train, test, sbjs, vocab = gb.get_inputs(gb_input_dir, 5000)

In [3]:
# Report how many records each split contains
train_size, test_size = len(train), len(test)
print("Train set length:", train_size)
print("Test set length:", test_size)

Train set length: 2528
Test set length: 1049


In [4]:
# The instance attributes of the first training record define the column set
# that is used for both frames.
variables = train[0].__dict__.keys()

def _records_to_frame(records):
    """Build a DataFrame with one row per record and one column per name in `variables`."""
    rows = [[getattr(record, name) for name in variables] for record in records]
    return pd.DataFrame(rows, columns = variables)

train_df = _records_to_frame(train)
test_df = _records_to_frame(test)

In [5]:
def _count_unknown(frame, column):
    """Return the number of rows of `frame` whose `column` equals the string 'unknown'."""
    return len(frame[frame[column] == 'unknown'])

# Report, per split, how many records carry the placeholder author / title
print("[Train] Number of unknown authors:", _count_unknown(train_df, 'author'))
print("[Train] Number of unknown titles:", _count_unknown(train_df, 'title'))
print("[Test] Number of unknown authors:", _count_unknown(test_df, 'author'))
print("[Test] Number of unknown titles:", _count_unknown(test_df, 'title'))

[Train] Number of unknown authors: 13
[Train] Number of unknown titles: 0
[Test] Number of unknown authors: 1
[Test] Number of unknown titles: 0


In [None]:
def _count_chapters(body):
    """Return len() of what input_util.parse_chapters produces for one book body."""
    return len(input_util.parse_chapters(body))

# Add a per-book chapter count to both splits
train_df['chapter_count'] = train_df['body'].apply(_count_chapters)
test_df['chapter_count'] = test_df['body'].apply(_count_chapters)

In [None]:
# Disabled: word-tokenizes every book body with nltk and stores the token count
# in a 'length' column on both splits.
# NOTE(review): the groupby(...)['length'] cells later in this notebook read that
# column and no other visible cell creates it, so those cells need these two
# lines enabled (and run before the train/test frames are concatenated).
#train_df['length'] = train_df['body'].apply(lambda x: len(nltk.word_tokenize(x)))
#test_df['length'] = test_df['body'].apply(lambda x: len(nltk.word_tokenize(x)))

In [7]:
def _atomic_subject(record_subjects):
    """Resolve one record's `subjects` value via gb_dataset_util.get_atomic_subject and `sbjs`."""
    return gb_dataset_util.get_atomic_subject(record_subjects, sbjs)

# Add the resolved subject label to both splits
train_df['atomic_subject'] = train_df['subjects'].apply(_atomic_subject)
test_df['atomic_subject'] = test_df['subjects'].apply(_atomic_subject)

In [8]:
# Stack the train and test frames into one frame for whole-dataset statistics
split_frames = [train_df, test_df]
full_df = pd.concat(split_frames)

In [9]:
# Number of books per atomic subject across both splits
subject_counts = full_df['atomic_subject'].value_counts()
print(subject_counts)

Science fiction                  1186
Adventure stories                 595
Love stories                      508
Detective and mystery stories     485
Historical fiction                410
Western stories                   393
Name: atomic_subject, dtype: int64


In [None]:
# Mean chapter count per atomic subject
mean_chapters_by_subject = full_df.groupby('atomic_subject')['chapter_count'].mean()
print(mean_chapters_by_subject)

In [None]:
# Mean chapter count over the whole dataset
overall_mean_chapters = full_df['chapter_count'].mean()
print(overall_mean_chapters)

In [None]:
# Largest chapter count per atomic subject
full_df.groupby('atomic_subject')['chapter_count'].max()

In [None]:
# Smallest chapter count per atomic subject
full_df.groupby('atomic_subject')['chapter_count'].min()

In [None]:
# Standard deviation of the chapter count per atomic subject
full_df.groupby('atomic_subject')['chapter_count'].std()

In [None]:
# Share of each atomic subject's books that have exactly 2 chapters.
# A subject with no such book is absent from the numerator, so a plain .div()
# would report NaN for it after index alignment; fill_value=0 makes it 0.0.
two_chapter_counts = full_df.loc[full_df['chapter_count'] == 2, 'atomic_subject'].value_counts()
subject_totals = full_df['atomic_subject'].value_counts()
two_chapter_counts.div(subject_totals, fill_value=0)

In [None]:
# Mean 'length' per atomic subject.
# NOTE(review): 'length' is only created by the commented-out tokenization cell
# earlier in this notebook; without it this lookup fails.
full_df.groupby('atomic_subject')['length'].mean()

In [None]:
# Largest 'length' per atomic subject.
# NOTE(review): depends on the 'length' column from the commented-out
# tokenization cell earlier in this notebook.
full_df.groupby('atomic_subject')['length'].max()

In [None]:
# Smallest 'length' per atomic subject.
# NOTE(review): depends on the 'length' column from the commented-out
# tokenization cell earlier in this notebook.
full_df.groupby('atomic_subject')['length'].min()

In [None]:
# Standard deviation of 'length' per atomic subject.
# NOTE(review): depends on the 'length' column from the commented-out
# tokenization cell earlier in this notebook.
full_df.groupby('atomic_subject')['length'].std()

In [3]:
from ai_lit.input.gutenberg_dataset import gb_chapters_dataset
# Workspace directory the Gutenberg helpers read from
wkspc = os.path.join(ai_lit_path, 'workspace', 'gb_input')
subjects = gb.get_subjects(wkspc)

# Sort the workspace vocabulary and keep only its first 5001 entries.
# NOTE(review): 5001 vs. the 5000 passed to gb.get_inputs earlier -- presumably one
# extra slot for a reserved token; confirm.
sorted_vocab = input_util.get_sorted_vocab(gb.get_vocabulary(wkspc))
vocab = sorted_vocab[:5001]

In [4]:
def _chapter_frame(book_records, book_titles):
    """Build a frame with one row per book: its id, its `book_titles` entry and its chapter records."""
    rows = [[book_idx, book_titles[book_idx], book_records[book_idx]] for book_idx in book_records]
    return pd.DataFrame(rows, columns = ['id', 'title-author', 'chapters'])

def _book_subject(chapters):
    """Return the `subjects` entry at the argmax of the first chapter's `target`."""
    first_chapter = next(iter(chapters.values()))
    return subjects[np.argmax(first_chapter.target)]

# Default extraction (no `train` argument given)
records, title_map = gb_chapters_dataset.extract_dataset(wkspc, len(subjects), vocab, -1)
train_chap_df = _chapter_frame(records, title_map)

# Extraction with train=False; `records` and `title_map` are rebound here
records, title_map = gb_chapters_dataset.extract_dataset(wkspc, len(subjects), vocab, -1, train=False)
test_chap_df = _chapter_frame(records, title_map)

# Combine both frames and label every book with a subject name
full_chap_df = pd.concat([train_chap_df, test_chap_df])
full_chap_df['subject'] = full_chap_df['chapters'].apply(_book_subject)

Training examples exhausted
Training examples exhausted


In [6]:
# Number of books per subject in the chapter-level dataset
chap_subject_counts = full_chap_df['subject'].value_counts()
chap_subject_counts

Science fiction                  1186
Adventure stories                 595
Love stories                      508
Detective and mystery stories     485
Historical fiction                410
Western stories                   393
Name: subject, dtype: int64

In [56]:
def _body_lengths(book_chapters):
    """Return len(body) for every chapter in one book's chapter mapping."""
    return [len(chapter.body) for chapter in book_chapters.values()]

# Per-book statistics over chapter body lengths: mean, shortest and total
book_chapters_col = full_chap_df['chapters']
full_chap_df['avg-len'] = book_chapters_col.apply(
    lambda book: sum(_body_lengths(book)) / len(book.values()))
full_chap_df['min-len'] = book_chapters_col.apply(
    lambda book: min(_body_lengths(book)))
full_chap_df['body-len'] = book_chapters_col.apply(
    lambda book: sum(_body_lengths(book)))

# Each entry is printed as "<label> <stats>", with a blank line between entries
by_subject = full_chap_df.groupby(['subject'])
report = [
    ("Body lengths:", full_chap_df['body-len'].describe()),
    ("Body lengths by subject:", by_subject['body-len'].mean()),
    ("Avg chapter lengths:", full_chap_df['avg-len'].describe()),
    ("Min chapter lengths:", full_chap_df['min-len'].describe()),
    ("Mean by subject:", by_subject['avg-len'].mean()),
    ("Median by subject:", by_subject['avg-len'].median()),
    ("Min by subject (avg):", by_subject['avg-len'].min()),
    ("Max by subject (avg):", by_subject['avg-len'].max()),
]
for position, (label, stats) in enumerate(report):
    if position > 0:
        print()
    print(label, stats)

Body lengths: count       3577.000000
mean      379101.504054
std       389025.260266
min        11294.000000
25%       115167.000000
50%       257761.000000
75%       519813.000000
max      7699632.000000
Name: body-len, dtype: float64

Body lengths by subject: subject
Adventure stories                459556
Detective and mystery stories    475883
Historical fiction               558343
Love stories                     480265
Science fiction                  165874
Western stories                  463569
Name: body-len, dtype: int64

Avg chapter lengths: count      3577.000000
mean      23694.621157
std       16685.905174
min        3199.603433
25%       12296.695652
50%       18795.475000
75%       29311.333333
max      169558.000000
Name: avg-len, dtype: float64

Min chapter lengths: count      3577.000000
mean       8567.315348
std       11269.122934
min         158.000000
25%        5049.000000
50%        5995.000000
75%        7594.000000
max      169558.000000
Name: min-len, dty

In [51]:
# Number of chapter records per book, summarized overall and then per subject
chapter_counts = full_chap_df['chapters'].apply(len)
full_chap_df['chapter-count'] = chapter_counts
print(full_chap_df['chapter-count'].describe())
full_chap_df.groupby(['subject'])['chapter-count'].describe()

count    3577.000000
mean       19.379368
std        26.805394
min         2.000000
25%         6.000000
50%        15.000000
75%        28.000000
max      1107.000000
Name: chapter-count, dtype: float64


subject                             
Adventure stories              count     595.000000
                               mean       24.489076
                               std        17.841364
                               min         2.000000
                               25%        13.000000
                               50%        23.000000
                               75%        32.000000
                               max       138.000000
Detective and mystery stories  count     485.000000
                               mean       25.092784
                               std        51.087974
                               min         2.000000
                               25%        13.000000
                               50%        23.000000
                               75%        32.000000
                               max      1107.000000
Historical fiction             count     410.000000
                               mean       27.797561
                           