In [1]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
import pandas as pd

import os
import sys
# Resolve the directory three levels above the current working directory and
# put it on sys.path so the `ai_lit` imports that follow can be resolved.
ai_lit_path = os.path.abspath(os.path.join(os.getcwd(), *([os.pardir] * 3)))
print("Loading AI Lit system from path", ai_lit_path)
sys.path.append(ai_lit_path)
from ai_lit.input import input_util
from ai_lit.input.gutenberg_dataset import gb_input as gb
from ai_lit.input.gutenberg_dataset import gb_dataset_util

# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60) 

# include matplot inline in the workbook
%matplotlib inline
matplotlib.style.use('ggplot')

Loading AI Lit system from path C:\Users\joewo\workspaces\python\ai_lit




In [2]:
# Load the inputs from the `workspace/gb_input` directory under ai_lit_path.
# gb.get_inputs returns four values, bound here to the train set, the test
# set, `sbjs` and `vocab` (presumably the subjects and the vocabulary).
gb_workspace_dir = os.path.join(ai_lit_path, 'workspace', 'gb_input')
# NOTE(review): 5000 looks like a vocabulary size cap — confirm against gb.get_inputs.
train, test, sbjs, vocab = gb.get_inputs(gb_workspace_dir, 5000)

In [3]:
# Report how many records ended up in each split.
train_size, test_size = len(train), len(test)
print("Train set length:", train_size)
print("Test set length:", test_size)

Train set length: 2528
Test set length: 1049


In [4]:
def _records_to_frame(records, fields):
    """Build a DataFrame with one row per record and one column per attribute name in `fields`."""
    rows = [[getattr(record, field) for field in fields] for record in records]
    return pd.DataFrame(rows, columns = fields)


# The column set is taken from the instance attributes of the first training
# record; the same attribute names are then read from every train/test record.
variables = train[0].__dict__.keys()
train_df = _records_to_frame(train, variables)
test_df = _records_to_frame(test, variables)

In [5]:
def _count_unknown(frame, column):
    """Return the number of rows in `frame` whose `column` equals the string 'unknown'."""
    return len(frame.loc[frame[column] == 'unknown'])


# Count records whose author or title is the literal 'unknown' in each split.
print("[Train] Number of unknown authors:", _count_unknown(train_df, 'author'))
print("[Train] Number of unknown titles:", _count_unknown(train_df, 'title'))
print("[Test] Number of unknown authors:", _count_unknown(test_df, 'author'))
print("[Test] Number of unknown titles:", _count_unknown(test_df, 'title'))

[Train] Number of unknown authors: 13
[Train] Number of unknown titles: 0
[Test] Number of unknown authors: 1
[Test] Number of unknown titles: 0


In [6]:
def _count_chapters(body):
    """Return the number of items input_util.parse_chapters yields for `body`."""
    return len(input_util.parse_chapters(body))


# Add a per-book chapter count column to both splits.
train_df['chapter_count'] = train_df['body'].apply(_count_chapters)
test_df['chapter_count'] = test_df['body'].apply(_count_chapters)

In [None]:
def _count_tokens(body):
    """Return the number of tokens nltk.word_tokenize produces for `body`."""
    return len(nltk.word_tokenize(body))


# Add a per-book token count column to both splits. This tokenizes every
# full body, so it is likely the slowest cell in the notebook.
train_df['length'] = train_df['body'].apply(_count_tokens)
test_df['length'] = test_df['body'].apply(_count_tokens)

In [7]:
# The six subject labels passed to gb_dataset_util.get_atomic_subject.
subjects = [
    "Science fiction",
    "Adventure stories",
    "Historical fiction",
    "Love stories",
    "Detective and mystery stories",
    "Western stories",
]


def _atomic_subject(book_subjects):
    """Apply gb_dataset_util.get_atomic_subject to one book's `subjects` value.

    NOTE(review): presumably this picks the single label from `subjects`
    that applies to the book — confirm against gb_dataset_util.
    """
    return gb_dataset_util.get_atomic_subject(book_subjects, subjects)


train_df['atomic_subject'] = train_df['subjects'].apply(_atomic_subject)
test_df['atomic_subject'] = test_df['subjects'].apply(_atomic_subject)

In [8]:
# Stack the train and test frames into one frame for whole-corpus statistics.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each frame keeps its own index labels, so labels would repeat across the
# two halves and label-based lookups would return more than one row.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [9]:
# Number of books per atomic subject across train and test combined.
(
    full_df['atomic_subject']
    .value_counts()
)

Science fiction                  1186
Adventure stories                 595
Love stories                      508
Detective and mystery stories     485
Historical fiction                410
Western stories                   393
Name: atomic_subject, dtype: int64

In [10]:
# Mean chapter count per atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['chapter_count']
    .mean()
)

atomic_subject
Adventure stories                27.341176
Detective and mystery stories    27.115464
Historical fiction               30.468293
Love stories                     26.055118
Science fiction                   8.500843
Western stories                  24.193384
Name: chapter_count, dtype: float64

In [11]:
# Largest chapter count seen in each atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['chapter_count']
    .max()
)

atomic_subject
Adventure stories                 165
Detective and mystery stories    1484
Historical fiction                595
Love stories                      242
Science fiction                   104
Western stories                    93
Name: chapter_count, dtype: int64

In [12]:
# Smallest chapter count seen in each atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['chapter_count']
    .min()
)

atomic_subject
Adventure stories                2
Detective and mystery stories    2
Historical fiction               2
Love stories                     2
Science fiction                  2
Western stories                  2
Name: chapter_count, dtype: int64

In [13]:
# Standard deviation of the chapter count within each atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['chapter_count']
    .std()
)

atomic_subject
Adventure stories                20.668693
Detective and mystery stories    68.028852
Historical fiction               36.925819
Love stories                     22.906054
Science fiction                   9.408610
Western stories                  15.767710
Name: chapter_count, dtype: float64

In [14]:
# Fraction of books in each atomic subject whose chapter count is exactly 2.
two_chapter_books = full_df.loc[full_df['chapter_count'] == 2, 'atomic_subject'].value_counts()
books_per_subject = full_df['atomic_subject'].value_counts()
# A subject with no two-chapter books is absent from the numerator; with
# fill_value=0 it is reported as 0.0 instead of NaN after index alignment.
two_chapter_books.div(books_per_subject, fill_value=0)

Adventure stories                0.058824
Detective and mystery stories    0.096907
Historical fiction               0.068293
Love stories                     0.088583
Science fiction                  0.107926
Western stories                  0.081425
Name: atomic_subject, dtype: float64

In [None]:
# Mean book length (the `length` column) per atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['length']
    .mean()
)

In [None]:
# Longest book (the `length` column) in each atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['length']
    .max()
)

In [None]:
# Shortest book (the `length` column) in each atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['length']
    .min()
)

In [None]:
# Standard deviation of book length (the `length` column) within each atomic subject.
(
    full_df
    .groupby(['atomic_subject'])['length']
    .std()
)

In [None]:
from ai_lit.input.gutenberg_dataset import gb_chapters_dataset

# Build the chapter-level dataset from the same `workspace/gb_input` directory.
wkspc = os.path.join(ai_lit_path, 'workspace', 'gb_input')

# NOTE(review): `subjects` and `vocab` are rebound here and replace the values
# assigned by earlier cells; re-running those earlier cells afterwards will
# see these new values.
subjects = gb.get_subjects(wkspc)

# Keep the first 5001 entries of the sorted vocabulary.
# NOTE(review): gb.get_inputs is called with 5000 earlier in the notebook —
# confirm that 5001 is intended here.
vocab = input_util.get_sorted_vocab(gb.get_vocabulary(wkspc))[:5001]

# NOTE(review): the meaning of the trailing -1 is not visible from this
# notebook — confirm against gb_chapters_dataset.extract_dataset.
records, title_map = gb_chapters_dataset.extract_dataset(wkspc, len(subjects), vocab, -1)

In [None]:
# One row per key in `records`: the key itself, its entry in title_map, and
# the value stored for that key in `records` (used below as the chapter list).
chapter_rows = [
    [idx, title_map[idx], records[idx]]
    for idx in records
]
chap_df = pd.DataFrame(chapter_rows, columns = ['id', 'title-author', 'chapters'])

In [None]:
def _mean_chapter_length(chapters):
    """Return the mean of len(chapter) over `chapters`.

    Returns NaN when `chapters` is empty, so a book with no chapters does not
    raise ZeroDivisionError and is skipped by the summary statistics below.
    The unit of len(chapter) depends on how the chapters were extracted
    (presumably tokens — confirm against gb_chapters_dataset).
    """
    chapter_total = len(chapters)
    if chapter_total == 0:
        return float('nan')
    return sum(len(chapter) for chapter in chapters) / chapter_total


# Per-book average chapter length, followed by summary statistics over books.
chap_df['avg-len'] = chap_df['chapters'].apply(_mean_chapter_length)
avg_len = chap_df['avg-len']
print("Mean:", avg_len.mean())
print("Median:", avg_len.median())
print("Min (avg):", avg_len.min())
print("Max (avg):", avg_len.max())