In [3]:
import matplotlib
import matplotlib.pyplot as plt
import nltk
import pandas as pd

import sys
sys.path.append("C:/workspaces/python/ai_lit")
from ai_lit.input.gutenberg_dataset import gb_input as gb
from ai_lit.input.gutenberg_dataset import gb_dataset_util

# Make the graphs a bit prettier, and bigger
pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60) 

# include matplot inline in the workbook
%matplotlib inline
matplotlib.style.use('ggplot')

In [4]:
# Load the Gutenberg inputs through gb.get_inputs; the call yields four values,
# unpacked here as the train records, the test records, sbjs and vocab.
# NOTE(review): the directory is an absolute, machine-specific path, so the
# notebook only runs unchanged on a machine with this layout.
# NOTE(review): 5000 is presumably the vocabulary size limit; confirm against
# gb.get_inputs.
train, test, sbjs, vocab = gb.get_inputs('C:/workspaces/python/ai_lit/workspace/gb_input', 5000)

In [5]:
# Report how many records each split contains
for caption, split in (("Train set length:", train), ("Test set length:", test)):
    print(caption, len(split))

Train set length: 2528
Test set length: 1049


In [7]:
def _records_to_frame(records, fields):
    """Build a DataFrame with one row per record and one column per name in `fields`.

    Each cell holds the value of that attribute on that record.
    """
    rows = []
    for record in records:
        rows.append([getattr(record, field) for field in fields])
    return pd.DataFrame(rows, columns=fields)


# The column set is taken from the instance attributes of the first training
# record and reused for both splits.
variables = vars(train[0]).keys()
train_df = _records_to_frame(train, variables)
test_df = _records_to_frame(test, variables)

In [8]:
# Count rows whose author / title equals the literal string 'unknown', per split
unknown_checks = (
    ("[Train] Number of unknown authors:", train_df, 'author'),
    ("[Train] Number of unknown titles:", train_df, 'title'),
    ("[Test] Number of unknown authors:", test_df, 'author'),
    ("[Test] Number of unknown titles:", test_df, 'title'),
)
for caption, frame, column in unknown_checks:
    matching_rows = frame[frame[column] == 'unknown']
    print(caption, len(matching_rows))

[Train] Number of unknown authors: 13
[Train] Number of unknown titles: 0
[Test] Number of unknown authors: 1
[Test] Number of unknown titles: 0


In [10]:
def _token_count(text):
    """Return how many tokens nltk.word_tokenize produces for `text`."""
    return len(nltk.word_tokenize(text))


# Add a 'length' column (token count of 'body') to both splits
for frame in (train_df, test_df):
    frame['length'] = frame['body'].apply(_token_count)

In [13]:
# Candidate labels handed to gb_dataset_util.get_atomic_subject
subjects = [
    "Science fiction",
    "Adventure stories",
    "Historical fiction",
    "Love stories",
    "Detective and mystery stories",
    "Western stories",
]


def _atomic_subject(record_subjects):
    """Map one record's 'subjects' value to its atomic subject via gb_dataset_util."""
    return gb_dataset_util.get_atomic_subject(record_subjects, subjects)


# Add an 'atomic_subject' column to both splits
for frame in (train_df, test_df):
    frame['atomic_subject'] = frame['subjects'].apply(_atomic_subject)

In [16]:
# Stack the train and test frames into one frame for corpus-wide statistics.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each input
# keeps its own default 0-based row labels, so labels repeat in the combined
# frame and label-based lookups (full_df.loc[...]) become ambiguous.
full_df = pd.concat([train_df, test_df], ignore_index=True)

In [20]:
# Number of records per atomic subject across train + test combined
full_df['atomic_subject'].value_counts()

Science fiction                  1186
Adventure stories                 595
Love stories                      508
Detective and mystery stories     485
Historical fiction                410
Western stories                   393
Name: atomic_subject, dtype: int64

In [26]:
# Mean token count ('length') per atomic subject
full_df.groupby('atomic_subject')['length'].mean()

atomic_subject
Adventure stories                 91997.047059
Detective and mystery stories     87762.758763
Historical fiction               133760.565854
Love stories                     111880.927165
Science fiction                   28535.950253
Western stories                   93883.117048
Name: length, dtype: float64

In [27]:
# Largest token count ('length') per atomic subject; useful for spotting outliers
full_df.groupby('atomic_subject')['length'].max()

atomic_subject
Adventure stories                 414568
Detective and mystery stories     254328
Historical fiction               2999088
Love stories                      431133
Science fiction                   237699
Western stories                   232957
Name: length, dtype: int64

In [28]:
# Smallest token count ('length') per atomic subject
full_df.groupby('atomic_subject')['length'].min()

atomic_subject
Adventure stories                4461
Detective and mystery stories    9526
Historical fiction               3478
Love stories                     6940
Science fiction                  3766
Western stories                  9476
Name: length, dtype: int64

In [29]:
# Standard deviation of token count ('length') per atomic subject
full_df.groupby('atomic_subject')['length'].std()

atomic_subject
Adventure stories                 47383.022308
Detective and mystery stories     36513.178101
Historical fiction               161693.891316
Love stories                      67279.125544
Science fiction                   33471.728802
Western stories                   31280.363937
Name: length, dtype: float64