## Code for Analyzing Anthology Metadata

In [1]:
##########
# Imports
##########
from __future__ import division
import os
import codecs
import re
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
import sys
import math

In [2]:
#read in the data to do some manipulation
df = pd.read_excel(r'C:\Users\Hoyt\Dropbox\CodeDataForBook\Chapter2\Data\CompleteMetadata_Ver3.xlsx', sheetname='Sheet1',)
df.shape

(596979, 13)

# Some Pre-Processing Steps

In [3]:
#need to convert any integers in the "main_title" column to strings
df['main_title'] = df['main_title'].map(lambda x: str(x) if type(x) == int else x)

In [38]:
#functions to help convert years to integers
#need to run this after you load the data
def strip_non_ascii(string):
    ''' Returns the string without non ASCII characters'''
    stripped = (c for c in string if 0 < ord(c) < 127)
    return ''.join(stripped)

def num_converter(number):
    #num = re.sub(ur'\u5b9d\u6587\u9928\u51fa\u7248', '', num)
    if pd.isnull(number):
        return 0
    num = strip_non_ascii(number)
    num = re.sub(r'p.*', '', num)
    if num != '':
        num = (int(num.encode('utf-8')))
        return num
    else:
        return 0

#convert years to integers
df['year'] = df['year'].map(lambda x: num_converter(x))
df['year'][0]

1987

In [4]:
#sanity check to make sure we converted to integers
df[df['year']==1948].shape

(219, 13)

In [40]:
#get counts for author_type
df.groupby('author_type').size().sort_values(ascending=False)

author_type
Poet           293715
Writer         192421
Critic          53004
Unknown         29915
Philosopher     16460
Dramatist        8568
Painter          2745
Translator        151
dtype: int64

In [25]:
#eliminate the "unknown" entries, since the zenshu_type label is unreliable here
#df = df[df['author_type']!='Unknown']
#get counts for zenshu_type
df.groupby('genre').size().sort_values(ascending=False)

genre
Fiction    189967
dtype: int64

# Get Counts by Zenshu Type

In [4]:
#calculate the total number of unique zenshu
df['title_pub'] = df.zenshu_short_title + df.publisher
print(df.groupby('title_pub').size().sort_values(ascending=False).shape[0])
#There are 1,168 unique zenshu
author_df = df[df['zenshu_type']=='Author']
general_df = df[df['zenshu_type']=='General']
print(author_df.groupby('title_pub').size().sort_values(ascending=False).shape[0])
print(general_df.groupby('title_pub').size().sort_values(ascending=False).shape[0])

1255
1022
234


# Filter on "Fiction" and "General" anthologies

In [5]:
#filter on the "fiction" category
all_fiction = df[df['genre']=='Fiction']
all_fiction['title_pub'] = all_fiction.zenshu_short_title + all_fiction.publisher

#replace NaNs with an empty string and create new column that merges title and main_title
#all_fiction = all_fiction.replace(np.nan, '', regex=True)
#all_fiction['full_title'] = all_fiction.title + all_fiction.main_title

all_fiction.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


(189765, 14)

In [6]:
#now filter on the zenshu_type, to exclude author zenshu
gen_fiction = all_fiction[all_fiction['zenshu_type']=='General']
gen_fiction.shape

(38358, 14)

In [7]:
kojin_fiction = all_fiction[all_fiction['zenshu_type']=='Author']
kojin_fiction.shape

(151407, 14)

# Code for working with all data

In [8]:
#get counts for zenshu_type
all_fiction.groupby('zenshu_type').size().sort_values(ascending=False)

zenshu_type
Author     151407
General     38358
dtype: int64

In [10]:
#select by years to get frequency counts for a specific period
some_fiction = gen_fiction[(gen_fiction['year'] >= 1920) & (gen_fiction['year'] < 1960)]

#examine the top titles for all years
top_titles = some_fiction.groupby(['author', 'title']).size().sort_values(ascending=False)
top_titles = pd.DataFrame(top_titles)
top_titles = top_titles.reset_index()
top_titles.columns = ['author', 'title', 'count']
top_titles[0:20]
#top_titles.shape

Unnamed: 0,author,title,count
0,川端康成,伊豆の踊子,10
1,菊池寛,恩讐の彼方に,10
2,菊池寛,忠直卿行状記,10
3,島崎藤村,嵐,9
4,芥川龍之介,奉教人の死,9
5,芥川龍之介,地獄変,9
6,芥川龍之介,きりしとほろ上人伝,8
7,芥川龍之介,トロッコ,8
8,志賀直哉,和解,8
9,芥川龍之介,秋,8


In [12]:
#examine the top authors for select years
top_auth = gen_fiction.groupby(['author']).size().sort_values(ascending=False)
top_auth = pd.DataFrame(top_auth)
top_auth = top_auth.reset_index()
top_auth.columns = ['author', 'count']
top_auth[0:20]

Unnamed: 0,author,count
0,芥川龍之介,1283
1,志賀直哉,838
2,森鴎外,821
3,国木田独歩,694
4,太宰治,658
5,川端康成,587
6,井伏鱒二,522
7,島崎藤村,483
8,堀辰雄,473
9,井上靖,443


# Output Results

In [16]:
import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'TopAuthorMetrics.xlsx', engine='xlsxwriter')
top_auth.to_excel(writer, sheet_name='Sheet1')
writer.save()

# Calculate Alternate Measures for Titles

In [None]:
top_titles = gen_fiction.groupby(['author', 'title']).size().sort_values(ascending=False)
top_titles = pd.DataFrame(top_titles)
top_titles = top_titles.reset_index()
top_titles.columns = ['author', 'title', 'count']

#normalize the count of titles by the number of years since first publication
#do this for the top 3000 titles
normalized = []
intensity = []
first_pubs = []
last_pubs = []

for i in top_titles.index[0:3000]:
    #select on title and author to avoid problem of duplicate titles by different authors
    first_pub = int(gen_fiction[(gen_fiction['author']==top_titles.ix[i]['author']) & (gen_fiction['title']==top_titles.ix[i]['title'])].year.min())
    last_pub = int(gen_fiction[(gen_fiction['author']==top_titles.ix[i]['author']) & (gen_fiction['title']==top_titles.ix[i]['title'])].year.max())
    if last_pub - first_pub == 0:
        normalized.append(0)
        intensity.append(0)
    else:
        normalized_ratio = top_titles.ix[i]['count'] / (2004 - first_pub)
        intensity_ratio = top_titles.ix[i]['count'] / (last_pub - first_pub) 
        
        normalized.append(normalized_ratio)
        intensity.append(intensity_ratio)
        first_pubs.append(first_pub)
        last_pubs.append(last_pub)

top_3K = top_titles[0:3000]
top_3K['normalized'] = pd.Series(normalized).values
top_3K['intensity'] = pd.Series(intensity).values
top_3K['first_publication'] = pd.Series(first_pubs).values
top_3K['last_publication'] = pd.Series(last_pubs).values
#top_3K = top_3K.sort_values(by='normalized', ascending=False) 
#top_3K[0:50]

In [10]:
top_3K.sort_values(by='normalized', ascending=False)[0:20]

Unnamed: 0,author,title,count,normalized,intensity,first_publication,last_publication
3,大岡昇平,野火,29,0.568627,0.74359,1953,1992
11,大岡昇平,俘虜記,27,0.509434,0.818182,1951,1984
21,永井荷風,[zA272]東綺譚,25,0.480769,0.595238,1952,1994
60,吉行淳之介,驟雨,22,0.478261,0.594595,1958,1995
38,梅崎春生,桜島,23,0.46,0.575,1954,1994
82,安岡章太郎,ガラスの靴,20,0.454545,0.571429,1960,1995
29,川端康成,雪国,24,0.436364,0.648649,1949,1986
61,志賀直哉,灰色の月,22,0.431373,0.468085,1953,2000
32,堀辰雄,風立ちぬ,24,0.428571,0.545455,1948,1992
133,安岡章太郎,海辺の光景,18,0.428571,0.72,1962,1987


# Calculate Author Metrics

In [None]:
#calculate h_index and m_index for top 200 most prolific authors
h_indexes = []
m_indexes = []

#get the names of top 250 authors
top_250_list = top_auth.author[0:250].tolist()
top_250_df = top_auth[0:250]

for author in top_250_list:
    #create author dataframe
    one_auth = gen_fiction[gen_fiction['author']==author]
    #get unique titles sorted in decreasing order of number of times published
    titles = one_auth.groupby(['author', 'title']).size().sort_values(ascending=False)
    titles = pd.DataFrame(titles)
    titles = titles.reset_index()
    titles.columns = ['author', 'title', 'count']

    #calculate number of years between first publication and 2004 (publication of this dataset)
    n = 2004 - int(one_auth.year.min())

    #calculate h-index and m-index
    h_index = 0
    for i in titles.index:
        if titles.ix[i]['count'] >= i:
            h_index = i
        else:
            break

    #for authors with an h_index less than 1, assign 0
    if h_index == 1:
        m_index = 0
    else:
        m_index = h_index / n       
    
    #store in list
    h_indexes.append(h_index)
    m_indexes.append(m_index)
    
#merge these indexes with the top_250 dataframe and sort
top_250_df['h_index'] = pd.Series(h_indexes).values
top_250_df['m_index'] = pd.Series(m_indexes).values

In [15]:
top_250_df.sort_values(by='h_index', ascending=False)[0:50]

Unnamed: 0,author,count,h_index,m_index
0,芥川龍之介,1283,21,0.265823
1,志賀直哉,838,16,0.205128
3,国木田独歩,694,15,0.194805
2,森鴎外,821,14,0.184211
20,梶井基次郎,298,13,0.209677
4,太宰治,658,11,0.2
6,井伏鱒二,522,11,0.152778
8,堀辰雄,473,11,0.152778
12,正宗白鳥,403,10,0.126582
15,谷崎潤一郎,362,10,0.128205


In [54]:
import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'TopAuthors.xlsx', engine='xlsxwriter')
top_250_df.to_excel(writer, sheet_name='Sheet1')
writer.save()

In [21]:
#get a list of top writers by year from 1925 to 2004
top_authors = {'year': [], 'author': [], 'no. of titles': [], 'all titles':[], 'percent of all titles': []}

i = 1950
while i != 2004:
    total = gen_fiction[gen_fiction['year']== i].shape[0]
    top = gen_fiction[gen_fiction['year']== i].groupby(['author']).size().sort_values(ascending=False)
    #print i, top.index[0], '%02.2f' % (top[0]/total)
    top_authors['year'].append(i)
    top_authors['author'].append(top.index[0])
    top_authors['no. of titles'].append(top[0])
    top_authors['percent of all titles'].append(str(int((100*(top[0]/total))))+'%')
    top_authors['all titles'].append(total)
    
    if i == 1943:
        i = 1946
    else:
        i+=1
        
top_df = DataFrame(top_authors, columns=['year', 'author', 'no. of titles', 'all titles', 'percent of all titles'])
top_df

Unnamed: 0,year,author,no. of titles,all titles,percent of all titles
0,1950,野村胡堂,36,395,9%
1,1951,志賀直哉,15,147,10%
2,1952,幸田露伴,13,236,5%
3,1953,芥川龍之介,177,976,18%
4,1954,志賀直哉,41,633,6%
5,1955,武田麟太郎,32,574,5%
6,1956,国木田独歩,45,413,10%
7,1957,徳田秋声,21,374,5%
8,1958,久保田万太郎,91,573,15%
9,1959,上林暁,47,482,9%


In [132]:
import xlsxwriter
import openpyxl
writer = pd.ExcelWriter(r'TopAuthorsByYear.xlsx', engine='xlsxwriter')
top_df.to_excel(writer, sheet_name='Sheet1')
writer.save()