In [142]:
import pandas as pd
import numpy as np
import math
import time

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import cm
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, CategoricalColorMapper, Legend
from bokeh.transform import factor_cmap

In [2]:
df = pd.read_csv('../data/processed/1965-1984_10-14.tsv', sep='\t', index_col=0)
df

Unnamed: 0,Choice,AB Bookman's Weekly,Publishers Weekly,Esquire,Booklist,Journal of Aesthetics and Art Criticism,International Philosophical Quarterly,Journal of Marketing,Harvard Law Review,Journal of Business Education,...,Computers and the Humanities,American Arts,Essays on Canadian Writing`,Performing Arts Review,"Journal of Arts Management, Law, and Society","Studio International, Review",Journal of Black Studies,Book: the Magazine for Reading Life,Lone Star Review,Aspen Journal of the Arts
"A, Brandon",1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"AAGAARD-MOGENSEN, Lars",1,0,0,0,0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"AAKER, David A",1,0,0,0,1,0,0,4,1,1,...,0,0,0,0,0,0,0,0,0,0
"AALTO, Alvar",3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"AARDEMA, Verna",1,0,7,0,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"JOHN, Martha T",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"KERSCHNER, Paul",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"MYERS, Phyllis",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"REGNIER, Victor",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# restrict to authors who were reviewed at least n times (optional)
auth_min = 25
df = df[df.sum(axis=1) >= auth_min]
df = df[df.columns[df.sum() >= 25]] # drop poorly-represented journals
df.shape

(7440, 358)

In [10]:
# this cell can be used to see the explained variance ratio with different values for n_components
for n in range(5, 50, 5):
    svd = TruncatedSVD(n_components=n)
    comps = svd.fit_transform(df)
    exp_var = sum(svd.explained_variance_ratio_)
    print(f'Components: {n}. Explained variance: {exp_var}')

Components: 5. Explained variance: 0.6636356161471071
Components: 10. Explained variance: 0.7617180430355688
Components: 15. Explained variance: 0.8154568685011312
Components: 20. Explained variance: 0.845805552530671
Components: 25. Explained variance: 0.8656561683034554
Components: 30. Explained variance: 0.8814342092007499
Components: 35. Explained variance: 0.8942488312091049
Components: 40. Explained variance: 0.9049909771605432
Components: 45. Explained variance: 0.9142813598914507


In [13]:
n=50
svd = TruncatedSVD(n_components=n)
comps = svd.fit_transform(df)
exp_var = sum(svd.explained_variance_ratio_)
print(f'Components: {n}. Explained variance: {exp_var}')

Components: 50. Explained variance: 0.9221525260004488


In [14]:
# most important features for each principal component
n_comps = svd.components_.shape[0]
most_imp = [svd.components_[i].argsort()[::-1][:10] for i in range(n_comps)]

for i, component in enumerate(most_imp):
    print(f'top features for component# {i}')
    for feature in component:
        print(df.columns[feature])
        print(svd.components_[i][feature])
    print('\n\n')

top features for component# 0
Kirkus Reviews
0.447055271520129
Booklist
0.42202438841967005
Publishers Weekly
0.3986708534629233
Library Journal
0.3706124524144477
New York Times Book Review
0.27206230533289255
Times Literary Supplement
0.19482250642163362
Best Sellers
0.16651051225314015
Book World
0.14724714149574206
Observer (London)
0.14627357132535806
School Library Journal
0.13914697047445793



top features for component# 1
School Library Journal
0.33446080983499044
Booklist
0.30904765895885217
Center for Children's Books, Bulletin
0.2643098381909327
Kirkus Reviews
0.24455810055609892
Horn Book Magazine
0.15293866114371318
Science Books and Films
0.1332085627882348
Children's Book Review Service
0.08398426042408928
Childhood Education
0.07547398653823205
Reading Teacher
0.06594108360849729
Catholic Library World
0.0623880669881921



top features for component# 2
Publishers Weekly
0.360838693855365
Best Sellers
0.2790541149043768
New York Times Book Review
0.23961165689482836
Li

In [33]:
# it is useful to introduce a weighting scheme, since some magazines just publish so many more reviews than authors

# I'll use TF-IDF, where IDF is instead: log(total authors / authors reviewed by this magazine)
docs = df.shape[0]
idfs = [math.log(docs / np.where(df[col] == 0, 0, 1).sum()) for col in df.columns]
tfidf = df * idfs
tfidf

Unnamed: 0,Choice,AB Bookman's Weekly,Publishers Weekly,Esquire,Booklist,Journal of Aesthetics and Art Criticism,International Philosophical Quarterly,Harvard Law Review,Journal of Home Economics,Social Education,...,Foreign Affairs,Thought,"Clio: a Journal of Literature, History, and the Philosophy of History",Political Science Reviewer,Mankind,Black Scholar,Social Research,Religious Studies,Theological Studies,Threepenny Review
"AARDEMA, Verna",0.377826,0.0,1.453476,0.0,1.604654,0.0,0.0,0.0,0.0,5.268461,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"AARON, Daniel",0.755653,0.0,0.415279,0.0,0.160465,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"AARON, Henry J",4.156090,0.0,0.000000,0.0,0.320931,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"ABBEY, Edward",0.377826,0.0,2.491672,0.0,1.123258,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"ABBOTT, Carl",1.133479,0.0,0.000000,0.0,0.160465,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"TRIPP, Miles",0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"WATKINS-PITCHFORD, Denys James",0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"DIXON, Franklin W",0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"MONCURE, Jane B",0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# With these values, we can see an author's review profile when weighted by the 
# total number of reviews published by a magazine

# This mitigates the problem of everybody's "top" journal being Booklist, Kirkus, or Publisher's Weekly

# here, for example, is Ursula Le Guin
tfidf.loc['LE GUIN, Ursula K'].sort_values(ascending=False)[:10]

Magazine of Fantasy and Science Fiction    22.779794
English Journal                            16.851439
Analog Science Fiction and Fact            16.250383
Voice of Youth Advocates                   13.294380
Emergency Librarian                        11.407915
Horn Book Magazine                          9.493542
Observer (London)                           9.383036
Book World                                  9.363350
New Age Journal                             8.679830
Book Report                                 7.499680
Name: LE GUIN, Ursula K, dtype: float64

In [98]:
# or Toni Morrison
tfidf.loc['MORRISON, Toni'].sort_values(ascending=False)[:10]

Black Scholar          32.883834
Critique               18.899886
Ms.                    12.951489
American Literature     9.467173
Black World             9.332262
Newsweek                8.309319
Hudson Review           7.039645
Progressive             5.291060
Nation                  5.189991
Yale Review             4.966590
Name: MORRISON, Toni, dtype: float64

In [101]:
# or Thomas Merton, a famous American Christian monk
# side note: this is how I discovered that "America" is a Jesuit journal
tfidf.loc['MERTON, Thomas'].sort_values(ascending=False)[:10]

Christian Century           31.385226
America                     26.435117
Review for Religious        22.994971
Critic                      22.335174
Religious Studies Review    18.840447
Sewanee Review              12.647159
Commonweal                  11.568903
Catholic Library World       8.416696
American Book Review         7.388541
Best Sellers                 7.002259
Name: MERTON, Thomas, dtype: float64

In [103]:
# Eugene Kennedy, a psychologist and Catholic priest
tfidf.loc['KENNEDY, Eugene'].sort_values(ascending=False)[:10]

America                    37.009164
Review for Religious       31.618086
Christian Century          26.677442
Commonweal                 11.568903
Critic                      9.926744
Best Sellers                8.169302
Catholic Library World      6.733356
Educational Leadership      4.078344
New Catholic World          3.994645
Contemporary Psychology     3.631422
Name: KENNEDY, Eugene, dtype: float64

In [106]:
# we can see the more "elite mainstream" profile of somebody like John Updike
tfidf.loc['UPDIKE, John'].sort_values(ascending=False)[:10]

America                     29.078628
Newsweek                    21.604228
Hudson Review               21.118934
New York Review of Books    20.681935
National Review             20.633266
Time                        20.401461
Atlantic Monthly            19.861204
Commonweal                  18.799467
New Republic                18.113006
Saturday Review             17.873968
Name: UPDIKE, John, dtype: float64

In [108]:
# you can even see what might be called the leftist magazine sphere
tfidf.loc['ZINN, Howard'].sort_values(ascending=False)[:10]

Science and Society            11.050553
Negro Digest                    8.521332
Social Education                7.902691
Dissent                         7.593265
Nation                          6.919988
Partisan Review                 6.448533
Progressive                     5.291060
Journal of American History     5.145009
Saturday Review                 4.468492
Commonweal                      4.338339
Name: ZINN, Howard, dtype: float64

In [110]:
# social historian Lerone Bennett has a prominent place in Black journals
tfidf.loc['BENNETT, Lerone, Jr.'].sort_values(ascending=False)[:10]

Negro Digest                   17.042663
Black World                     9.332262
Black Scholar                   5.480639
Christian Century               4.707784
Saturday Review                 3.574794
Quarterly Journal of Speech     3.283414
Social Studies                  3.106484
Top of the News                 2.853169
Journal of American History     2.572505
Critic                          2.481686
Name: BENNETT, Lerone, Jr., dtype: float64

In [112]:
# British children's poet Charles Causley has an interesting place; note that he was known for blurring line between
# children's and adult poetry
tfidf.loc['CAUSLEY, Charles'].sort_values(ascending=False)[:10]

Junior Bookshelf                         36.521990
Growing Point                            14.189852
Times Educational Supplement              7.782162
New Statesman                             6.527859
School Librarian                          5.743987
Books & Bookmen                           5.287753
Horn Book Magazine                        4.746771
Observer (London)                         4.691518
Center for Children's Books, Bulletin     4.543850
Listener                                  3.721420
Name: CAUSLEY, Charles, dtype: float64

In [124]:
# The natural next step is to want to see a visualization of the entire space in which authors that have high TF-IDF
# scores in the same journals are grouped together

# Before doing that, I'm going to create a simple dictionary that associates each author with their top 5 journals.
# This will be included as a tooltip for that author visible when mousing over their point in the visualization.
# This is useful just because I have no idea who most of these people are. 

author_dict = {}
for author in tfidf.index:
    top_10 = tfidf.loc[author].sort_values(ascending=False)[:5]
    author_dict[author] = top_10.index
author_dict

{'AARDEMA, Verna': Index(['Center for Children's Books, Bulletin', 'School Library Journal',
        'Childhood Education', 'Language Arts', 'Catholic Library World'],
       dtype='object'),
 'AARON, Daniel': Index(['Journal of American History',
        'Journal of Library History, Philosophy, and Comparative Libarianship',
        'Science and Society', 'Journal of Southern History',
        'American Historical Review'],
       dtype='object'),
 'AARON, Henry J': Index(['Journal of Economic Literature', 'Perspective',
        'Political Science Quarterly', 'Monthly Labor Review',
        'Wall Street Review of Books'],
       dtype='object'),
 'ABBEY, Edward': Index(['Living Wilderness', 'National Parks', 'Southwest Review',
        'English Journal', 'Audubon'],
       dtype='object'),
 'ABBOTT, Carl': Index(['Western Historical Quarterly', 'Journal of American History',
        'Business History Review', 'Pacific Historical Review',
        'Wall Street Review of Books'],
       

In [153]:
# to add a little color to our plot, I'm just going to arbitrarily assign colors to journal clusters discovered from 
# the hierarchical cluster analysis


def single_true(iterable):
    i = iter(iterable)
    return any(i) and not any(i)

clusters = {
    'sf' : ['Science Fiction Review', 
            'Analog Science Fiction and Fact', 
            'Magazine of Fantasy and Science Fiction',
            'Fantasy Review'],
    'children' : ['Reading Teacher',
                 'Language Arts',
                 'School Library Journal',
                 'Horn Book Magazine'],
    'science' : ['Science', 
                 'Sky and Telescope', 
                 'Nature', 
                 'Scientific American'],
    'christian' : ['America', 
                   'Christian Century',
                   'Review for Religious',
                   'Critic'],
    'british' : ['Observer (London)',
                'New Statesman',
                'Listener',
                'Spectator',
                'Times Literary Supplement',
                'Guardian Weekly'],
    'wide coverage' : ['Kirkus Reviews',
                      'Publishers Weekly',
                      'Library Journal',
                      'Booklist',
                      'New York Times Book Review'],
    'history' : ['Reviews in American History',
                'Journal of American History',
                'American Historical Review',
                'Historian'],
    'poetry' : ['Parnassus: Poetry in Review',
               'Poetry',
               'North American Review',
               'American Poetry Review']
    
}
author_cluster_list = []
for author in tfidf.index:
    top_score = author_dict[author][0]
    found_cluster = 'Other'
    for cluster in clusters:
        if top_score in clusters[cluster]:
            found_cluster = cluster
    author_cluster_list.append(found_cluster)

In [155]:
color_map = {
    'Other' : 'grey',
    'sf' : 'darkmagenta',
    'children' : 'blue',
    'science': 'limegreen',
    'wide coverage': 'olive',
    'history': 'gold',
    'british': 'crimson',
    'christian': 'saddlebrown',
    'poetry' : 'orange'
}

In [118]:
# I'm going to visualize with t-SNE
# To do so, it's recommended that you reduce dimensionality first.
# I'll use SVD rather than PCA because the data is sparse.

n=50
svd = TruncatedSVD(n_components=n)
transformed = svd.fit_transform(tfidf)
exp_var = sum(svd.explained_variance_ratio_)
print(f'Components: {n}. Explained variance: {exp_var}')

Components: 50. Explained variance: 0.6475457671405569


In [121]:
# t-SNE on the SVD transformed data
time_start = time.time()
tsne = TSNE(n_components=2, verbose=0, perplexity=40, n_iter=300, random_state=11)
tsne_svd_results = tsne.fit_transform(transformed)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

t-SNE done! Time elapsed: 6.537069082260132 seconds


In [157]:
# Now, we'll make a scatterplot with a mouseover that gives us the name of the author and their top-scoring journals
#output_file("tsne_interactive2.html")

source = ColumnDataSource(data=dict(
    x=tsne_svd_results[:,0],
    y=tsne_svd_results[:,1],
    author=tfidf.index,
    top_scores = [author_dict[author] for author in tfidf.index],
    label=author_cluster_list,
    colors=[color_map[c] for c in author_cluster_list]
    
))
TOOLTIPS = [
    ("(x,y)", "($x, $y)"),
    ("author", "@author"),
    ("top scores", "@top_scores"),
]

p = figure(plot_width=1000, plot_height=800, tooltips=TOOLTIPS, toolbar_location='above',
           title="t-SNE Projection of 7000 Authors in Book Review Space")
p.scatter('x', 
          'y',
          size=7,
          source=source,
          fill_alpha=1,
          fill_color='colors'
)

output_file("tsne_interactive.html", title="t-SNE Projection of 7000 Authors in Book Review Space")

show(p)

INFO:bokeh.io.state:Session output file 'tsne_interactive.html' already exists, will be overwritten.


Take some time to look it over. You will note that the clusters have a high degree of intuitive structure. Just browsing, I even found a cluster of 19th century American authors: Twain, Fennimore Cooper, Melville, etc.

In [150]:
set(author_cluster_list)

{'Other'}

In [40]:
# most important features for each principal component
n_comps = svd.components_.shape[0]
most_imp = [svd.components_[i].argsort()[::-1][:10] for i in range(n_comps)]
[np.abs(model.components_[i]).argmax() for i in range(n_pcs)]

for i, component in enumerate(most_imp):
    print(f'top features for component# {i}')
    for feature in component:
        print(df.columns[feature])
        print(svd.components_[i][feature])
    print('\n\n')

top features for component# 0
Center for Children's Books, Bulletin
0.41204661327871656
School Library Journal
0.4068408600985703
Horn Book Magazine
0.2997735062125521
Science Books and Films
0.22923591873719926
Children's Book Review Service
0.1929334375254476
Junior Bookshelf
0.1733755783802396
Reading Teacher
0.171425491299126
Childhood Education
0.16959870500739593
Growing Point
0.14901364869334954
Observer (London)
0.13270672395314145



top features for component# 1
Observer (London)
0.24923784800878548
New Statesman
0.19129615551568901
Listener
0.1906587887255814
Books & Bookmen
0.189453387445913
Spectator
0.1798524071303056
Guardian Weekly
0.17780318563780248
New York Review of Books
0.148616695663173
America
0.14649162190769546
Time
0.14375989574033246
Newsweek
0.13627715889046219



top features for component# 2
Science Books and Films
0.6382083103165558
Appraisal: Children's Science Books
0.2691615926303257
Sky and Telescope
0.20113853630025202
Appraisal: Science Books for Y

In [50]:
# let's try a topic model
from sklearn.decomposition import LatentDirichletAllocation
from lda import LDA

In [65]:
lda = LDA(n_topics=20, random_state=99, n_iter=1000)
lda.fit(df.to_numpy())

INFO:lda:n_documents: 7440
INFO:lda:vocab_size: 358
INFO:lda:n_words: 419557
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -3373556
INFO:lda:<10> log likelihood: -2134913
INFO:lda:<20> log likelihood: -1991415
INFO:lda:<30> log likelihood: -1948789
INFO:lda:<40> log likelihood: -1928380
INFO:lda:<50> log likelihood: -1920153
INFO:lda:<60> log likelihood: -1913032
INFO:lda:<70> log likelihood: -1909436
INFO:lda:<80> log likelihood: -1906343
INFO:lda:<90> log likelihood: -1905592
INFO:lda:<100> log likelihood: -1903186
INFO:lda:<110> log likelihood: -1901420
INFO:lda:<120> log likelihood: -1904286
INFO:lda:<130> log likelihood: -1905054
INFO:lda:<140> log likelihood: -1902394
INFO:lda:<150> log likelihood: -1901460
INFO:lda:<160> log likelihood: -1900258
INFO:lda:<170> log likelihood: -1899551
INFO:lda:<180> log likelihood: -1899144
INFO:lda:<190> log likelihood: -1901167
INFO:lda:<200> log likelihood: -1900569
INFO:lda:<210> log likelihood: -1898946
INFO:lda:<

<lda.lda.LDA at 0x7f191f131e50>

In [61]:
df.to_numpy().shape

(7440, 358)

In [44]:
# Function that displays the top features per topic
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" | ".join([feature_names[i]
                        for i in topic.argsort()[:-num_top_words - 1:-1]]))
        print("")

In [66]:
num_top_features = 10
display_topics(lda, df.columns, num_top_features)

Topic 0:
Library Journal | Booklist | Publishers Weekly | Kirkus Reviews | Choice | New York Times Book Review | Science Books and Films | Best Sellers | Christian Science Monitor | Book World

Topic 1:
Choice | Times Literary Supplement | Library Journal | Burlington Magazine | Apollo | Classical World | Geographical Journal | Booklist | Books & Bookmen | British Book News

Topic 2:
Times Literary Supplement | Observer (London) | New Statesman | Books & Bookmen | Spectator | Listener | Guardian Weekly | British Book News | Economist | Punch

Topic 3:
New York Times Book Review | Kirkus Reviews | Publishers Weekly | Booklist | Choice | Library Journal | Book World | National Review | America | New Republic

Topic 4:
Times Literary Supplement | Choice | American Historical Review | Economist | English Historical Review | History Today | Journal of Modern History | Library Journal | British Book News | New Statesman

Topic 5:
Christian Century | America | Choice | Commonweal | Library Jo