- https://github.com/JasonKessler/Scattertext-PyData
- https://nbviewer.jupyter.org/github/JasonKessler/Scattertext-PyData/blob/master/PyData-Scattertext-Part-1.ipynb

In [4]:
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer
display(HTML("<style>.container {width:98% !important;}</style>"))

In [5]:
nlp = spacy.load('en')

### Data prep & processing

In [6]:
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df.iloc[0]

party                                               democrat
speaker                                         BARACK OBAMA
text       Thank you. Thank you. Thank you. Thank you so ...
Name: 0, dtype: object

In [8]:
print('Document Count')
print(convention_df.groupby('party')['text'].count())
print('Word Count')
convention_df.groupby('party').apply(lambda x: x.text.apply(lambda x: len(x.split())).sum())
convention_df['parsed'] = convention_df.text.apply(nlp)

Document Count
party
democrat      123
republican     66
Name: text, dtype: int64
Word Count


### Convert to Scattext corpus

In [9]:
corpus = st.CorpusFromParsedDocuments(convention_df, category_col='party', parsed_col='parsed').build()

In [10]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['dem_precision'] = term_freq_df['democrat freq'] * 1./(term_freq_df['democrat freq'] + term_freq_df['republican freq'])
term_freq_df['dem_freq_pct'] = term_freq_df['democrat freq'] * 1./term_freq_df['democrat freq'].sum()
term_freq_df['dem_hmean'] = term_freq_df.apply(lambda x: (hmean([x['dem_precision'], x['dem_freq_pct']])
                                                                   if x['dem_precision'] > 0 and x['dem_freq_pct'] > 0 
                                                                   else 0), axis=1)                                                        
term_freq_df.sort_values(by='dem_hmean', ascending=False).iloc[:10]

Unnamed: 0_level_0,democrat freq,republican freq,dem_precision,dem_freq_pct,dem_hmean
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
the,3402,2532,0.573306,0.022408,0.04313
and,2709,2233,0.548159,0.017843,0.034562
to,2340,1667,0.583978,0.015413,0.030033
a,1602,1346,0.543419,0.010552,0.020702
of,1569,1377,0.532587,0.010335,0.020276
that,1400,1051,0.571195,0.009221,0.01815
we,1318,1146,0.534903,0.008681,0.017085
in,1291,986,0.566974,0.008503,0.016756
i,1098,851,0.563366,0.007232,0.014281
's,1037,631,0.621703,0.00683,0.013512
