In [1]:
import configparser
from pathlib import Path

import numpy as np
import pandas as pd
# Load machine-specific locations from the project's env.ini file.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here instead of hitting an opaque
# KeyError on the lookups below when the file is missing.
if not config.read("../final_project_files/env.ini"):
    raise FileNotFoundError("Could not read ../final_project_files/env.ini")

data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']  # directory the notebook reads its CSV files from

In [2]:
# Index levels of the token table, ordered from coarsest (book) to finest (token).
OHCO = ['book_title', 'chap_num', 'para_num', 'sent_num', 'token_num']

# Each "bag" is the OHCO prefix that identifies one document at that granularity.
bags = {
    'SENTS': OHCO[:4],
    'PARAS': OHCO[:3],
    'CHAPS': OHCO[:2],
    'BOOKS': OHCO[:1],
}

In [3]:
# Load the token table and index it by the OHCO levels.
# The path is built with pathlib instead of a hard-coded "\\" separator, which
# only resolves to a file inside output_dir on Windows.
CORPUS = pd.read_csv(Path(output_dir) / "CORPUS.csv").set_index(OHCO)
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos,pos_group
book_title,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01_a_game_of_thrones,1,0,1,1,We,we,PRP,PR
01_a_game_of_thrones,1,0,1,2,should,should,MD,MD
01_a_game_of_thrones,1,0,1,3,start,start,VB,VB
01_a_game_of_thrones,1,0,1,4,back,back,RP,RP
01_a_game_of_thrones,1,0,1,7,Gared,gared,VBD,VB


### BOW

In [4]:
def create_bow(CORPUS, bag, item_type='term_str'):
    """Count how often each item occurs in each document ("bag").

    Parameters
    ----------
    CORPUS : pandas.DataFrame
        Token table; the names in ``bag`` and ``item_type`` must be resolvable
        by ``groupby`` (index levels or columns).
    bag : list of str
        Keys that identify one document.
    item_type : str
        Column holding the item to count.

    Returns
    -------
    pandas.DataFrame
        A single column ``n`` indexed by ``bag + [item_type]``.
    """
    group_keys = bag + [item_type]
    counts = CORPUS.groupby(group_keys)[item_type].count()
    return counts.to_frame('n')

### TFIDF

In [5]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
    """Compute TF-IDF weights from a bag-of-words count table.

    Parameters
    ----------
    BOW : pandas.DataFrame
        Count table with an ``n`` column and a MultiIndex whose last level is
        the term; the remaining level(s) identify the document.
    tf_method : {'sum', 'max', 'log', 'raw', 'bool'}
        Term-frequency weighting:
        'sum'  - count divided by the document's total count;
        'max'  - count divided by the document's largest count;
        'log'  - log2(count + 1);
        'raw'  - the count itself;
        'bool' - 1 if the term occurs in the document, else 0.
    df_method : {'standard', 'textbook', 'sklearn', 'sklearn_smooth'}
        Inverse-document-frequency formula (N = number of documents):
        'standard'       - log2(N / DF);
        'textbook'       - log2(N / (DF + 1));
        'sklearn'        - log2(N / DF) + 1;
        'sklearn_smooth' - log2((N + 1) / (DF + 1)) + 1.
    item_type : str
        Accepted for interface compatibility; not used by the computation.

    Returns
    -------
    TFIDF : pandas.DataFrame
        Document-by-term TF-IDF matrix; cells for absent terms are 0.
    DFIDF : pandas.Series
        DF * IDF for every term.
    DTCM : pandas.DataFrame
        Document-by-term count matrix; NaN where a term is absent.

    Raises
    ------
    ValueError
        If ``tf_method`` or ``df_method`` is not one of the names above.
    """
    # Doc-term count matrix. unstack() leaves NaN wherever a term does not
    # occur in a document; DF below relies on that.
    DTCM = BOW.n.unstack()

    if tf_method == 'sum':
        TF = DTCM.div(DTCM.sum(axis=1), axis=0)
    elif tf_method == 'max':
        TF = DTCM.div(DTCM.max(axis=1), axis=0)
    elif tf_method == 'log':
        TF = np.log2(DTCM + 1)
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        # NaN is truthy, so casting the raw matrix to bool would mark every
        # absent term as present. Zero-fill first.
        TF = DTCM.fillna(0).astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count()  # non-null cells per column = number of docs containing the term
    N_docs = len(DTCM)

    if df_method == 'standard':
        IDF = np.log2(N_docs / DF)
    elif df_method == 'textbook':
        IDF = np.log2(N_docs / (DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs / DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1) / (DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")

    # IDF is indexed by term, so the product broadcasts across the columns.
    # NaN cells (term absent from the document) become 0.
    TFIDF = (TF * IDF).fillna(0)
    DFIDF = DF * IDF

    return TFIDF, DFIDF, DTCM

### Using Functions

In [6]:
# Vocabulary table: one row per term with its corpus-wide count `n`.
VOCAB = (
    CORPUS['term_str']
    .value_counts()
    .to_frame('n')
    .rename_axis('term_str')
)
# Most frequent POS tag for each term: count (term, pos) pairs, pivot the tags
# into columns, then take the column label with the largest count per row.
VOCAB['max_pos'] = (
    CORPUS
    .value_counts(['term_str', 'pos'])
    .unstack()
    .idxmax(axis=1)
)
VOCAB.head()

Unnamed: 0_level_0,n,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
the,157913,DT
and,74625,CC
to,65973,TO
a,62275,DT
of,56741,IN


In [7]:
# Book-level documents: count terms per book, then weight with log TF and
# smoothed sklearn-style IDF.
bag = bags['BOOKS']
BOW = create_bow(CORPUS=CORPUS, bag=bag)
TFIDF, DFIDF, DTM = get_tfidf(
    BOW=BOW,
    tf_method='log',
    df_method='sklearn_smooth',
)

In [8]:
# Attach the book-level statistics to the vocabulary; both Series are indexed
# by term, so they align on VOCAB's index. TFIDF.mean() averages each term's
# column over all documents.
VOCAB = VOCAB.assign(
    dfidf=DFIDF,
    mean_tfidf=TFIDF.mean(),
)

In [9]:
# Preview the first two vocabulary rows.
VOCAB.head(2)

Unnamed: 0_level_0,n,max_pos,dfidf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,157913,DT,18.0,12.667898
and,74625,CC,18.0,11.492454


In [10]:
# Spot-check the book-level TF-IDF matrix: 10 terms drawn from the 200 most
# frequent, shown for 10 randomly drawn documents. Both draws are seeded so
# the displayed table is the same on every run.
(
    TFIDF[
        VOCAB.sort_values('n', ascending=False)
        .head(200)
        .sample(10, random_state=0)
        .index
    ]
    .sample(10, random_state=0)
    .fillna(0)
    .style.background_gradient(cmap='GnBu', high=.75)
)

term_str,heard,right,before,enough,make,voice,brother,t,would,red
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
05_a_dangerous_path,5.70044,6.442943,6.857981,5.357552,6.228819,6.629357,3.257836,9.257388,8.016808,1.0
06_fire_world,5.459432,7.076816,7.312883,5.643856,6.357552,5.61471,4.418301,8.948367,7.857981,5.491853
03_a_storm_of_swords,8.531381,7.839204,9.438792,8.61471,8.535275,7.67948,9.962546,5.169925,10.695228,8.78136
05_dark_fire,5.129283,6.988685,7.0,4.906891,6.247928,5.906891,3.257836,8.857981,7.67948,4.523562
05_a_dance_with_dragons,8.154818,7.72792,9.422065,8.584963,8.682995,7.491853,9.147529,8.550747,10.814582,8.876517
06_the_darkest_hour,5.554589,6.247928,7.066089,5.044394,6.189825,6.33985,4.014548,8.703904,8.243174,3.321928
01_the_fire_within,4.906891,6.285402,5.554589,4.087463,5.523562,5.247928,0.0,8.778077,5.754888,4.169925
01_a_game_of_thrones,7.982994,7.475733,8.672425,7.787903,8.022368,8.23362,10.186648,9.276124,10.134426,7.988685
03_forest_of_secrets,6.321928,6.523562,6.988685,5.857981,6.044394,6.044394,3.257836,8.962896,7.918863,2.321928
07_the_fire_ascending,6.491853,6.882643,7.209453,6.066089,5.857981,6.209453,2.694516,9.355351,8.082149,5.129283


In [11]:
# Chapter-level documents: same pipeline as above, with (book, chapter) as the bag.
bag2 = bags['CHAPS']
BOW2 = create_bow(CORPUS=CORPUS, bag=bag2)
TFIDF2, DFIDF2, DTM2 = get_tfidf(
    BOW=BOW2,
    tf_method='log',
    df_method='sklearn_smooth',
)

In [12]:
# Chapter-level vocabulary: a copy of VOCAB whose dfidf / mean_tfidf columns
# are taken from the chapter-level matrices. assign() returns a new frame, so
# VOCAB itself is left untouched.
VOCAB2 = VOCAB.assign(
    dfidf=DFIDF2,
    mean_tfidf=TFIDF2.mean(),
)

In [13]:
# Preview the chapter-level TF-IDF matrix (rows: book/chapter, columns: terms).
TFIDF2.head()

Unnamed: 0_level_0,term_str,0,031,032,1,10,100,101,102,103,104,...,zoomed,zooming,zorse,zorses,zz,zzed,zzing,zzle,zzled,zzzs
book_title,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01_a_game_of_thrones,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01_a_game_of_thrones,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01_a_game_of_thrones,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01_a_game_of_thrones,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01_a_game_of_thrones,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Spot-check the chapter-level TF-IDF matrix: 10 terms drawn from the 200 most
# frequent, shown for 10 randomly drawn chapters. Both draws are seeded so the
# displayed table is the same on every run.
(
    TFIDF2[
        VOCAB2.sort_values('n', ascending=False)
        .head(200)
        .sample(10, random_state=0)
        .index
    ]
    .sample(10, random_state=0)
    .fillna(0)
    .style.background_gradient(cmap='GnBu', high=.75)
)

Unnamed: 0_level_0,term_str,made,looked,too,first,our,much,cat,its,them,eyes
book_title,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
05_a_dance_with_dragons,442,3.207363,0.0,2.561054,2.079184,4.708218,0.0,3.169925,0.0,3.07347,3.106854
06_the_darkest_hour,15,2.541775,4.27983,4.176899,4.827715,2.027719,0.0,10.530263,3.757216,4.755923,4.660281
05_a_dance_with_dragons,863,2.541775,1.843222,0.0,2.079184,0.0,1.943553,0.0,0.0,2.760728,0.0
05_a_dance_with_dragons,750,1.603682,0.0,0.0,2.079184,0.0,0.0,0.0,0.0,0.0,0.0
06_fire_world,7,0.0,1.843222,0.0,0.0,0.0,0.0,0.0,0.0,3.07347,2.462123
05_a_dance_with_dragons,790,0.0,0.0,0.0,2.079184,4.055438,1.943553,0.0,0.0,1.88449,0.0
05_a_dance_with_dragons,647,1.603682,0.0,0.0,0.0,3.213859,1.943553,0.0,0.0,2.760728,0.0
04_rising_storm,13,1.603682,5.174579,1.615845,3.295428,2.027719,0.0,3.169925,0.0,2.377961,4.361021
06_the_darkest_hour,7,4.145457,2.921438,3.23169,4.827715,3.213859,3.887105,12.06903,0.0,1.188981,5.914448
05_a_dance_with_dragons,610,1.603682,0.0,0.0,2.079184,3.213859,0.0,0.0,2.977523,2.760728,1.553427


In [15]:
# VOCAB.to_csv(f"{output_dir}\\BOW_books.csv")
# VOCAB2.to_csv(f"{output_dir}\\BOW_chaps.csv")

# TFIDF.to_csv(f"{output_dir}\\TFIDF_books.csv")
# TFIDF2.to_csv(f"{output_dir}\\TFIDF_chaps.csv")

# DTM.to_csv(f"{output_dir}\\DTM_books.csv")
# DTM2.to_csv(f"{output_dir}\\DTM_chaps.csv")

### Reduce and Normalize TFIDF with L2

In [16]:
from numpy.linalg import norm
from scipy.spatial.distance import pdist

In [17]:
# Book-level DFIDF for every term, highest first.
VOCAB.dfidf.sort_values(ascending=False)

term_str
brush         18.774517
rights        18.774517
personal      18.774517
spin          18.774517
midst         18.774517
                ...    
stabilized     4.247928
offcourse      4.247928
plankton       4.247928
skeptic        4.247928
eros           4.247928
Name: dfidf, Length: 34790, dtype: float64

In [18]:
# Ten terms with the highest chapter-level DFIDF.
VOCAB2.dfidf.sort_values(ascending=False).head(10)

term_str
or      1727.700359
into    1727.694211
like    1727.667420
down    1727.608852
who     1727.606669
do      1727.073732
more    1726.866404
out     1726.775295
did     1726.775295
your    1726.629895
Name: dfidf, dtype: float64

In [19]:
# Number of top-DFIDF terms to keep when reducing the vocabulary.
n_terms = 1000

# POS tags a term's dominant tag must match to be kept.
pos_list = (
    ['JJ', 'JJR', 'JJS']                           # adjectives: base, comparative, superlative
    + ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']    # verbs: base, past, gerund, past participle, non-3sg present, 3sg present
    + ['NNS']                                      # plural nouns
)

In [20]:
# Reduced vocabularies: among terms whose dominant POS tag is in pos_list,
# keep the n_terms with the highest DFIDF (book level, then chapter level).
VIDX = (
    VOCAB[VOCAB.max_pos.isin(pos_list)]
    .sort_values('dfidf', ascending=False)
    .head(n_terms)
    .index
)
VIDX2 = (
    VOCAB2[VOCAB2.max_pos.isin(pos_list)]
    .sort_values('dfidf', ascending=False)
    .head(n_terms)
    .index
)

In [22]:
# Restrict each TF-IDF matrix to its reduced vocabulary and average per
# document key. NaNs must be filled before averaging.
M = (
    TFIDF.loc[:, VIDX]
    .fillna(0)  # MUST FILLNA
    .groupby(level='book_title')
    .mean()
)
M2 = (
    TFIDF2.loc[:, VIDX2]
    .fillna(0)
    .groupby(level=['book_title', 'chap_num'])
    .mean()
)
M.head()

term_str,hearts,tempted,streamed,advanced,keeps,overwhelmed,generous,glistened,count,drained,...,deserves,connected,joked,taut,reluctant,write,engulfed,boulders,pounded,accusing
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01_a_game_of_thrones,3.764767,1.341037,3.764767,1.341037,4.639226,2.125493,3.113791,2.125493,6.066264,2.682074,...,3.094976,1.547488,2.45271,3.094976,1.547488,6.325299,1.547488,1.547488,2.45271,2.45271
01_into_the_wild,1.341037,1.341037,1.341037,0.0,0.0,2.125493,1.341037,2.125493,0.0,0.0,...,0.0,1.547488,1.547488,0.0,2.45271,0.0,1.547488,3.094976,0.0,1.547488
01_the_fire_within,1.341037,0.0,0.0,0.0,1.341037,1.341037,1.341037,1.341037,0.0,0.0,...,0.0,0.0,1.547488,0.0,0.0,6.325299,1.547488,0.0,4.000198,1.547488
02_a_clash_of_kings,4.639226,2.682074,5.364148,2.125493,5.89026,2.125493,3.764767,2.682074,6.93306,2.125493,...,2.45271,0.0,2.45271,4.90542,3.593155,5.353428,3.593155,3.593155,2.45271,1.547488
02_fire_and_ice,0.0,1.341037,2.682074,0.0,1.341037,3.764767,1.341037,2.125493,1.341037,2.125493,...,2.45271,0.0,2.45271,0.0,2.45271,0.0,0.0,3.094976,3.094976,1.547488


In [23]:
# Preview the reduced chapter-level matrix.
M2.head()

Unnamed: 0_level_0,term_str,do,more,did,been,are,eyes,see,come,know,made,...,cities,seek,rough,driven,tommen,prepared,cursed,trained,early,interrupted
book_title,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01_a_game_of_thrones,1,2.223678,3.611557,5.449372,5.606807,4.647198,4.924247,5.625195,4.487019,2.535385,4.811045,...,0.0,0.0,0.0,0.0,0.0,4.997346,0.0,0.0,0.0,0.0
01_a_game_of_thrones,2,5.029646,1.397141,4.825251,4.115125,3.472487,6.213708,4.973956,3.196617,2.535385,3.207363,...,0.0,0.0,0.0,4.98327,0.0,0.0,4.997346,0.0,0.0,0.0
01_a_game_of_thrones,3,1.402984,2.214416,4.184431,4.348213,3.771236,4.660281,3.643357,3.711157,3.1993,1.603682,...,0.0,7.898297,0.0,0.0,4.98327,0.0,0.0,0.0,0.0,0.0
01_a_game_of_thrones,4,4.853528,3.244061,4.825251,5.22258,3.771236,5.373974,4.056087,3.711157,4.135035,2.541775,...,9.966541,0.0,9.966541,0.0,0.0,4.997346,0.0,0.0,0.0,0.0
01_a_game_of_thrones,5,3.257629,5.319412,5.000343,5.719921,5.706417,5.160373,4.405045,5.309466,3.1993,2.541775,...,0.0,0.0,4.98327,4.98327,0.0,0.0,0.0,0.0,4.997346,0.0


In [None]:
def _unit_rows(frame):
    """Return `frame` with every row scaled to unit Euclidean (L2) length.

    Rows that are entirely zero are returned unchanged instead of becoming
    NaN through a 0/0 division.
    """
    lengths = norm(frame.to_numpy(), axis=1)
    lengths[lengths == 0] = 1.0  # guard: an all-zero document vector has norm 0
    return frame.div(lengths, axis=0)


L2 = _unit_rows(M) # Euclidean
L2_2 = _unit_rows(M2) # Euclidean

In [27]:
# Vocabulary rows for the reduced term sets. VIDX2 was selected from VOCAB2
# (chapter-level dfidf), so its rows are taken from VOCAB2 as well; taking
# them from VOCAB would attach the book-level statistics instead.
VSHORT = VOCAB.loc[VIDX]
VSHORT2 = VOCAB2.loc[VIDX2]

In [28]:
# L2.to_csv(f"{output_dir}\\l2_norm_books.csv")
# L2_2.to_csv(f"{output_dir}\\l2_norm_chaps.csv")
# VSHORT.to_csv(f"{output_dir}\\VSHORT_books.csv")
# VSHORT2.to_csv(f"{output_dir}\\VSHORT_chaps.csv")

In [None]:
# Preview: correlation between every pair of documents (rows of M), in long form.
M.T.corr().stack().to_frame('correl')

In [None]:
# Long-form table of document pairs with their correlation over the rows of M.
PAIRS = M.T.corr().stack().to_frame('correl')
PAIRS.index.names = ['doc_a', 'doc_b']
# Keep each unordered pair once and drop identities. The upper triangle
# (doc_a < doc_b) is kept because scipy's pdist returns distances in exactly
# that order: (0,1), (0,2), ..., (1,2), ... Filtering on doc_a > doc_b yields
# the lower triangle, whose row order differs, so distance columns assigned
# positionally would land on the wrong pair labels.
# This relies on M's index being sorted, which holds because M comes from a groupby.
# copy() gives an independent frame so later column assignments do not write
# into a filtered slice.
PAIRS = PAIRS.query("doc_a < doc_b").copy()

In [None]:
# Linkage-method choices, kept for reference; not read by the code in this cell.
general_method = 'weighted' # single, complete, average, weighted 
euclidean_method = 'ward' # ward, centroid, median

# One entry per distance column to compute:
# (data matrix, pdist metric, PAIRS column label, linkage method).
combos = [
    (L2, 'euclidean', f'euclidean-{linkage_name}', linkage_name)
    for linkage_name in ('ward', 'centroid', 'median')
]

In [None]:
# Add one distance column to PAIRS per entry in `combos`.
# pdist returns a condensed vector ordered by row pairs (i, j) with i < j, and
# the assignment is positional, so PAIRS' rows must already be in that order.
for X, metric, label, _ in combos:
    PAIRS[label] = pdist(X, metric)

In [None]:
# Show the pair table as a heatmap.
PAIRS.style.background_gradient('GnBu', high=.5)

### Visualizations: Trees

In [None]:
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

In [None]:
# Load the library (book metadata) table.
# The path is built with pathlib instead of a hard-coded "\\" separator, which
# only resolves to a file inside output_dir on Windows.
LIB = pd.read_csv(Path(output_dir) / "LIB.csv")

In [None]:
# Build a display label of the form
# "<author up to the first ', '>: <first 20 chars of title> (<year>)"
# and key the table by book_title.
LIB = LIB.assign(
    label=(
        LIB.author.str.split(', ').str[0]
        + ': '
        + LIB.book_title.str[:20]
        + ' ('
        + LIB.year.astype('str')
        + ')'
    )
).set_index('book_title')
LIB.head()

In [None]:
def hac(sims, linkage_method='complete', color_thresh=.3, figsize=(10, 10)):
    """Draw a hierarchical agglomerative clustering dendrogram.

    Parameters
    ----------
    sims : array-like
        Passed straight to ``scipy.cluster.hierarchy.linkage``; the calls in
        this notebook pass a condensed distance vector produced by ``pdist``.
    linkage_method : str
        Linkage method name forwarded to ``linkage``.
    color_thresh : float
        Forwarded as ``color_threshold`` to ``dendrogram``.
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.

    Notes
    -----
    Leaf labels are read positionally from the global ``LIB.label``.
    NOTE(review): this assumes LIB's rows are in the same order as the
    documents behind ``sims`` - confirm.
    """
    # Generate the clustering
    tree = sch.linkage(sims, method=linkage_method)

    # Get labels for the leaves
    labels = LIB.label.values

    # Create a single figure and draw on its axes explicitly. A separate
    # plt.figure() call beforehand would only leave an extra, empty figure.
    fig, ax = plt.subplots(figsize=figsize)

    # Create a dendrogram with the tree.
    # NOTE(review): SciPy documents count_sort and distance_sort as mutually
    # exclusive - confirm which ordering is intended.
    sch.dendrogram(tree,
                   labels=labels,
                   orientation="left",
                   count_sort=True,
                   distance_sort=True,
                   above_threshold_color='.75',
                   color_threshold=color_thresh,
                   ax=ax
                  )

    # Change the appearance of ticks and tick labels
    ax.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# Dendrogram with Ward linkage over the 'euclidean-ward' distance column.
hac(PAIRS['euclidean-ward'], linkage_method='ward', color_thresh=.9)

In [None]:
# Dendrogram with centroid linkage over the 'euclidean-centroid' distance column.
hac(PAIRS['euclidean-centroid'], linkage_method='centroid', color_thresh=.5)

In [None]:
# Dendrogram with median linkage over the 'euclidean-median' distance column.
hac(PAIRS['euclidean-median'], linkage_method='median', color_thresh=.5)

### Visualizations: K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
def get_k_clusters(k=10, n_init='auto', random_state=0):
    """Cluster the documents with K-Means and show the assigned cluster ids.

    Fits K-Means twice, on the raw matrix ``M`` and on the L2-normalised
    matrix ``L2``, and stores the labels in ``LIB`` as ``y_raw_{k}`` and
    ``y_L2_{k}``. Note that this mutates the global ``LIB``.

    Parameters
    ----------
    k : int
        Number of clusters.
    n_init : int or 'auto'
        Forwarded to ``KMeans``.
    random_state : int or None
        Seed forwarded to ``KMeans`` so repeated runs give the same
        assignment; pass ``None`` for unseeded initialisation.

    Returns
    -------
    pandas.io.formats.style.Styler
        The two label columns for this ``k``, indexed and sorted by ``label``,
        with a background gradient.
    """
    raw_col = f'y_raw_{k}'
    l2_col = f'y_L2_{k}'

    # NOTE(review): labels are assigned positionally; this assumes LIB's rows
    # are in the same order as the rows of M / L2 - confirm.
    LIB[raw_col] = KMeans(k, n_init=n_init, random_state=random_state).fit_predict(M)
    LIB[l2_col] = KMeans(k, n_init=n_init, random_state=random_state).fit_predict(L2)

    # Select exactly this run's columns. A substring filter on f'_{k}' would
    # also match other runs (e.g. k=1 matches 'y_raw_10').
    y_cols = [raw_col, l2_col]
    return LIB.reset_index().set_index('label')[y_cols].sort_values('label').style.background_gradient("YlGnBu")

In [None]:
# K-Means with 3 clusters.
get_k_clusters(3)

In [None]:
# K-Means with 4 clusters.
get_k_clusters(4)

In [None]:
# LIB.to_csv(f"{output_dir}\\LIB_LABELS.csv")