In [1]:
import pandas as pd
import numpy as np
import configparser
config = configparser.ConfigParser()

config.read("../final_project_files/env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [2]:
OHCO = ['book_title','chap_num', 'para_num', 'sent_num', 'token_num']
bags = dict(
    SENTS = OHCO[:4],
    PARAS = OHCO[:3],
    CHAPS = OHCO[:2],
    BOOKS = OHCO[:1]
)

In [3]:
CORPUS = pd.read_csv(f"{output_dir}\\CORPUS.csv").set_index(OHCO)
CORPUS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str,pos,pos_group
book_title,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01_a_game_of_thrones,1,0,1,1,We,we,PRP,PR
01_a_game_of_thrones,1,0,1,2,should,should,MD,MD
01_a_game_of_thrones,1,0,1,3,start,start,VB,VB
01_a_game_of_thrones,1,0,1,4,back,back,RP,RP
01_a_game_of_thrones,1,0,1,7,Gared,gared,VBD,VB


### BOW

In [4]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW

### TFIDF

In [5]:
def get_tfidf(BOW, tf_method='max', df_method='standard', item_type='term_str'):
            
    DTCM = BOW.n.unstack(fill_value=0) # Create Doc-Term Count Matrix
    
    if tf_method == 'sum':
        TF = (DTCM.T / DTCM.T.sum()).T
    elif tf_method == 'max':
        TF = (DTCM.T / DTCM.T.max()).T
    elif tf_method == 'log':
        TF = (np.log2(DTCM.T + 1)).T
    elif tf_method == 'raw':
        TF = DTCM
    elif tf_method == 'bool':
        TF = DTCM.astype('bool').astype('int')
    else:
        raise ValueError(f"TF method {tf_method} not found.")

    DF = DTCM.count() # Assumes NULLs 
    N_docs = len(DTCM)
    
    if df_method == 'standard':
        IDF = np.log2(N_docs/DF) # This what the students were asked to use
    elif df_method == 'textbook':
        IDF = np.log2(N_docs/(DF + 1))
    elif df_method == 'sklearn':
        IDF = np.log2(N_docs/DF) + 1
    elif df_method == 'sklearn_smooth':
        IDF = np.log2((N_docs + 1)/(DF + 1)) + 1
    else:
        raise ValueError(f"DF method {df_method} not found.")
    
    TFIDF = TF * IDF
    
    DFIDF = DF * IDF
    
    TFIDF = TFIDF.fillna(0)

    return TFIDF, DFIDF, DTCM

### Using Fuctions

In [6]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n')
VOCAB.index.name = 'term_str'
VOCAB['max_pos'] = CORPUS.value_counts(['term_str','pos']).unstack().idxmax(1)
VOCAB.head()

Unnamed: 0_level_0,n,max_pos
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
the,157913,DT
and,74625,CC
to,65973,TO
a,62275,DT
of,56741,IN


In [7]:
bag = bags['BOOKS']
BOW = create_bow(CORPUS, bag)
TFIDF, DFIDF, DTM = get_tfidf(BOW, tf_method='log', df_method='sklearn_smooth')

In [8]:
VOCAB['dfidf'] = DFIDF
VOCAB['mean_tfidf'] = TFIDF.mean()

In [9]:
VOCAB.head(2)

Unnamed: 0_level_0,n,max_pos,dfidf,mean_tfidf
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
the,157913,DT,18.0,12.667898
and,74625,CC,18.0,11.492454


In [10]:
bag2 = bags['CHAPS']
BOW2 = create_bow(CORPUS, bag2)
TFIDF2, DFIDF2, DTM2 = get_tfidf(BOW2, tf_method='log', df_method='sklearn_smooth')

In [11]:
VOCAB2 = VOCAB.copy()
VOCAB2['dfidf'] = DFIDF2
VOCAB2['mean_tfidf'] = TFIDF2.mean()

In [12]:
VOCAB.to_csv(f"{output_dir}\\BOW_books.csv")
VOCAB2.to_csv(f"{output_dir}\\BOW_chaps.csv")

TFIDF.to_csv(f"{output_dir}\\TFIDF_books.csv")
TFIDF2.to_csv(f"{output_dir}\\TFIDF_chaps.csv")

DTM.to_csv(f"{output_dir}\\DTM_books.csv")
DTM2.to_csv(f"{output_dir}\\DTM_chaps.csv")

### Reduce and Normalize TFIDF with L2

In [13]:
from numpy.linalg import norm
from scipy.spatial.distance import pdist

In [14]:
n_terms = 1000

In [15]:
VIDX = VOCAB.sort_values('dfidf', ascending=False).head(n_terms).index
VIDX2 = VOCAB2.sort_values('dfidf', ascending=False).head(n_terms).index

In [16]:
M = TFIDF[VIDX].fillna(0).groupby('book_title').mean() # MUST FILLNA
M2 = TFIDF2[VIDX2].fillna(0).groupby('book_title').mean()
M.head()

term_str,the,ensued,resulted,beyondthewall,privates,tremond,ryon,redirected,references,stalactites,...,fastmoving,parttime,surrogate,overthecounter,congratulatory,diana,acquaintances,thinlipped,director,wastrels
book_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01_a_game_of_thrones,14.112032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01_into_the_wild,11.729196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01_the_fire_within,11.386401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02_a_clash_of_kings,14.18913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02_fire_and_ice,12.010528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
L2 = M.apply(lambda x: x / norm(x), 1) # Euclidean
L2_2 = M2.apply(lambda x: x / norm(x), 1) # Euclidean

In [18]:
L2.to_csv(f"{output_dir}\\l2_norm_books.csv")
L2_2.to_csv(f"{output_dir}\\l2_norm_chaps.csv")

In [19]:
PAIRS = M.T.corr().stack().to_frame('correl')
PAIRS.index.names = ['doc_a','doc_b']
PAIRS = PAIRS.query("doc_a > doc_b") # Remove identities and reverse duplicates

PAIRS2 = M2.T.corr().stack().to_frame('correl')
PAIRS2.index.names = ['doc_a','doc_b']
PAIRS2 = PAIRS2.query("doc_a > doc_b") # Remove identities and reverse duplicates

In [20]:
general_method = 'weighted' # single, complete, average, weighted 
euclidean_method = 'ward' # ward, centroid, median
combos  = [
    (L2, 'euclidean', 'euclidean-ward', 'ward'),
    (L2,  'euclidean', 'euclidean-centroid', 'centroid'),
    (L2,  'euclidean', 'euclidean-median', 'median')
]

In [21]:
for X, metric, label, _ in combos:
    PAIRS[label] = pdist(X, metric)
    PAIRS2[label] = pdist(X, metric)

In [22]:
PAIRS.style.background_gradient('GnBu', high=.5)

Unnamed: 0_level_0,Unnamed: 1_level_0,correl,euclidean-ward,euclidean-centroid,euclidean-median
doc_a,doc_b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01_into_the_wild,01_a_game_of_thrones,0.825586,0.596294,0.596294,0.596294
01_the_fire_within,01_a_game_of_thrones,0.714069,0.74832,0.74832,0.74832
01_the_fire_within,01_into_the_wild,0.835395,0.535519,0.535519,0.535519
02_a_clash_of_kings,01_a_game_of_thrones,0.859941,0.653871,0.653871,0.653871
02_a_clash_of_kings,01_into_the_wild,0.948617,1.029882,1.029882,1.029882
02_a_clash_of_kings,01_the_fire_within,0.832461,0.917389,0.917389,0.917389
02_fire_and_ice,01_a_game_of_thrones,0.786259,0.723727,0.723727,0.723727
02_fire_and_ice,01_into_the_wild,0.909909,0.787724,0.787724,0.787724
02_fire_and_ice,01_the_fire_within,0.796211,0.961695,0.961695,0.961695
02_fire_and_ice,02_a_clash_of_kings,0.907054,0.863646,0.863646,0.863646


In [23]:
PAIRS2.style.background_gradient('GnBu', high=.5)

Unnamed: 0_level_0,Unnamed: 1_level_0,correl,euclidean-ward,euclidean-centroid,euclidean-median
doc_a,doc_b,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01_into_the_wild,01_a_game_of_thrones,0.999752,0.596294,0.596294,0.596294
01_the_fire_within,01_a_game_of_thrones,0.999469,0.74832,0.74832,0.74832
01_the_fire_within,01_into_the_wild,0.999515,0.535519,0.535519,0.535519
02_a_clash_of_kings,01_a_game_of_thrones,0.999869,0.653871,0.653871,0.653871
02_a_clash_of_kings,01_into_the_wild,0.999872,1.029882,1.029882,1.029882
02_a_clash_of_kings,01_the_fire_within,0.999605,0.917389,0.917389,0.917389
02_fire_and_ice,01_a_game_of_thrones,0.999594,0.723727,0.723727,0.723727
02_fire_and_ice,01_into_the_wild,0.999632,0.787724,0.787724,0.787724
02_fire_and_ice,01_the_fire_within,0.999354,0.961695,0.961695,0.961695
02_fire_and_ice,02_a_clash_of_kings,0.99972,0.863646,0.863646,0.863646
