In [1]:
import pandas as pd
import numpy as np
import configparser
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. An empty result therefore means env.ini was not
# found; fail here with the path instead of a bare KeyError on 'data_home'.
if not config.read("../final_project_files/env.ini"):
    raise FileNotFoundError(
        "Could not read configuration file '../final_project_files/env.ini' "
        "(path is relative to the notebook's working directory)"
    )

# Directories used by the rest of the notebook, taken from the DEFAULT section.
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px

In [4]:
# Hierarchical index levels, ordered from book down to individual token.
OHCO = ['book_title', 'chap_num', 'para_num', 'sent_num', 'token_num']

# Each bag is the OHCO prefix that identifies one document at that granularity.
bags = {
    'SENTS': OHCO[:4],
    'PARAS': OHCO[:3],
    'CHAPS': OHCO[:2],
    'BOOKS': OHCO[:1],
}

# This notebook treats every chapter as one document.
bag = bags['CHAPS']

### Vector Space

In [5]:
# Load the vocabulary table and key it by term string.
# A forward slash is used as the path separator because it is accepted on
# Windows as well as POSIX systems; a hard-coded backslash only resolves on Windows.
VOCAB = pd.read_csv(f"{output_dir}/VSHORT_chaps.csv").set_index('term_str')
VOCAB.shape

(1000, 4)

In [6]:
# Terms kept in the model: exactly the index of VOCAB.
vocab_list = list(VOCAB.index)

# Load the document-term matrix, keep only the bag (index) columns plus the
# VOCAB terms, treat missing counts as 0, and index the rows by `bag`.
# - The forward slash works on both Windows and POSIX; a hard-coded backslash
#   only resolves on Windows.
# - The column mask is built once with Index.isin instead of rebuilding and
#   scanning `vocab_list + bag` for every column.
DTM = (
    pd.read_csv(f"{output_dir}/DTM_chaps.csv")
    .loc[:, lambda df: df.columns.isin(vocab_list + bag)]
    .fillna(0)
    .set_index(bag)
)

In [13]:
# Preview the first two rows of the document-term matrix.
DTM.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,abandoned,able,accept,added,admitted,afraid,agree,agreed,alive,allow,...,write,writing,written,wrong,yards,years,yellow,young,younger,yours
book_title,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
01_a_game_of_thrones,1,2.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,4.0,0.0,3.0,0.0,5.0,0.0,0.0
01_a_game_of_thrones,2,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0


In [8]:
# set(list(DTM.index.get_level_values(0)))

In [14]:
# Reshape DTM to long form: one row per (document, term) pair with its count
# held in a temporary 'rand' column.
place = (
    DTM.stack()
    .rename_axis(bag + ['term_str'])
    .to_frame('rand')
    .reset_index()
)

# Keep only the pairs whose count is present and non-zero, then discard the
# count itself so that only the document keys and the term remain.
has_term = place['rand'].notna() & (place['rand'] != 0)
place = place.loc[has_term, bag + ['term_str']]

# Build one space-separated string per document from its surviving terms.
# NOTE(review): every term appears once per document regardless of its count
# in DTM, so downstream counts reflect presence rather than frequency.
DOCS = (
    place.groupby(bag)['term_str']
    .agg(' '.join)
    .to_frame('doc_str')
)
DOCS.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str
book_title,chap_num,Unnamed: 2_level_1
06_the_darkest_hour,5,alive am ancient angry announced appeared appr...
04_fire_star,7,allowed amber anxious are asked asleep be bear...
04_fire_star,46,appeared are be beg began better brought caugh...
05_a_dance_with_dragons,29,afraid agreed alive am appear are aren asked a...
02_a_clash_of_kings,47,afraid am ancient announced are arms arrived a...


In [10]:
# Number of documents (DTM rows) in which each term has a non-zero entry.
VOCAB['doc_count'] = DTM.ne(0).sum(axis=0)

# Sum of all term counts in each document's DTM row.
DOCS['term_count'] = DTM.sum(axis=1)

In [11]:
# Show five randomly chosen rows of VOCAB (no seed is set, so rows vary per run).
VOCAB.sample(5)

Unnamed: 0_level_0,n,max_pos,dfidf,mean_tfidf,doc_count
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
swear,270,VBP,18.774517,3.417066,169
enemies,212,NNS,18.569854,3.510657,137
stay,633,VB,18.0,4.979483,378
know,4540,VBP,18.0,7.664551,770
afraid,462,JJ,18.0,4.056997,274


In [12]:
# Show five randomly chosen rows of DOCS (no seed is set, so rows vary per run).
DOCS.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,doc_str,term_count
book_title,chap_num,Unnamed: 2_level_1,Unnamed: 3_level_1
02_a_clash_of_kings,50,abandoned afraid alive allowed ancient angry a...,854.0
01_the_fire_within,12,abandoned agreed are asked be been begged bein...,187.0
05_a_dance_with_dragons,25,accept admitted afraid allow allowed am answer...,1377.0
03_forest_of_secrets,10,added admitted afraid agreed amber angry are a...,447.0
03_a_storm_of_swords,34,afraid alive allowed am angry answered are arm...,1917.0


### Model Generation

In [None]:
class TopicExplorer:
    """Fit an LDA topic model on a set of documents and hold the result tables.

    After `generate_tables()` the instance exposes:

    * ``TERMS``  - feature names selected by the CountVectorizer.
    * ``THETA``  - document-by-topic weights, indexed like ``DOCS``.
    * ``PHI``    - topic-by-term weights (``lda_engine.components_``).
    * ``TOPICS`` - top terms per topic plus a ``label`` string and
      ``doc_weight_sum`` (column sums of ``THETA``).
    * ``LABELS`` / ``LABEL_VALUES`` - per-label mean topic weights and the
      sorted distinct values of each label (only when ``labels`` is non-empty).
    """

    # CountVectorizer settings.
    n_features = 1000
    stopwords = 'english'

    # LatentDirichletAllocation settings.
    lda_num_topics = 20
    lda_max_iter = 10
    lda_n_top_terms = 10

    # NOTE(review): declared but not passed to CountVectorizer by any method
    # in this class, so the vectorizer's own default n-gram range is in effect.
    ngram_range = (1, 3)

    def __init__(self, vocab, dtm, docs, bag, labels=None, lib=None):
        """Store the input tables.

        Parameters
        ----------
        vocab, dtm :
            Stored as ``VOCAB`` and ``DTM``; not read by the methods of this class.
        docs :
            DataFrame with a ``doc_str`` column; its index becomes the index of ``THETA``.
        bag :
            List of index level names; only its last element is printed as a progress note.
        labels :
            Optional iterable of column names in ``lib`` to aggregate topics by.
            Defaults to no labels. A fresh list is created per instance so that
            instances never share (and mutate) one default list.
        lib :
            Optional DataFrame holding the label columns, indexed by the values
            of the first index level of ``docs``. Required when ``labels`` is
            non-empty. It may also be supplied later by assigning ``self.LIB``.
        """
        self.VOCAB = vocab
        self.DTM = dtm
        self.DOCS = docs
        self.bag = bag
        self.labels = list(labels) if labels is not None else []
        # Only assign when given, so a LIB provided another way (for example as
        # a class attribute on a subclass) is not shadowed by None.
        if lib is not None:
            self.LIB = lib

    def generate_tables(self):
        """Run the full pipeline and return ``self`` so calls can be chained."""
        print("BAG:", self.bag[-1])
        print("Getting TERMS")
        self._get_count_model()
        print("Getting THETA, PHI")
        self._get_topic_model()
        print("Getting TOPICS")
        self._get_topics()
        print('Binding LIB labels to THETA')
        self._bind_labels()
        print("Done.")
        return self

    def _get_count_model(self):
        """Vectorize ``DOCS.doc_str`` into a term-count matrix and record the terms."""
        self.count_engine = CountVectorizer(max_features=self.n_features,
                                            stop_words=self.stopwords)
        self.count_model = self.count_engine.fit_transform(self.DOCS.doc_str)
        self.TERMS = self.count_engine.get_feature_names_out()

    def _get_topic_model(self):
        """Fit LDA on the count matrix and build the THETA and PHI tables."""
        self.lda_engine = LDA(n_components=self.lda_num_topics,
                              max_iter=self.lda_max_iter,
                              learning_offset=50.,
                              random_state=0)  # fixed seed for repeatable fits
        self.THETA = pd.DataFrame(self.lda_engine.fit_transform(self.count_model),
                                  index=self.DOCS.index)
        self.THETA.columns.name = 'topic_id'
        self.PHI = pd.DataFrame(self.lda_engine.components_, columns=self.TERMS)
        self.PHI.index.name = 'topic_id'
        self.PHI.columns.name = 'term_str'

    def _get_topics(self, n_terms=10):
        """Build TOPICS: the ``lda_n_top_terms`` heaviest terms of every topic.

        ``n_terms`` is accepted for backward compatibility but is not used; the
        number of terms is taken from ``self.lda_n_top_terms``.
        """
        # For each topic, sort its term weights descending and keep the leading
        # terms; the positions 0..n-1 become the columns of TOPICS.
        self.TOPICS = self.PHI.stack().to_frame('weight')\
            .groupby('topic_id')\
            .apply(lambda x: x.weight.sort_values(ascending=False)\
               .head(self.lda_n_top_terms)\
               .reset_index()\
               .drop('topic_id', axis=1)\
               .term_str)
        # Label = zero-padded topic id followed by the topic's top terms.
        self.TOPICS['label'] = self.TOPICS[list(range(self.lda_n_top_terms))]\
            .apply(lambda x: str(x.name)\
                   .zfill(len(str(self.lda_num_topics))) + ' ' + ' '.join(x), axis=1)
        self.TOPICS['doc_weight_sum'] = self.THETA.sum()
        self.topic_cols = list(range(self.lda_num_topics))

    def _bind_labels(self):
        """Compute the mean topic weights per value of every requested label.

        For each label, ``LABELS[label]`` is a topic-by-label-value table of
        mean THETA weights with an extra ``label`` column copied from TOPICS,
        and ``LABEL_VALUES[label]`` is the sorted list of distinct values of
        that column in LIB. ``THETA`` itself is left unchanged.

        Raises
        ------
        ValueError
            If labels were requested but no LIB table is available.
        """
        self.LABELS = {}
        self.LABEL_VALUES = {}
        if not self.labels:
            return

        lib = getattr(self, 'LIB', None)
        if lib is None:
            raise ValueError(
                "labels were requested but no LIB table is set; pass lib=... to "
                "TopicExplorer(...) or assign the LIB attribute before calling"
            )

        # LIB is keyed by the first level of the document index. Reading the
        # level explicitly also works when the index has a single level, where
        # indexing a row name with [0] would return its first character.
        doc_keys = self.THETA.index.get_level_values(0)
        for label in self.labels:
            # One label value per document, aligned to THETA's rows and named
            # after the label so the grouped axis carries that name.
            per_doc = pd.Series(lib.loc[doc_keys, label].to_numpy(),
                                index=self.THETA.index, name=label)
            table = self.THETA[self.topic_cols].groupby(per_doc).mean().T
            table.index.name = 'topic_id'
            table['label'] = self.TOPICS['label']
            self.LABELS[label] = table
            self.LABEL_VALUES[label] = sorted(set(lib[label]))

    def show_dominant_label_topic(self, label):
        """Return, for each value of ``label``, the label string of its heaviest topic."""
        X = self.LABELS[label][self.LABEL_VALUES[label]].idxmax()
        return X.to_frame('topic_id').topic_id.map(self.TOPICS['label'])

    def show_label_values(self):
        """Print every bound label together with its list of distinct values."""
        for label in self.LABEL_VALUES:
            print(label, ": ", self.LABEL_VALUES[label])

    def show_topic_bar(self):
        """Draw a horizontal bar chart of ``doc_weight_sum`` per topic label."""
        fig_height = self.lda_num_topics / 3  # scale the figure with the topic count
        self.TOPICS.sort_values('doc_weight_sum', ascending=True)\
            .plot.barh(y='doc_weight_sum', x='label', figsize=(5, fig_height));

    def show_topic_label_heatmap(self, label):
        """Return the label-value columns of ``LABELS[label]`` styled with a background gradient."""
        # Read from this instance; the previous code referenced a global `MP`.
        return self.LABELS[label][self.LABEL_VALUES[label]].style.background_gradient()

    def show_label_comparison_plot(self, label, label_value_x, label_value_y):
        """Show a text scatter of topics positioned by two values of ``label``."""
        px.scatter(self.LABELS[label].reset_index(), label_value_x, label_value_y,
                   hover_name='label', text='topic_id', width=800, height=600)\
            .update_traces(mode='text').show()