### Make a Dynamic Topic Model

In [1]:
import boto3
from sklearn import decomposition
import numpy as np
import tarfile, io, joblib
import pandas as pd

from Greene_dnmf import TopicCollection


### Establish Boto3 client, get model paths, and model ks.

In [2]:
# S3 client and paginator used to enumerate the stored unigram NMF model archives.
connection = boto3.client('s3')
paginator = connection.get_paginator('list_objects_v2')

# map chamber to model_path
chamber_map = {}

# Chamber ids are handed out sequentially, starting at 97, in the order S3 lists
# the archives.
# NOTE(review): presumably 97 is the first chamber covered and the keys list in
# chronological order -- confirm against the bucket contents.
c = 97
for page in paginator.paginate(Bucket='ascsagemaker', Prefix="JMP_congressional_nmf/unigram_models"):
    # A listing page has no 'Contents' key when nothing matches the prefix;
    # fall back to an empty list instead of raising KeyError.
    for ob in page.get('Contents', []):
        if ob['Key'].endswith('.tar.gz'):
            chamber_map[c] = ob['Key']
            c += 1


### Make DTM for DNMF from all window topics

In [3]:
def get_k_from_model(chamber,k):
    """
    Find and load the model with the given k for one chamber.

    The archive registered for ``chamber`` in ``chamber_map`` is downloaded
    from S3 into memory, and the member named ``NMF_<label>_<k>.pkl`` is loaded
    with joblib, where ``<label>`` is the second ``_``-separated field of the
    first archive member's name.

    Parameters
    ----------
    chamber : key of ``chamber_map`` selecting which archive to download.
    k : value used to build the member name ``NMF_<label>_<k>.pkl``.

    Returns
    -------
    The loaded model object, with ``model['chamber']`` set to the label parsed
    from the archive (a string), not to the ``chamber`` argument.

    Raises
    ------
    KeyError
        If ``chamber`` is not in ``chamber_map`` or the archive has no member
        for the requested ``k``.
    IndexError
        If the archive contains no members.
    """

    model_path = chamber_map[chamber]
    payload = connection.get_object(Bucket='ascsagemaker',Key=model_path)['Body'].read()

    # Open the in-memory archive in a context manager so the tar handle is
    # closed even when the lookup or the load fails.
    with tarfile.open(fileobj=io.BytesIO(payload)) as tar:
        members = tar.getmembers()

        # Use a separate name for the parsed label rather than overwriting the
        # ``chamber`` argument.
        chamber_label = members[0].name.split('_')[1]
        member_name = f"NMF_{chamber_label}_{k}.pkl"

        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code -- only point this at archives from a trusted bucket.
        model = joblib.load(tar.extractfile(member=tar.getmember(member_name)))

    model['chamber'] = chamber_label
    return model

In [5]:
# Build the DNMF input matrix by pooling the topics of every chamber's model.
wk = 75  # k passed to get_k_from_model for every chamber

models = []
collection = TopicCollection()
for chamber in chamber_map:
    model = get_k_from_model(chamber, wk)

    # One name per row of H, formatted "<chamber>_<row index>".
    n_window_topics = model['H'].shape[0]
    window_topic_names = [f'{chamber}_{i}' for i in range(n_window_topics)]

    collection.add_topic_model(model['H'], model['terms'], window_topic_names)
    models.append(model)
    print(f'loaded chamber {chamber} with {wk} topics')

# Combined matrix and term list produced by the collection, plus its topic ids.
M, all_terms = collection.create_matrix()
topic_ids = collection.topic_ids

### Run DNMF

In [184]:
def rank_terms(H,terms,top_n=10):
    """
    Rank the highest-weighted terms of every row (topic) of H.

    Parameters
    ----------
    H : 2-D array indexed as ``H[topic_index, term_index]``.
    terms : sequence where ``terms[j]`` is the label for column ``j`` of ``H``.
    top_n : number of terms kept per topic. Defaults to 10, the cutoff that
        was previously hard-coded; rows with fewer columns yield fewer terms.

    Returns
    -------
    list of lists, one per row of ``H``, each holding up to ``top_n`` terms
    ordered from largest to smallest weight.
    """
    term_rankings = []
    for topic_index in range(H.shape[0]):
        # argsort is ascending, so reverse it to get the heaviest terms first.
        top_indices = np.argsort(H[topic_index,:])[::-1]
        term_rankings.append([terms[i] for i in top_indices[:top_n]])
    return term_rankings

In [185]:
def run_dnmf(M,k,terms):
    """
    Fit an NMF with ``k`` components to ``M`` and bundle the outputs.

    Returns a dict with these entries:
      "W"         -- result of ``fit_transform(M)``
      'H'         -- the fitted estimator's ``components_``
      "model"     -- the fitted ``decomposition.NMF`` estimator
      'term_rank' -- ``rank_terms`` applied to ``components_`` and ``terms``
      'partition' -- for each row of W, the index of its largest entry (list)
    """
    estimator = decomposition.NMF(n_components=k,init='nndsvd',max_iter=200,random_state=1234)

    # Fit first so that components_ exists before it is read.
    W = estimator.fit_transform(M)
    H = estimator.components_

    results = {
        "W": W,
        'H': H,
        "model": estimator,
        'term_rank': rank_terms(H, terms),
    }
    results['partition'] = np.argmax(W, axis=1).tolist()

    return results

In [186]:
# Fit one DNMF per candidate number of dynamic topics (20, 22, ..., 50),
# keyed by that number.
k_range = range(20, 51, 2)
dynamic_models = {}
for k in k_range:
    dynamic_models[k] = run_dnmf(M, k, all_terms)
    print(f"ran DNMF with {k} dynamic topics")

ran DNMF with 20 dynamic topics
ran DNMF with 25 dynamic topics
