In [21]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import umap
import umap
import hdbscan
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import LinearColorMapper, HoverTool, ColumnDataSource
from bokeh.palettes import GnBu as palette
from bokeh.transform import linear_cmap

In [50]:
# Load the recalculated cosponsorship table and the bill table.
# Forward slashes are used so these relative paths resolve on every OS;
# a backslash separator is only understood as a directory separator on Windows.
data = pd.read_csv('data/cosponsor_recalculated.csv')
data2 = pd.read_csv('data/old_data/all_bills.csv')

# Left join on the bill number: every row of the cosponsorship table is kept,
# whether or not a matching row exists in data2.
data = pd.merge(data, data2, on='number', how='left')

# Keep only the rows whose cosponsor_party is 'D'.
data = data[data['cosponsor_party'] == 'D']

# One plain string per remaining row, in row order. str() also turns a missing
# summary (NaN) into the literal text 'nan'.
# The loop variable is named `summary` so it does not shadow the builtin sum().
summaries = [str(summary) for summary in data.summary.to_list()]

In [51]:
# Bag-of-words vectorizer with English stop words, handed to BERTopic as its
# vectorizer_model.
vectorizer = CountVectorizer(stop_words='english')
model = BERTopic(verbose=True, vectorizer_model=vectorizer)

# Fit the topic model on the bill summaries. As the last expression of the
# cell, the returned model's repr is what the notebook displays.
# NOTE(review): no random seed is set anywhere in this cell, so topics are
# presumably not reproducible between runs -- confirm and seed if needed.
model.fit(summaries)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

2022-12-01 01:59:02,403 - BERTopic - Transformed documents to Embeddings
2022-12-01 01:59:05,777 - BERTopic - Reduced dimensionality
2022-12-01 01:59:05,840 - BERTopic - Clustered reduced embeddings


<bertopic._bertopic.BERTopic at 0x19a900f92b0>

In [7]:
# Show BERTopic's built-in bar chart for the fitted model.
model.visualize_barchart()

In [52]:
# Embeddings for the same summaries the model was fitted on; consumed later by
# visualize_embeddings.
# NOTE(review): _extract_embeddings is a private BERTopic method (leading
# underscore), so it may change or disappear between releases -- confirm it
# exists in the pinned version.
embeds = model._extract_embeddings(summaries)

In [14]:
# Inspect the column names available in `data` after the merge and filter.
data.columns.values

array(['Unnamed: 0_x', 'number', 'cosponsor_party', 'cosponsor_name',
       'perc', 'bill.number', 'subjects', 'summary', 'policy_area',
       'latest_action', 'cosponsor_D_perc', 'cosponsor_R_perc', 'date',
       'session', 'Unnamed: 0.1', 'index', 'Unnamed: 0_y', 'congress',
       'latestAction', 'originChamber', 'originChamberCode', 'title',
       'type', 'updateDate', 'updateDateIncludingText', 'url'],
      dtype=object)

In [58]:
from pickle import dump

# Persist the fitted model object to disk with pickle.
# A forward slash keeps the relative path portable; with a backslash the file
# would only land inside the data directory on Windows.
# NOTE: unpickling executes arbitrary code, so only load this file from a
# trusted source.
with open('data/model.obj', 'wb') as fp:
    dump(model, fp)

In [42]:
from bokeh.palettes import Viridis256 as palette

In [53]:
def visualize_embeddings(german_embeds, german_model, metadata=None, random_state=None):
    """Project document embeddings to 2-D with UMAP and build a Bokeh scatter plot.

    Each point is one document, coloured by its ``cosponsor_D_perc`` value
    using the module-level ``palette``; the hover tooltip shows the title,
    session and topic label.

    Parameters
    ----------
    german_embeds : array-like
        One embedding row per document. (The ``german_`` prefix is kept only
        so existing callers keep working.)
    german_model : fitted BERTopic model
        Read for ``hdbscan_model.labels_``, ``topic_mapper_.get_mappings()``
        and ``topic_labels_`` to derive a topic label per document.
    metadata : pandas.DataFrame, optional
        Frame aligned row-for-row with ``german_embeds``. Must contain the
        columns ``title``, ``cosponsor_D_perc``, ``policy_area`` and
        ``session``. When omitted, the notebook-level ``data`` frame is used,
        which is what the function always did before this parameter existed.
    random_state : int, optional
        Forwarded to ``umap.UMAP`` so the projection can be made repeatable.
        ``None`` (the default) leaves UMAP unseeded, as before.

    Returns
    -------
    tuple
        ``(plot_figure, datasource)``: the Bokeh figure and the
        ``ColumnDataSource`` backing its glyphs.

    Raises
    ------
    ValueError
        If ``metadata`` does not have exactly one row per embedded document.
    """
    if metadata is None:
        # Backward-compatible fallback to the notebook global.
        metadata = data

    umap_data = umap.UMAP(
        n_neighbors=15,
        n_components=2,
        min_dist=0.0,
        metric='cosine',
        random_state=random_state,
    ).fit_transform(german_embeds)

    result = pd.DataFrame(umap_data, columns=['x', 'y'])

    # Every column of the data source must describe the same documents; a
    # silent length mismatch would attach the wrong titles to the points.
    if len(metadata) != len(result):
        raise ValueError(
            f"metadata has {len(metadata)} rows but {len(result)} documents were embedded"
        )

    # Fetch the mapping once instead of once per document.
    topic_mapping = german_model.topic_mapper_.get_mappings()
    topic_ids = [topic_mapping[cluster_id] for cluster_id in german_model.hdbscan_model.labels_]
    result['labels'] = [german_model.topic_labels_[topic_id] for topic_id in topic_ids]

    datasource = ColumnDataSource(
        data=dict(
            x=result['x'].to_list(),
            y=result['y'].to_list(),
            title=metadata['title'].to_list(),
            cosponsor_D_perc=metadata['cosponsor_D_perc'].to_list(),
            policyarea=metadata['policy_area'].to_list(),
            session=metadata['session'].to_list(),
            cluster=result['labels'].to_list()
        )
    )

    # Series.min()/.max() skip NaN; the builtin min()/max() give
    # order-dependent results when a NaN is present.
    d_perc = metadata['cosponsor_D_perc']
    colormapper = linear_cmap(
        field_name="cosponsor_D_perc",
        palette=palette,
        low=d_perc.min(),
        high=d_perc.max(),
    )

    tooltips = [
        ('Title', '@title'),
        ('Session', '@session'),
        ('Cluster', '@cluster')
    ]

    plot_figure = figure(
        title="UMAP Projection of US Congress bill summaries 115-117",
        width=800,
        height=600,
        tools=('pan, wheel_zoom, reset, hover'),
        tooltips=tooltips
    )

    plot_figure.circle(
        'x', 'y', source=datasource,
        color=colormapper,
        line_color="grey", line_alpha=0.6, fill_alpha=0.6, size=4
    )

    return plot_figure, datasource

In [31]:
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [54]:
# Build the scatter plot (fig) and its backing ColumnDataSource (dats) from
# the document embeddings and the fitted topic model.
fig, dats = visualize_embeddings(embeds, model)

In [55]:
# Convert the plot's data source to a DataFrame so it can be written to disk.
test = dats.to_df()

In [57]:
# Write the projection table to CSV. A forward slash keeps the relative path
# portable; a backslash is only treated as a directory separator on Windows.
test.to_csv('data/umap_projection_data.csv')

In [56]:
# Display the figure built by visualize_embeddings.
show(fig)

In [None]:
    # This cell used to hold an indented copy of the visualize_embeddings body
    # with no `def` header. It could not run as written (unexpected indent) and
    # read german_embeds / german_model, which this cell never defined.
    # The copy was removed; call visualize_embeddings(embeds, model) instead.