In [1]:
# add default values for parameters here

In [3]:
# Parameters
# NOTE(review): presumably this cell is injected by the pipeline runner
# (papermill/Ploomber-style parameter injection) — confirm. `product` holds a
# notebook path as a string and is not referenced by any other cell shown here.
product = "notebooks/similarity.ipynb"

In [4]:
import __path__

In [5]:
from config import MAIN_EXPERIMENT


# Pull the experiment-specific locations out of MAIN_EXPERIMENT once, so the
# remaining cells can use plain module-level names.
PREPROCESSED_FILE, MAIN_MODEL, PICTURES = (
    MAIN_EXPERIMENT.preprocessed_file,  # opened and parsed as JSON below
    MAIN_EXPERIMENT.main_model,         # handed to src.utils.load below
    MAIN_EXPERIMENT.pictures,           # prefix of the exported figure path
)

In [6]:
import json


# Parse the preprocessed corpus from disk. Text-read is open()'s default mode,
# so only the encoding has to be spelled out.
with open(PREPROCESSED_FILE, encoding='utf-8') as corpus_file:
    preprocessed = json.loads(corpus_file.read())

In [7]:
from gensim.models import TfidfModel
import numpy as np
import plotly.graph_objects as go
from config import COLORS_D
from sklearn.manifold import TSNE
import pandas as pd

from src.utils import load


# `load` (from src.utils, not shown here) returns three objects that are
# unpacked as the topic model, its dictionary and the corpus.
(lda_model, dictionary, corpus) = load(MAIN_MODEL)

# Fit the TF-IDF weighting on that corpus, using the same dictionary for the
# id -> word mapping.
tfidf = TfidfModel(corpus=corpus, id2word=dictionary)

In [8]:
# Build one dense topic-probability row per policy document.
#
# Each row has exactly `lda_model.num_topics` entries and entry i always holds
# the probability of topic i. gensim floors `minimum_probability` at 1e-8, so
# even when 0.0 is requested a topic may be missing from the returned list.
# Copying the probabilities positionally would then shift the remaining topics
# into the wrong columns and yield rows of unequal length; writing each value
# at its topic id keeps the columns aligned.
num_topics = lda_model.num_topics
policies_ = []
for policy in preprocessed:
    # Flatten the nested token sequences of one policy into a single list.
    # (The former trailing `if p` filter was a no-op: an empty part yields no
    # tokens anyway.)
    whole_policy = [token for part in policy for token in part]

    # Bag-of-words -> TF-IDF weights -> per-topic probabilities.
    proba = lda_model.get_document_topics(
        tfidf[dictionary.doc2bow(whole_policy)], minimum_probability=0.0)

    probas = [0.0] * num_topics
    for topic_id, probability in proba:
        probas[topic_id] = probability
    policies_.append(probas)

In [9]:
# One row per policy, one column per LDA topic: every column of `policies_` is
# a topic probability. Rows are ordered by the probability of topic 0 and the
# index is renumbered from 0.
df = pd.DataFrame(policies_).sort_values(by=0).reset_index(drop=True)

# Project the topic distributions to 2-D. All columns are features, so the
# embedding is fitted on the full matrix; the previous `iloc[:, 1:-1]` slice
# silently discarded the first and the last topic.
tsne_model = TSNE(n_components=2, verbose=0, random_state=0, angle=.99)
tsne_lda = tsne_model.fit_transform(df.values)

# Number of topics == number of probability columns. (Counting the distinct
# values of column 0 measured unique topic-0 probabilities, not topics.)
topics_count = df.shape[1]
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.002052,0.002052,0.247616,0.179562,0.002052,0.002052,0.115226,0.002052,0.002052,0.002052,...,0.002052,0.002052,0.002052,0.002052,0.002052,0.002052,0.002052,0.002052,0.326290,0.002052
1,0.002082,0.002082,0.283696,0.201369,0.002082,0.002082,0.002082,0.002082,0.002082,0.002082,...,0.002082,0.002082,0.002082,0.002082,0.002082,0.002082,0.163037,0.002082,0.163411,0.002082
2,0.002126,0.002126,0.464599,0.002126,0.002126,0.002126,0.002126,0.002126,0.002126,0.002126,...,0.002126,0.002126,0.002126,0.002126,0.002126,0.095402,0.002126,0.002126,0.298236,0.002126
3,0.002132,0.002132,0.117606,0.136718,0.002132,0.002132,0.002132,0.002132,0.002132,0.280162,...,0.002132,0.002132,0.002132,0.002132,0.002132,0.002132,0.002132,0.002132,0.425004,0.002132
4,0.002139,0.002139,0.480970,0.002139,0.002139,0.002139,0.002139,0.002139,0.002139,0.002139,...,0.002139,0.181987,0.002139,0.002139,0.002139,0.294254,0.002139,0.002139,0.002139,0.002139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,0.233433,0.003953,0.003953,0.003953,0.003953,0.003953,0.003953,0.003953,0.003953,0.003953,...,0.003953,0.003953,0.003953,0.003953,0.003953,0.369722,0.003953,0.003953,0.003953,0.003953
587,0.247038,0.006163,0.006163,0.006163,0.006163,0.174190,0.006163,0.006163,0.006163,0.006163,...,0.006163,0.006163,0.006163,0.006163,0.006163,0.006163,0.455519,0.006163,0.006163,0.006163
588,0.396790,0.005091,0.005091,0.005091,0.005091,0.005091,0.005091,0.005091,0.005091,0.005091,...,0.005091,0.005091,0.005091,0.161047,0.005091,0.005091,0.005091,0.176711,0.005091,0.005091
589,0.397174,0.004854,0.500901,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,...,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854,0.004854


In [10]:
def _axis_style(label):
    """Return the axis styling shared by both axes, titled with ``label``."""
    return dict(
        title=dict(
            font=dict(family='Times New Roman', size=25, color='black'),
            text=label),
        zerolinecolor='lightgrey',
        gridcolor='lightgrey',
        tickformat='000',
        showgrid=True,
        tickangle=0,
        ticklen=10)


# Scatter plot of the 2-D t-SNE embedding: one marker per row of `tsne_lda`.
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=tsne_lda[:, 0],
        y=tsne_lda[:, 1],
        mode='markers'))

fig.update_layout(
    font=dict(family='Times New Roman', size=20, color='black'),
    plot_bgcolor='rgba(0, 0, 0, 0)',
    margin=dict(pad=10),
    colorway=[COLORS_D[4]],
    height=800,
    width=800,
    showlegend=False)

# The coordinates come from t-SNE, not PCA, so the axes are embedding
# dimensions rather than principal components ("PC1"/"PC2" was a mislabel).
fig.update_xaxes(**_axis_style('t-SNE 1'))
fig.update_yaxes(**_axis_style('t-SNE 2'))

fig

In [11]:
# Export the scatter figure as a static PNG under the experiment's picture
# location.
tsne_png = f'{PICTURES}/tsne_whole_docs.png'
fig.write_image(tsne_png)