In [None]:
# --- prepare the interview data ---
# Fetching and translating only needs to happen once; afterwards the notebook
# works from the preprocessed (translated & filtered) file named below.
fetch_and_translate = False
# path the preprocessing step writes to (the plotting cell reads it back)
file_processed = 'interview-mocks/mock_processed_en.csv'

if fetch_and_translate:
    # google docs data: sheet to pull and where the raw download is stored
    sheet_name = 'mock'
    file_download = 'interview-mocks/mock_raw.csv'

    # imports stay inside this branch so they are only needed when fetching
    from dotenv import load_dotenv
    # load_dotenv() runs before data_import is imported and before the
    # environment variable is read, matching the original statement order
    load_dotenv()
    import os
    from data_import import columns_from, fetch_google_sheet

    # download the raw sheet, then write the translated result to file_processed
    google_sheet_id = os.getenv('GOOGLE_SHEET_ID')
    fetch_google_sheet(google_sheet_id, sheet_name, file_download)
    columns_from(file_download, should_translate=True, file_out=file_processed)

In [None]:
# --- set up the natural language processor ---
# term used as the similarity reference (also shown in the plot's y-axis label)
compare_similarity = 'medicine'

from analysis import NLProcessor

# prepare the similarity comparison for the reference term;
# this may take a minute or two when the notebook is restarted
NLProcessor.ready_similarity(compare_similarity)

In [None]:
# --- define data to plot ---
def used_columns(name):
    """Tell whether a column is used: its name must contain 'Associations'."""
    return 'Associations' in name


def _plot_cell_values(cell_content):
    """Return the plotted values for one cell, ordered like 'value_names'."""
    relatedness = NLProcessor.similarity(cell_content)
    sentiment = NLProcessor.sentiment(cell_content)['compound']
    return [cell_content, relatedness, sentiment]


# what values to plot for each cell, and the column name given to each value
plot_values = {
    'values': _plot_cell_values,
    'value_names': ['content', 'relatedness', 'sentiment'],
}

In [None]:
# --- plot data interactively in browser ---
import pandas as pd
import pandas_bokeh
from data_import import columns_from

# Build the data frame from the definitions above: one row per cell, made of
# the column's name followed by that cell's plot values.
cols = columns_from(file_processed, use_col=used_columns)
values_for = plot_values['values']
rows = [
    [title] + values_for(cell)
    for title, cells in cols.items()
    for cell in cells
]
dataframe = pd.DataFrame(rows, columns=['title'] + plot_values['value_names'])

# Scatter plot: sentiment on x, relatedness on y, points categorised by the
# column they came from; hovering over a point shows the cell's content.
dataframe.plot_bokeh.scatter(
    title='associations',
    ylabel=f'relatedness to "{compare_similarity}"',
    x='sentiment',
    y='relatedness',
    category='title',
    hovertool_string='@{content}',
    xlim=(-1, 1),
    ylim=(0.5, 2),
    figsize=(1000, 600),
    toolbar_location=None,
    colormap=('#5B16D0', '#3594DD', '#ED419C'),
    line_color='white',
    size=20,
)