In [None]:
# Path of the preprocessed interview CSV: written by the download/translate cell
# (as `file_out`) and read back by the analysis cell via `columns_from`.
file_processed = 'interview-data/results220105_processed.csv'

In [None]:
# --- one-off step: download, filter, translate & save the interview data ---
# DeepL caps the number of characters that can be translated per month,
# so do not re-run this cell needlessly.
# Skip it entirely when working with mock data and no .env has been set up.

# options
sheet_name = 'mock'
file_download = 'interview-mocks/mock_raw.csv'

import os
from dotenv import load_dotenv
load_dotenv()  # load .env before data_import is imported and before the sheet id is read
from data_import import columns_from, fetch_google_sheet

# Download the chosen worksheet to `file_download` ...
sheet_id = os.getenv('GOOGLE_SHEET_ID')
fetch_google_sheet(sheet_id, sheet_name, file_download)
# ... then translate it and write the result to `file_processed`.
# Binding the return value to `_` keeps it out of the cell output.
_ = columns_from(file_download, should_translate=True, file_out=file_processed)

In [None]:
import re  # not used in this cell; the later cell that cleans the answers calls re.sub
# --- run to set up the natural language processor ---
from analysis import NLProcessor
NLProcessor.ready() # this may take a minute or two when the notebook is restarted
# compute similarity with "medicine" (the same term appears in the plots' y-axis label)
NLProcessor.set_similarity_data('medicine')

In [None]:
import re  # imported here as well so this cell does not rely on another cell for it

from data_import import columns_from

# At the start of every line: any run of characters that are neither ASCII letters
# nor quotes (bullet markers, numbering, whitespace, ...).
# NOTE(review): this also removes leading non-ASCII letters such as umlauts — confirm
# that is acceptable for the processed data.
_LINE_PREFIX = re.compile('^[^a-zA-Z"\']*', flags=re.MULTILINE)


def use_columns(col_name):
    """Return True for column headers that contain 'Assoziationen'."""
    return 'Assoziationen' in col_name


def _clean_cell(cell):
    """Strip the non-letter prefix of each line in `cell` and join the lines with '; '."""
    return _LINE_PREFIX.sub('', cell).replace('\n', '; ')


def _score(content):
    """Bundle `content` with its sentiment and similarity scores.

    'sentiment' is the 'compound' entry of `NLProcessor.sentiment(content)`;
    'similarity' is the value returned by `NLProcessor.similarity(content)`.
    """
    return {
        'content': content,
        'sentiment': NLProcessor.sentiment(content)['compound'],
        'similarity': NLProcessor.similarity(content),
    }


# header -> list of scored answers, for every selected column of the processed file
cols = {
    header: [_score(_clean_cell(cell)) for cell in column]
    for header, column in columns_from(file_processed, use_col=use_columns).items()
}

# Dump every scored answer, grouped by column header, with a blank line between groups.
for header, column in cols.items():
    print(header)
    for cell in column:
        print(cell)
    print()

In [None]:
import pandas as pd
import pandas_bokeh

# Fields taken from every scored answer, the columns to plot, and one title per column.
plot_vals = ['content', 'sentiment', 'similarity']
plot_cols = ['1.d) Assoziationen Namen', '1.e) Assoziationen Logo', '3.a) Assoziationen Gentest']
titles = ['name associations', 'logo associations', 'genetic testing associations']


def _answers_frame(col_name):
    """Return the scored answers of `cols[col_name]` as a DataFrame with `plot_vals` columns."""
    records = [[answer[field] for field in plot_vals] for answer in cols[col_name]]
    return pd.DataFrame(records, columns=plot_vals)


def _sentiment_scatter(frame, heading):
    """Create a sentiment (x) vs. similarity (y) scatter figure of `frame` titled `heading`."""
    return frame.plot_bokeh.scatter(
        title=heading,
        x='sentiment',
        y='similarity',
        ylabel='similarity with "medicine"',
        hovertool_string='@{content}',  # hovering a point shows the answer text
        show_figure=False,  # the figures are laid out together by plot_grid below
        xlim=(-1, 1),
        ylim=(1, 3),
    )


dataframes = [_answers_frame(col_name) for col_name in plot_cols]
plots = [_sentiment_scatter(frame, heading) for frame, heading in zip(dataframes, titles)]

# Grid layout: the first two plots share the top row, the third sits alone below.
pandas_bokeh.plot_grid([[plots[0], plots[1]], [plots[2]]])

In [None]:
# Combined view: the answers of all three association columns in a single scatter
# plot, with the points grouped by the short title of their column.
plot_vals = ['content', 'sentiment', 'similarity']
col_names = ['1.d) Assoziationen Namen', '1.e) Assoziationen Logo', '3.a) Assoziationen Gentest']
titles = ['name', 'logo', 'genetic testing']

# One row per scored answer: the column's short title followed by the plotted fields.
rows = [
    [title] + [point[pv] for pv in plot_vals]
    for title, col_name in zip(titles, col_names)
    for point in cols[col_name]
]
dataframe = pd.DataFrame(rows, columns=['title'] + plot_vals)

dataframe.plot_bokeh.scatter(
    title='associations',
    ylabel='similarity with "medicine"',
    x='sentiment',
    y='similarity',
    category='title',
    hovertool_string='@{content}',  # hovering a point shows the answer text
    xlim=(-1, 1),
    ylim=(1, 3),
    figsize=(1000, 600),
    toolbar_location=None,
    colormap=('#5B16D0', '#3594DD', '#ED419C'),
    line_color='white',
    size=20,
)