# Notebook 4 - Combining all measures, Identifying Strong Candidates, Visually Exploring the Usage Patterns


We will examine **how** candidate phrases function in conversation: who introduces them, who adopts them, and when. We then add context and timelines to understand how these phrases signal evolving frames.


## 1. Setup


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
# Enable Altair's 'json' data transformer and override its output filename
# template so the generated data files are written under the 'altair/' folder.
alt.data_transformers.enable('json', filename='altair/{prefix}-{hash}.json')

# Cap how many rows / columns pandas shows when a DataFrame is displayed.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 50)
# Use seaborn's 'talk' plotting context for the figures drawn in this notebook.
sns.set_context('talk')

# Optional - Enable automatic reloading of modules when source code changes
# This eliminates the need to restart the kernel when updating external .py files.
%load_ext autoreload
%autoreload 2


In [None]:
# Read the two CSV inputs that the rest of the notebook works from.
ngrams_csv, turns_csv = 'outputs/session_ngrams_3.csv', 'outputs/session_turns.csv'
session_ngrams = pd.read_csv(ngrams_csv)
session_turns = pd.read_csv(turns_csv)

## 2. Applying Combined Filters to Identify Frame Candidates

By combining all our measures, we can refine the candidate pool and identify the phrases most likely to represent design frames.

**Filtering Criteria**:

- High MI (meaningful combination)
- Multiple speakers (team adoption)
- Positive domain specificity (more common in design)
- Statistical significance (not due to chance)
- Positive MI difference (characteristic of design session)


### Frequency vs Cohesion vs relative cohesion

The x-axis shows the mutual information (MI) score of each n-gram and the y-axis shows its frequency.
Color encodes the difference in mutual information.

In [None]:
# List the available measure columns, one name per line.
print(*session_ngrams.columns, sep="\n")


### Domain Specificity vs Relative Cohesion

#### using MI difference

In [None]:
from src.mi_visualization import ngram_examples_grid

# Sample seven colors from the reversed viridis colormap for the grid's color scale.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('viridis_r')
# NOTE(review): four of the seven sample points lie above 1.0, outside the
# colormap's [0, 1] domain, so they resolve to the same end ("over") color.
# Confirm whether an upper bound of 0.95 was intended.
trunc = [cmap(x) for x in np.linspace(0.2, 1.95, 7)]

# Combined candidate filter: every condition below must hold for a row to be kept.
df = session_ngrams[
    (session_ngrams['mi_in_session_z'] > 1.0) &
    (session_ngrams['speaker_count'] > 1) &
    (session_ngrams['frequency_in_session'] > 3) & (session_ngrams['frequency_in_session'] < 100) &
    (session_ngrams['frequency_rank_within_length'] > 200) &
    (session_ngrams['ngram_length'] > 1) &
    # The explicit comparison evaluates to False for missing values, so rows
    # without a significance value are dropped here.
    (session_ngrams['significant'] == True) &
    (session_ngrams['effect_size_log_ratio'] > 1) &
    (session_ngrams['mi_diff_z'] > -1.6)
]

# Grid of example n-grams: x = mi_diff_z, y = effect_size_log_ratio,
# color = mi_in_session_z, each axis split into linear bins.
table_df, fig = ngram_examples_grid(
    df=df,
    x_col='mi_diff_z', x_mode='continuous', x_binning='linear', x_bins=9,
    y_col='effect_size_log_ratio', y_mode='continuous', y_binning='linear', y_bins=12,
    color_col='mi_in_session_z', color_mode='continuous', color_scheme='viridis', color_range=trunc,

    examples_per_cell=8, wrap_width=30, figsize=(14, 24),
    tick_fontsize=12, label_fontsize=18, cell_text_fontsize=10,
    cell_fill='#e9edf2', cell_fill_alpha=0.7,
    show_zero_axes=True,
    x_label='Relative Cohesion (MI difference)',
    y_label='Domain Specificity (Effect size of relative frequency)'
)
plt.show()
# Number of n-grams that passed the filter (shown as the cell output).
len(df)


#### using NPMI difference

In [None]:
from src.mi_visualization import ngram_examples_grid

# NOTE(review): this cell sits under the "using NPMI difference" heading but
# filters on and plots `mi_diff_z`, exactly like the MI-difference cell.
# If session_ngrams has an NPMI-difference column, the last filter, `x_col`
# and `x_label` should presumably use it - TODO confirm the column name.

# Sample seven colors from the reversed viridis colormap for the grid's color scale.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('viridis_r')
# NOTE(review): four of the seven sample points lie above 1.0, outside the
# colormap's [0, 1] domain, so they resolve to the same end ("over") color.
# Confirm whether an upper bound of 0.95 was intended.
trunc = [cmap(x) for x in np.linspace(0.2, 1.95, 7)]

# Combined candidate filter: every condition below must hold for a row to be kept.
df = session_ngrams[
    (session_ngrams['mi_in_session_z'] > 1.0) &
    (session_ngrams['speaker_count'] > 1) &
    (session_ngrams['frequency_in_session'] > 3) & (session_ngrams['frequency_in_session'] < 100) &
    (session_ngrams['frequency_rank_within_length'] > 200) &
    (session_ngrams['ngram_length'] > 1) &
    # The explicit comparison evaluates to False for missing values, so rows
    # without a significance value are dropped here.
    (session_ngrams['significant'] == True) &
    (session_ngrams['effect_size_log_ratio'] > 1) &
    (session_ngrams['mi_diff_z'] > -1.6)
]

# Grid of example n-grams: x = mi_diff_z, y = effect_size_log_ratio,
# color = mi_in_session_z, each axis split into linear bins.
table_df, fig = ngram_examples_grid(
    df=df,
    x_col='mi_diff_z', x_mode='continuous', x_binning='linear', x_bins=9,
    y_col='effect_size_log_ratio', y_mode='continuous', y_binning='linear', y_bins=12,
    color_col='mi_in_session_z', color_mode='continuous', color_scheme='viridis', color_range=trunc,

    examples_per_cell=8, wrap_width=30, figsize=(14, 24),
    tick_fontsize=12, label_fontsize=18, cell_text_fontsize=10,
    cell_fill='#e9edf2', cell_fill_alpha=0.7,
    show_zero_axes=True,
    x_label='Relative Cohesion (MI difference)',
    y_label='Domain Specificity (Effect size of relative frequency)'
)
plt.show()
# Number of n-grams that passed the filter (shown as the cell output).
len(df)


### Relative cohesion of unique ngrams
Phrases toward the top of the grid are the more characteristic, distinctive ones.

This view makes it possible to compare phrases that are used only in the session and do not occur in the reference corpus.


In [None]:
from src.mi_visualization import ngram_examples_grid

# Sample seven colors from the reversed viridis colormap for the grid's color scale.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('viridis_r')
# NOTE(review): four of the seven sample points lie above 1.0, outside the
# colormap's [0, 1] domain, so they resolve to the same end ("over") color.
# Confirm whether an upper bound of 0.95 was intended.
trunc = [cmap(x) for x in np.linspace(0.2, 1.95, 7)]

# Same base filter as the grids above, except that only rows WITHOUT an
# effect size are kept and mi_diff_z is restricted to the band (-1.6, 0.65).
# Presumably a missing effect size marks n-grams absent from the reference
# corpus - TODO confirm against how the column is computed.
df = session_ngrams[
    (session_ngrams['mi_in_session_z'] > 1.0) &
    (session_ngrams['speaker_count'] > 1) &
    (session_ngrams['frequency_in_session'] > 3) & (session_ngrams['frequency_in_session'] < 100) &
    (session_ngrams['frequency_rank_within_length'] > 200) &
    (session_ngrams['ngram_length'] > 1) &
    (session_ngrams['effect_size_log_ratio'].isnull()) &
    (session_ngrams['mi_diff_z'] > -1.6) & (session_ngrams['mi_diff_z'] < 0.65)
]

# Grid of example n-grams: x = frequency_rank_within_length, y = mi_diff_z,
# color = mi_in_session_z, each axis split into linear bins.
table_df, fig = ngram_examples_grid(
    df=df,
    x_col='frequency_rank_within_length', x_mode='continuous', x_binning='linear', x_bins=6,
    y_col='mi_diff_z',   y_mode='continuous', y_binning='linear', y_bins=10,
    color_col='mi_in_session_z', color_mode='continuous', color_scheme='viridis', color_range=trunc,

    examples_per_cell=12, wrap_width=30, figsize=(12, 30),
    tick_fontsize=12, label_fontsize=18, cell_text_fontsize=10,
    cell_fill='#e9edf2', cell_fill_alpha=0.7,
    show_zero_axes=True
)
plt.show()
# Number of n-grams that passed the filter (shown as the cell output).
len(df)


### Exploring top candidates


- **Upper right quadrant**: Strongest candidates (design-specific AND designerly)
- **Upper left**: Designerly but not more frequent (subtle design thinking)
- **Lower right**: Frequent in design but everyday associations
- **Lower left**: Neither design-specific nor designerly


- **Problem framing**: How teams conceptualize the challenge
- **Solution concepts**: Novel ideas and approaches
- **Design values**: What matters to the team
- **Process language**: How teams talk about their work

## 3. Looking at Usage in Time & Context 
If needed, select a phrase and search for its occurrences in `session_turns` (the turns table loaded in Setup) for qualitative inspection.


### Timelines by speaker
Timeline patterns reveal:

- Who introduces a phrase (innovation)
- Who adopts it (uptake)
- Whether it persists or fades
- If meaning seems stable or shifts

- **Introduction**: First use often with explanation
- **Adoption**: Other speakers pick up the term
- **Evolution**: Meaning shifts or stabilizes over time
- **Abandonment**: Some frames disappear as teams pivot

In [None]:
# Select the phrases to show on the timeline.
# Fix: the final `mi_diff_z > 0` condition was duplicated on two consecutive
# lines with no `&` between them, which made Python try to *call* the first
# boolean Series and raise a TypeError. The condition now appears once.
df = session_ngrams[
    (session_ngrams['mi_in_session_z'] > 0.0) &
    (session_ngrams['speaker_count'] > 1) &
    (session_ngrams['frequency_in_session'] > 3) & (session_ngrams['frequency_in_session'] < 100) &
    (session_ngrams['frequency_rank_within_length'] > 200) &
    (session_ngrams['ngram_length'] > 1) &
    (session_ngrams['significant'].isna()) &  # keep only rows whose significance value is missing (NaN)
    (session_ngrams['mi_diff_z'] > 0)
]
# Keep the 100 n-grams with the highest mi_diff_z as a plain list of strings.
phrases = df.sort_values('mi_diff_z', ascending=False).head(100)['ngram'].tolist()
phrases


In [None]:
# Plot when each speaker uses selected phrases using the new timeline function (fixed)
from src.mi_visualization import plot_phrase_timeline
import altair as alt
alt.data_transformers.enable('json', data_dir='altair')

# It must include columns: 'session', 'session_index', 'speaker', 'text'
chart = plot_phrase_timeline(
    df_turns=session_turns,
    phrases=phrases,
    global_index=True,
    show_session_boundaries=False,
    altair_data_dir='altair',
    save_html='outputs/vis/timeline_candidates.html'
)
chart


## cluster

In [None]:
from src.mi_visualization import cluster_phrases_semantic

# Clustering settings, gathered in one place so they are easy to tweak.
cluster_options = dict(
    algorithm='hdbscan',       # alternatives: 'kmeans' | 'agglomerative' | 'dbscan'
    min_cluster_size=5,
    reduce_to_2d=True,
    reducer='umap',            # falls back to PCA if UMAP is missing
)
clusters_df, cluster_info = cluster_phrases_semantic(phrases=phrases, **cluster_options)

In [None]:
import altair as alt

# Drop the phrases labelled -1 before plotting.
clustered_only = clusters_df[clusters_df['cluster_id'] != -1]

# Scatter plot of the x/y columns, one circle per phrase, colored by cluster.
scatter = alt.Chart(clustered_only).mark_circle(size=100).encode(
    x='x:Q',
    y='y:Q',
    color='cluster_id:N',
    tooltip=['phrase', 'cluster_id'],
)
scatter.properties(width=800, height=600)

In [None]:
from src.mi_visualization import cluster_phrases_semantic, plot_phrase_timeline

# Step 1: cluster the candidate phrases.
clusters_df, cluster_info = cluster_phrases_semantic(phrases, algorithm='hdbscan', min_cluster_size=5)

# Step 2: choose one cluster to inspect (-1 is the noise label, so skip it).
cluster_id = 3
in_cluster = clusters_df['cluster_id'] == cluster_id
phrases_in_cluster = clusters_df.loc[in_cluster, 'phrase'].dropna().unique().tolist()

# Optional alternative: keep only the most representative phrases
# (assumes clusters_df has a 'distance_to_centroid' column - confirm first).
# phrases_in_cluster = (
#     clusters_df[in_cluster]
#     .sort_values('distance_to_centroid')
#     .head(15)['phrase'].tolist()
# )

# Step 3: draw the timeline for that cluster's phrases.
# session_turns must have the columns 'session', 'session_index', 'speaker', 'text'.
chart = plot_phrase_timeline(
    df_turns=session_turns,
    phrases=phrases_in_cluster,
    global_index=True,
    show_session_boundaries=False,
    altair_data_dir='altair',
    save_html=None
)
chart

### Cluster candidate phrases with similar meaning 
Related phrases might indicate the same underlying frame:

- "body and mind" → "mental health" → "good life" (evolving wellness frame)
- "status symbol" + "social pressure" (social dynamics frame)

In [None]:
# Group semantically related phrases 
# Track cluster usage over time 
# Identify potential frame evolution

# Extras:

## High-level insights from summing up the metrics for certain segments

## KWIC context