# GTM mapping
- Input: TF-IDF values of the document-term matrix

In [None]:
!pip install ugtm
import time
from ugtm import eGTM
import pandas as pd
import altair as alt

# Feature matrix handed to GTM: the raw values of the TF-IDF frame.
X = tfidf_df.values

# GTM fitting.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start_fit = time.perf_counter()
gtm_model = eGTM(model='modes').fit(X)
end_fit = time.perf_counter()

# GTM transform: project every document onto the fitted map.
start_transform = time.perf_counter()
coordinates = gtm_model.transform(X)
end_transform = time.perf_counter()

# Elapsed times in seconds.
print(f"GTM fitting : {end_fit - start_fit:.2f}초")
print(f"GTM transform : {end_transform - start_transform:.2f}초")

# Two-column frame of map coordinates, one row per document.
df = pd.DataFrame(coordinates, columns=["x1", "x2"])

# Altair visualization. The row limit is lifted so the whole frame can be
# embedded in the chart.
alt.data_transformers.disable_max_rows()
chart = alt.Chart(df).mark_point().encode(
    x='x1',
    y='x2',
    tooltip=['x1', 'x2']
).properties(
    title="Technology-Market Vacancy Map",
    width=800,
    height=600
).interactive()

chart

In [None]:
#inverse-mapping
import numpy as np
import pandas as pd

# Candidate positions: a 16 x 16 regular grid over [-1, 1] x [-1, 1].
# NOTE(review): assumes the GTM latent grid is also 16 x 16 on [-1, 1] —
# confirm against the eGTM settings used for fitting.
grid_x, grid_y = np.meshgrid(np.linspace(-1, 1, 16), np.linspace(-1, 1, 16))
grid_coords = np.c_[grid_x.ravel(), grid_y.ravel()]

occupied_coords = coordinates[:, :2]  # actual coordinates

# vacancy : grid positions that no document is mapped to.
# Coordinates are floats, so they are matched with a tolerance instead of
# exact equality; the default tolerance is far smaller than the grid spacing.
vacant_mask = np.array([
    not np.isclose(occupied_coords, coord).all(axis=1).any()
    for coord in grid_coords
])
# Boolean indexing keeps the (n_vacant, 2) shape even when nothing is vacant,
# so the CSV always has two columns and can be read back.
vacant_coords2 = grid_coords[vacant_mask]
pd.DataFrame(vacant_coords2).to_csv('vacant_coords3.csv', index=False)
vc = pd.read_csv('vacant_coords3.csv')
vc.shape


In [None]:
# Map coordinates of every document, recomputed from the fitted model.
coordinates2 = gtm_model.transform(X)  # (n_docs, 2)

# Vacant grid coordinates saved by the vacancy-detection step.
vacant_coords2 = pd.read_csv("vacant_coords3.csv").to_numpy()  # (n_vacant, 2)

# Linear coordinate -> feature mapping: pinv(coordinates2) @ X is the
# least-squares solution W of coordinates2 @ W ~= X.
inv_coords = np.linalg.pinv(coordinates2)  # (2, n_docs)
inner_embedding = np.matmul(inv_coords, X)  # (2, dim)

# Pseudo-inverse of the transposed mapping; dim is the number of columns of X.
# NOTE(review): the vacant coordinates are multiplied by pinv(W.T) rather than
# by W itself — confirm this is the intended inverse mapping.
inv_inner = np.linalg.pinv(inner_embedding.T)  # (2, dim)

# vacant coordinates -> feature space (inverse mapping)
inv_vacant_embeddings2 = np.matmul(vacant_coords2, inv_inner)  # (n_vacant, dim)

# One row per vacancy, one column per TF-IDF term.
inv_vacant_df2 = pd.DataFrame(inv_vacant_embeddings2, columns=tfidf_df.columns)

# Add a 0-based vacancy id as the first column, then persist the table.
inv_vacant_df2.insert(0, 'vacant_number', range(len(inv_vacant_df2)))
inv_vacant_df2.to_csv("inv_mapped_tfidf_from_vacant_coords3.csv", index=False)
inv_vacant_df2.shape

# Softmax function
- Transform each row into a probability distribution

In [None]:
from scipy.special import softmax

# Keyword columns only (everything except the vacancy id).
# Index.difference returns the remaining labels in sorted order.
keyword_cols = inv_vacant_df2.columns.difference(['vacant_number'])

# Row-wise softmax in a single vectorized call (axis=1 normalizes each row),
# instead of invoking softmax once per row through DataFrame.apply.
softmax_values = pd.DataFrame(
    softmax(inv_vacant_df2[keyword_cols].to_numpy(), axis=1),
    index=inv_vacant_df2.index,
    columns=keyword_cols,  # keep column names
)

# Same layout as the input table, with probabilities in the keyword columns.
inv_vacant_prob_df = inv_vacant_df2.copy()
inv_vacant_prob_df[keyword_cols] = softmax_values

inv_vacant_prob_df.to_csv("softmax_vacant_coords2_prob-4.csv", index=False)

In [None]:
# Reload the probability table, switch the vacancy ids from 0-based to
# 1-based, and use them as the row index.
# NOTE: from here on `softmax` names this DataFrame, not the scipy function.
softmax = (
    pd.read_csv("softmax_vacant_coords2_prob-4.csv")
    .assign(vacant_number=lambda frame: frame['vacant_number'] + 1)
    .set_index('vacant_number')
)

softmax.head()

In [None]:
# Sanity check: the probabilities of every vacancy should add up to 1.
row_sums = softmax.sum(axis='columns')
row_sum_summary = row_sums.describe()

print(row_sum_summary)

# Output recorded from a previous run:
#count    5.400000e+01
#mean     1.000000e+00
#std      1.035884e-15
#min      1.000000e+00
#25%      1.000000e+00
#50%      1.000000e+00
#75%      1.000000e+00
#max      1.000000e+00

# Set column index (tech, market, overlapping)

In [None]:
# Origin label for each (found in patent keywords, found in startup keywords)
# combination.
origin_by_membership = {
    (True, True): 'overlapping',
    (True, False): 'tech',
    (False, True): 'market',
    (False, False): 'etc',
}

# keyword -> origin label
keyword_origin_map = {
    kw: origin_by_membership[(kw in patent_top_keywords, kw in startup_top_keywords)]
    for kw in combined_top_keywords
}

# Column levels: origin label on top, keyword below. Keywords missing from the
# map fall back to 'etc'.
column_keywords = softmax.columns
column_sources = [keyword_origin_map.get(name, 'etc') for name in column_keywords]

# Replace the flat columns with an (origin, keyword) MultiIndex.
multi_index = pd.MultiIndex.from_tuples(list(zip(column_sources, column_keywords)))
softmax.columns = multi_index

In [None]:
# Order the columns by origin group (level 0 of the column MultiIndex).
softmax = softmax.sort_index(axis='columns', level=0)

# Number of keyword columns in each origin group.
column_labels = softmax.columns.to_series()
keyword_count = column_labels.groupby(level=0).count()

print(keyword_count)

# Output recorded from a previous run:
#market          77
#overlapping     47
#tech           331

# Sensitivity analysis of keyword probability
- Keywords are selected using the 80th-percentile value of each vacancy

In [None]:
def describe_with_quantiles(x):
    """Return ``x.describe()`` extended with the 80/85/90/95% quantiles.

    Args:
        x: A pandas Series of numeric values.

    Returns:
        The Series produced by ``x.describe()`` with four extra entries
        labelled '80%', '85%', '90%' and '95%'.
    """
    extra_quantiles = (('80%', 0.80), ('85%', 0.85), ('90%', 0.90), ('95%', 0.95))

    summary = x.describe()
    for label, fraction in extra_quantiles:
        summary[label] = x.quantile(fraction)
    return summary

# Per-vacancy summary statistics (one row per vacancy, one column per statistic).
row_stats = softmax.apply(describe_with_quantiles, axis='columns')
display(row_stats)

In [None]:
def count_keywords_by_quantiles(softmax, row_stats, quantile_cols=('75%', '80%', '85%', '90%', '95%')):
    """Count, per vacancy, the keywords at or above each quantile threshold.

    Args:
        softmax: DataFrame of keyword probabilities, one row per vacancy.
        row_stats: DataFrame indexed like ``softmax`` whose columns include
            every label in ``quantile_cols``.
        quantile_cols: Labels of the threshold columns to read from ``row_stats``.

    Returns:
        DataFrame indexed by 'vacant_number' with one ``count_ge_<q>`` column
        per label, where ``<q>`` is the label without its '%' sign.
    """
    records = [
        {
            'vacant_number': vacancy,
            **{
                # count_ge_75, count_ge_80 ...
                f"count_ge_{label.replace('%', '')}": (
                    softmax.loc[vacancy] >= row_stats.loc[vacancy, label]
                ).sum()
                for label in quantile_cols
            },
        }
        for vacancy in softmax.index
    ]

    return pd.DataFrame(records).set_index('vacant_number')

# Keyword counts per vacancy at the default thresholds
# ('75%', '80%', '85%', '90%', '95%').
count_table = count_keywords_by_quantiles(softmax, row_stats)
display(count_table)

# Output recorded from a previous run (first row):
#                 count_ge_75  count_ge_80  count_ge_85  count_ge_90  count_ge_95
# vacant_number 1         114           91           69           46           23

In [None]:
import pandas as pd
import numpy as np

def build_domain_keyword_value_table(softmax, row_stats, quantile_col='80'):
    """List, per vacancy and domain, the keywords at or above a quantile threshold.

    Args:
        softmax: DataFrame of keyword probabilities whose columns are a
            two-level MultiIndex of (domain, keyword).
        row_stats: DataFrame indexed by vacancy with one threshold column per
            quantile label (for example '80%').
        quantile_col: Label of the threshold column in ``row_stats``. A bare
            number such as '80' is resolved to '80%' when only the
            percent-suffixed label exists.

    Returns:
        DataFrame with a 'vacancy' column and one column per domain. Each
        domain cell is a comma-separated string of ``keyword(value)`` entries,
        or an empty string when no keyword of that domain was selected.

    Raises:
        KeyError: If neither ``quantile_col`` nor its '%'-suffixed form is a
            column of ``row_stats``.
    """
    # The statistics columns are labelled like '80%', so the historical
    # default '80' would otherwise always fail with a KeyError.
    if quantile_col not in row_stats.columns and f"{quantile_col}%" in row_stats.columns:
        quantile_col = f"{quantile_col}%"

    rows = []

    for vac in row_stats.index:
        threshold = row_stats.loc[vac, quantile_col]

        row = softmax.loc[vac]
        selected = row[row >= threshold]

        # domain -> list of "keyword(value)" entries, in column order
        domain_dict = {}
        for (domain, keyword), value in selected.items():
            domain_dict.setdefault(domain, []).append(f"{keyword}({value:.6f})")

        # Every domain gets a cell, even when none of its keywords qualified.
        for domain in softmax.columns.levels[0]:
            domain_dict[domain] = ", ".join(domain_dict.get(domain, []))

        row_data = {'vacancy': vac}
        row_data.update(domain_dict)

        rows.append(row_data)

    return pd.DataFrame(rows)

# Show, per vacancy and domain, the keywords at or above the vacancy's '80%' value.
build_domain_keyword_value_table(
    softmax,
    row_stats,
    quantile_col='80%',
)

# Standardization and visualization
- tech, market, overlapping
- In this study the market values were sparse, so we used the tech and overlapping indices.

In [None]:
# Vacancy ids come from the probability table itself rather than a hard-coded
# range(1, 55), so this cell keeps working when the number of vacancies changes.
vacancy_ids = list(softmax.index)

overlapping_scores = []
tech_scores = []
market_scores = []

# domain label -> list collecting that domain's score for every vacancy
score_lists = {
    'overlapping': overlapping_scores,
    'tech': tech_scores,
    'market': market_scores,
}

for idx in vacancy_ids:
    row = softmax.loc[idx]

    # Keep only the keywords at or above the vacancy's 80% quantile.
    threshold = row_stats.loc[idx, '80%']
    filtered = row[row >= threshold]

    # Sum the surviving probabilities per domain; a domain with no surviving
    # keyword scores 0 (xs would raise a KeyError for a missing label).
    present_domains = filtered.index.get_level_values(0)
    for domain, scores in score_lists.items():
        scores.append(
            filtered.xs(domain, level=0).sum() if domain in present_domains else 0
        )

# -------------------------------------------------------
# 1) raw DataFrame
# -------------------------------------------------------
custom_index = [f'V{i}' for i in vacancy_ids]

raw_scores = pd.DataFrame(score_lists, index=custom_index)

# -------------------------------------------------------
# 2) standardization (z-score, population std)
# -------------------------------------------------------
# A column whose values are all identical has std 0 and becomes NaN.
standardized_scores = (raw_scores - raw_scores.mean()) / raw_scores.std(ddof=0)

# -------------------------------------------------------
# 3) standardized DataFrame
# -------------------------------------------------------
standardized_scores.columns = ['overlap_z', 'tech_z', 'market_z']

print("[raw sum]:\n", raw_scores)
print("\n[standardized score]:\n", standardized_scores)


In [None]:
!pip install adjustText
import matplotlib.pyplot as plt
from adjustText import adjust_text

# Horizontal axis: overlap z-score. Vertical axis: tech z-score.
x = standardized_scores['overlap_z']
y = standardized_scores['tech_z']

plt.figure(figsize=(10, 8))
plt.scatter(x, y, color='skyblue', edgecolor='k')

# One text label per vacancy, anchored on its point.
texts = [
    plt.text(x_pos, y_pos, label, fontsize=9)
    for label, x_pos, y_pos in zip(standardized_scores.index, x, y)
]

# Let adjustText reposition the labels, drawing thin connector lines.
adjust_text(texts, arrowprops=dict(arrowstyle='-', color='gray', lw=0.5))

# Reference lines at z = 0 split the plane into four quadrants.
plt.axvline(x=0, color='red', linestyle='--', label='overlap_z = 0')
plt.axhline(y=0, color='blue', linestyle='--', label='tech_z = 0')

plt.title('Standardized (Z-score) Distribution: Overlap vs Tech')
plt.xlabel('Overlap Z-score')
plt.ylabel('Tech Z-score')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# First quadrant: vacancies whose overlap and tech z-scores are both positive.
in_q1 = (standardized_scores['overlap_z'] > 0) & (standardized_scores['tech_z'] > 0)
q1_indices = standardized_scores[in_q1].index
# 'V2', 'V3', 'V6', 'V7', 'V12', 'V13', 'V18'

# Relabel the probability rows with the 'V<n>' labels.
# NOTE: this changes the global `softmax` index in place; `row_stats` keeps
# its integer vacancy index.
softmax.index = standardized_scores.index

# label -> keywords at or above that vacancy's '80%' threshold
q1_keywords = {}
for idx in q1_indices:
    vac = int(idx.replace('V', ''))  # 'V12' -> 12, the key used by row_stats
    probabilities = softmax.loc[idx]
    q1_keywords[idx] = probabilities[probabilities >= row_stats.loc[vac, '80%']]

def summarize_keywords_by_domain(keyword_dict):
    """Summarize the overlapping and tech keywords of each entry as text.

    Args:
        keyword_dict: Mapping of label -> Series whose index holds
            (domain, keyword) pairs and whose values are probabilities.

    Returns:
        DataFrame with columns 'index', 'overlapping' and 'tech'. The two
        domain columns hold space-separated ``keyword (prob)`` entries sorted
        by descending probability; other domains are ignored.
    """
    def _ranked_text(series, wanted_domain):
        # Keywords of one domain, highest probability first.
        pairs = [
            (word, prob)
            for (domain, word), prob in series.items()
            if domain == wanted_domain
        ]
        pairs.sort(key=lambda pair: -pair[1])
        return ' '.join(f"{word} ({prob:.4f})" for word, prob in pairs)

    records = [
        {
            'index': label,
            'overlapping': _ranked_text(series, 'overlapping'),
            'tech': _ranked_text(series, 'tech'),
        }
        for label, series in keyword_dict.items()
    ]

    return pd.DataFrame(records)

# Text summary of the Q1 keywords, one row per vacancy label.
df_q1_clean = summarize_keywords_by_domain(q1_keywords)
df_q1_clean_idx = df_q1_clean.set_index('index')

# Write the index as well: the vacancy labels live in the index after
# set_index, so index=False dropped them and left the CSV rows unidentifiable.
df_q1_clean_idx.to_csv('q1q4_keywords_idx.csv')
display(df_q1_clean)