# Step 1: Cluster YC startups into defined industries

1. Read in the tags from YC startups
2. Filter for tags related to domains (not just the tech)
3. Cluster into high level categories
4. Select top8 clusters by average count

In [2]:
import pandas as pd

def validate_clusters(valid, clusters):
    """
    Ensures:
      1. Every item in `valid` is captured in at least one of `clusters`.
      2. No cluster contains tags that aren't in `valid`.
    """
    valid_set = set(valid)
    cluster_tags = set()

    for cluster_name, tags in clusters.items():
        cluster_tags.update(tags)

        extra_tags = set(tags) - valid_set
        if extra_tags:
            print(f"BAD: Cluster '{cluster_name}' has tags not in 'valid': {extra_tags}")
            raise ValueError

    missing_tags = valid_set - cluster_tags
    if missing_tags:
        print(f"BAD: These 'valid' tags are missing from clusters: {missing_tags}")
        raise ValueError

    if not missing_tags and not any(set(tags) - valid_set for tags in clusters.values()):
        print("All clusters validated: nothing missing or extra.")


def validate_is_subset(small_set, big_set):
    """
    Ensures that `small_set` is a subset of `big_set`.
    """
    if not small_set.issubset(big_set):
        print(f"BAD: {small_set} is not a subset of {big_set}") 
        raise ValueError
    else:
        print("Subset validated: nothing missing or extra.")
        
        

def assign_cluster(tag):
    for cluster_name, cluster_tags in clusters.items():
        if tag in cluster_tags:
            return cluster_name
    return 'other'

        
json_file = "data/raw/yc_ai_assistant_tags.jsonl"

df = pd.read_json("data/raw/yc_ai_assistant_tags.jsonl", lines=True)


print("n unique tags", len(df))
print("n total tags", df['count'].sum())

all_tags = df['tag'].unique().tolist()

# Valid means 
# (A) a domain application not just underlying tech. For example, 'no-code' or 'data-viz' are excluded. 
# (B) Consumer-facing domain application


valid = ['ai-enhanced-learning', 'automotive', 'collaboration', 'compliance','consumer-finance',  'creator-economy', 'customer-service', 'customer-success', 'customer-support', 'design', 'design-tools', 'digital-health', 'e-commerce', 'education', 'email', 'entertainment', 'finance', 'fintech',  'health-tech', 'healthcare', 'healthcare-it', 'hr-tech', 'insurance', 'legal', 'legaltech', 'market-research', 'marketing', 'marketplace',  'note-taking', 'productivity',  'real-estate', 'recruiting',  'remote-work', 'retail',  'sales', 'sms', 'social-media', 'social-network', 'telehealth', 'travel']

print("length of valid tags", len(valid))

validate_is_subset(set(valid), set(all_tags))

clusters = {
    'hr': ['recruiting', 'hr-tech'],
    'legal': ['legal', 'legaltech', 'compliance'],
    'finance': ['fintech', 'finance', 'consumer-finance', 'insurance'],
    'commerce': ['retail', 'e-commerce', 'sales', 'marketing', 'market-research', 'real-estate','marketplace'],
    'communication': ['email', 'sms', 'social-network', 'collaboration', 'social-media'],
    'healthcare': ['telehealth', 'healthcare', 'healthcare-it', 'health-tech', 'digital-health'],
    'productivity': ['productivity', 'note-taking', 'remote-work'],
    'customer_service': ['customer-support', 'customer-success', 'customer-service'],
    'education': ['ai-enhanced-learning', 'education'],
    'arts_and_entertainment': ['creator-economy', 'entertainment', 'design-tools', 'design'],
    'transportation': [ 'automotive',  'travel']
}

validate_clusters(valid, clusters)

dfv = df[df['tag'].isin(valid)]
dfv['cluster'] = dfv['tag'].apply(assign_cluster)
top8 = dfv.groupby('cluster')['count'].sum().sort_values(ascending=False).head(8).index.tolist()

# Latex table
#############################
grouped = dfv.groupby('cluster')['tag'].apply(lambda x: ', '.join(x)).reset_index()
cluster_counts = dfv.groupby('cluster')['count'].sum().reset_index()
grouped = pd.merge(grouped, cluster_counts, on='cluster')

# filter above 5 counts
grouped = grouped.query("count >= 5")

grouped['cluster'] = grouped['cluster'].apply(lambda x: x.replace('_', ' ').title())
grouped = grouped.sort_values(by='count', ascending=False)
display(grouped)
grouped.to_latex("tables/clusters.tex", 
                 index=False,
                 escape=False, 
                 caption="We retrieved a list of April's top 100 AI Assistant startups backed by Y Combinator. Each startup had tags. We clustered tags. Then we selected the number of clusters (8) where the sum of tag counts was at least 5. Count is the total number of startups in each cluster. Tags are separated by commas.",
                 label="tab:clustersyc")


# check one more time everything good
print("Checking at end of cell")
validate_clusters(valid, clusters)
validate_is_subset(set(valid), set(all_tags))


n unique tags 85
n total tags 431
length of valid tags 40
Subset validated: nothing missing or extra.
All clusters validated: nothing missing or extra.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfv['cluster'] = dfv['tag'].apply(assign_cluster)


Unnamed: 0,cluster,tag,count
1,Commerce,"real-estate, retail, e-commerce, marketing, sa...",18
3,Customer Service,"customer-support, customer-service, customer-s...",13
5,Finance,"fintech, finance, consumer-finance, insurance",11
9,Productivity,"productivity, remote-work, note-taking",10
2,Communication,"email, sms, collaboration, social-media, socia...",8
6,Healthcare,"telehealth, healthcare, healthcare-it, health-...",7
8,Legal,"legaltech, legal, compliance",7
4,Education,"ai-enhanced-learning, education",5


Checking at end of cell
All clusters validated: nothing missing or extra.
Subset validated: nothing missing or extra.


# Step 2: Read in ONET data

SOC_Structure is from `https://www.onetcenter.org/taxonomy/2019/structure.html`

In [3]:
def clean_col(col):
    """
    Returns clean col name
    """
    col = col.replace(" ", "_")
    col = col.replace("-", "_")
    col = col.replace("*", "")
    col = col.lower()
    return col

def print_major_cats():
    """
    Prints major SOC categories and returns a df with cols like 
    
    major_group, soc_or_onet_soc_2019_title 
    """
    soc = pd.read_csv("data/raw/SOC_Structure.csv")
    soc.columns = [clean_col(s) for s in soc.columns]
    print(soc.columns)
    majors = soc.dropna(subset=['major_group'])[['major_group', 'soc_or_onet_soc_2019_title']]
    majors = majors.sort_values(by=['soc_or_onet_soc_2019_title'])
    for idx, row in majors.iterrows():
        print(f"{row['major_group']}: {row['soc_or_onet_soc_2019_title']}")
    print()
    majors = majors.sort_values(by=['major_group'])
    for idx, row in majors.iterrows():
        print(f"{row['major_group']}: {row['soc_or_onet_soc_2019_title']}")
    return majors
        
majors = print_major_cats()


Index(['major_group', 'minor_group', 'broad_occupation', 'detailed_occupation',
       'detailed_onet_soc', 'soc_or_onet_soc_2019_title'],
      dtype='object')
17-0000: Architecture and Engineering Occupations
27-0000: Arts, Design, Entertainment, Sports, and Media Occupations
37-0000: Building and Grounds Cleaning and Maintenance Occupations
13-0000: Business and Financial Operations Occupations
21-0000: Community and Social Service Occupations
15-0000: Computer and Mathematical Occupations
47-0000: Construction and Extraction Occupations
25-0000: Educational Instruction and Library Occupations
45-0000: Farming, Fishing, and Forestry Occupations
35-0000: Food Preparation and Serving Related Occupations
29-0000: Healthcare Practitioners and Technical Occupations
31-0000: Healthcare Support Occupations
49-0000: Installation, Maintenance, and Repair Occupations
23-0000: Legal Occupations
19-0000: Life, Physical, and Social Science Occupations
11-0000: Management Occupations
55-0000: Mil

# Step 3: Assign the YC clusters to ONET occupation categories

In [4]:
soc_mapping = {

    'legal': ['23-0000'],  # Legal Occupations

    'finance': ['13-0000',  # Business and Financial Operations Occupations
                '11-0000'],  # Management Occupations 

    'commerce': [
        '41-0000',  # Sales and Related Occupations
        '13-0000',  # Business and Financial Operations Occupations 
        '11-0000'  # Management Occupations 
    ],

    'communication': [
        '27-0000',  # Arts, Design, Entertainment, Sports, and Media Occupations, 
        '15-0000' # Computer and Mathematical Occupations

    ],

    'healthcare': [
        '29-0000',  # Healthcare Practitioners and Technical Occupations
        '31-0000'  # Healthcare Support Occupations
    ],

    'productivity': [
        '15-0000',  # Computer and Mathematical Occupations
        '43-0000'  # Office and Administrative Support Occupations
    ],

    'customer_service': [
        '43-0000',  # Office and Administrative Support Occupations
        '41-0000'  # Sales and Related Occupations 
    ],

    'education': ['25-0000'],  # Educational Instruction and Library Occupations

}

def create_soc_mapping(soc_mapping, soc_codes):
    """
    Create a mapping of cluster to SOC codes.
    
    Args:
    - cluster (str): The name of the cluster.
    - soc_codes (list): List of SOC codes associated with the cluster.
    
    Returns:
    - dict: A dictionary with the cluster as the key and the SOC codes as the value.
    """
    print(soc_mapping)

    data = []
    for cluster, soc_codes in soc_mapping.items():
        for soc_code in soc_codes:
            data.append({
                'cluster': cluster,
                'soc_code': soc_code
            })
    
    soc_df = pd.DataFrame(data)
    merged = pd.merge(soc_df, majors, left_on='soc_code', right_on='major_group', how='left')
    
    for cluster in soc_mapping.keys():
        print(f"{cluster}: {merged[merged['cluster'] == cluster]['soc_or_onet_soc_2019_title'].unique()}")
    return merged

# Result dictionary

soc_df = create_soc_mapping(soc_mapping, majors['soc_or_onet_soc_2019_title'].unique())


###### Latex table


# create a latex table

soc_df2 = create_soc_mapping(soc_mapping, majors['soc_or_onet_soc_2019_title'].unique())

soc_df2 = soc_df2[['cluster', 'major_group', 'soc_or_onet_soc_2019_title']]

soc_df2.columns = ['Cluster', 'Major Group Code', 'Major Group']

soc_df2.to_latex("tables/soc_mapping.tex", 
                 index=False,
                 escape=False, 
                 caption="Clusters and their associated ONET major group codes.",
                 label="tab:soc_mapping",
                 longtable=True)




    

{'legal': ['23-0000'], 'finance': ['13-0000', '11-0000'], 'commerce': ['41-0000', '13-0000', '11-0000'], 'communication': ['27-0000', '15-0000'], 'healthcare': ['29-0000', '31-0000'], 'productivity': ['15-0000', '43-0000'], 'customer_service': ['43-0000', '41-0000'], 'education': ['25-0000']}
legal: ['Legal Occupations']
finance: ['Business and Financial Operations Occupations' 'Management Occupations']
commerce: ['Sales and Related Occupations'
 'Business and Financial Operations Occupations' 'Management Occupations']
communication: ['Arts, Design, Entertainment, Sports, and Media Occupations'
 'Computer and Mathematical Occupations']
healthcare: ['Healthcare Practitioners and Technical Occupations'
 'Healthcare Support Occupations']
productivity: ['Computer and Mathematical Occupations'
 'Office and Administrative Support Occupations']
customer_service: ['Office and Administrative Support Occupations'
 'Sales and Related Occupations']
education: ['Educational Instruction and Library 

In [5]:
soc_df

Unnamed: 0,cluster,soc_code,major_group,soc_or_onet_soc_2019_title
0,legal,23-0000,23-0000,Legal Occupations
1,finance,13-0000,13-0000,Business and Financial Operations Occupations
2,finance,11-0000,11-0000,Management Occupations
3,commerce,41-0000,41-0000,Sales and Related Occupations
4,commerce,13-0000,13-0000,Business and Financial Operations Occupations
5,commerce,11-0000,11-0000,Management Occupations
6,communication,27-0000,27-0000,"Arts, Design, Entertainment, Sports, and Media..."
7,communication,15-0000,15-0000,Computer and Mathematical Occupations
8,healthcare,29-0000,29-0000,Healthcare Practitioners and Technical Occupat...
9,healthcare,31-0000,31-0000,Healthcare Support Occupations


## Step 4: Get the work activities or tasks for each cluster


In [47]:
def get_top_by_major(soc_codes=None):
    """
    Get top abilities by SOC major category.
    
    Args:
    - what (str): 'abilities', 'tasks', or 'dwa' to specify the type of data to retrieve.
    - top_n (int): Number of top items to return.
    - soc_codes (list): List of SOC major codes to filter by. If None, all codes are included.
    """



    work_data = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Work%20Activities.txt", sep="\t")
    work_data = work_data[work_data['Recommend Suppress'] != 'Y']
    print("stats")
    unique_jobs = work_data['O*NET-SOC Code'].nunique()
    unique_work_acts = work_data['Element Name'].nunique()
    print("unique jobs", unique_jobs)
    print("unique work acts", unique_work_acts)
    

    work_data['soc_major'] = work_data['O*NET-SOC Code'].apply(lambda x: x[:2] + "-0000")
    
        
    work_data['data_value_z'] = work_data.groupby(by='Scale ID')['Data Value'].transform(lambda x: (x - x.mean()) / x.std())
    
    # first aggregate at occupation level 
    work_data = work_data.groupby(by=['soc_major', 'O*NET-SOC Code', 'Element Name'])['data_value_z'].mean().reset_index()
    
    # now aggregate at major group level 
    byname = work_data.groupby(by=['Element Name', 'soc_major'])['data_value_z'].mean().reset_index()
                    
    byname = byname.rename(columns={'Element Name': 'item_name'})
        
    
    return byname



def pretty_format_list(items):
    """
    Pretty format a list of items for display.
    """
    for item in items:
        print(f"- {item}")
        print("---"*3)
        

N = 10




# This one is work activities
print("WORK ACTIVITIES")
print("====="*20)
top_dwa = get_top_by_major()
dwa_merge = pd.merge(soc_df, top_dwa, left_on='soc_code', right_on='soc_major', how='left')

# now aggregate at cluster level
dwa_merge = dwa_merge.groupby(by=['cluster', 'item_name'])['data_value_z'].mean().reset_index()



dwa_merge['rank'] = dwa_merge.groupby(by=['cluster'])['data_value_z'].rank(ascending=False)
dwa_merge = dwa_merge[['cluster', 'item_name', 'rank']]
dwa_merge_top = dwa_merge.query("rank <= @N")
dwa_merge_top.to_csv("data/raw/cluster_activities.csv")


# latex
############### 

dwa_merge_top2 = dwa_merge.query("rank <= @N")
dwa_merge_top2 = dwa_merge.sort_values(by=['cluster', 'rank'], ascending=[True, True])
dwa_merge_top2 = dwa_merge_top2.groupby(by=['cluster'])['item_name'].apply(lambda x: ",".join(x))
dwa_merge_top2.columns = ['Cluster', 'Work Activities']
dwa_merge_top2.to_latex("tables/work_acts.tex", caption="Clusters and top 10 work activities of associated ONET occuptations.", label="work_acts", longtable=True)


WORK ACTIVITIES
stats
unique jobs 879
unique work acts 41


In [41]:
import numpy as np
from sklearn.metrics import jaccard_score
import pandas as pd

# Assuming dwa_merge_top is your DataFrame
# First get the items per cluster
cluster_items = dwa_merge_top.groupby(by=['cluster'])['item_name'].apply(list).reset_index()

# Convert to sets for Jaccard calculation
cluster_sets = {}
for _, row in cluster_items.iterrows():
    cluster_sets[row['cluster']] = set(row['item_name'])

# Calculate Jaccard similarity for all pairs of clusters
clusters = list(cluster_sets.keys())
n_clusters = len(clusters)
jaccard_matrix = np.zeros((n_clusters, n_clusters))

for i in range(n_clusters):
    for j in range(n_clusters):
        set_i = cluster_sets[clusters[i]]
        set_j = cluster_sets[clusters[j]]
        
        # Calculate Jaccard similarity
        if i == j:  # Same cluster, similarity is 1.0
            jaccard_matrix[i, j] = 1.0
        else:
            intersection = len(set_i.intersection(set_j))
            union = len(set_i.union(set_j))
            jaccard_matrix[i, j] = intersection / union if union > 0 else 0.0

# Calculate average Jaccard similarity for each cluster
avg_jaccard_by_cluster = {}
for i, cluster in enumerate(clusters):
    # Exclude self-similarity (which is always 1.0)
    avg_similarity = np.sum(jaccard_matrix[i, :]) - 1.0  # Subtract self-similarity
    avg_similarity = avg_similarity / (n_clusters - 1)  # Divide by number of other clusters
    avg_jaccard_by_cluster[cluster] = avg_similarity

# Convert to DataFrame for better visualization
avg_jaccard_df = pd.DataFrame({
    'cluster': list(avg_jaccard_by_cluster.keys()),
    'avg_jaccard_similarity': list(avg_jaccard_by_cluster.values())
})

# Sort by average Jaccard similarity
avg_jaccard_df = avg_jaccard_df.sort_values('avg_jaccard_similarity', ascending=False)

In [44]:
avg_jaccard_df['avg_jaccard_similarity'].mean()

np.float64(0.6116145759002901)

In [7]:
dwa_merge

Unnamed: 0,cluster,item_name,rank
0,commerce,Analyzing Data or Information,10.0
1,commerce,Assisting and Caring for Others,32.0
2,commerce,Coaching and Developing Others,22.0
3,commerce,Communicating with People Outside the Organiza...,7.0
4,commerce,"Communicating with Supervisors, Peers, or Subo...",3.0
...,...,...,...
323,productivity,Staffing Organizational Units,35.0
324,productivity,Thinking Creatively,13.0
325,productivity,Training and Teaching Others,21.0
326,productivity,Updating and Using Relevant Knowledge,5.0
