# Step 1: Cluster YC startups into defined industries

1. Read in the tags from YC startups
2. Filter for tags related to domains (not just the tech)
3. Cluster into high level categories
4. Select top8 clusters by average count

In [1]:
import pandas as pd
import matplotlib.pyplot as plt


def validate_clusters(valid, clusters):
    """
    Ensures:
      1. Every item in `valid` is captured in at least one of `clusters`.
      2. No cluster contains tags that aren't in `valid`.
    """
    valid_set = set(valid)
    cluster_tags = set()

    for cluster_name, tags in clusters.items():
        cluster_tags.update(tags)

        extra_tags = set(tags) - valid_set
        if extra_tags:
            print(f"BAD: Cluster '{cluster_name}' has tags not in 'valid': {extra_tags}")
            raise ValueError

    missing_tags = valid_set - cluster_tags
    if missing_tags:
        print(f"BAD: These 'valid' tags are missing from clusters: {missing_tags}")
        raise ValueError

    if not missing_tags and not any(set(tags) - valid_set for tags in clusters.values()):
        print("All clusters validated: nothing missing or extra.")


def validate_is_subset(small_set, big_set):
    """
    Ensures that `small_set` is a subset of `big_set`.
    """
    if not small_set.issubset(big_set):
        print(f"BAD: {small_set} is not a subset of {big_set}") 
        raise ValueError
    else:
        print("Subset validated: nothing missing or extra.")
        
        

def assign_cluster(tag):
    for cluster_name, cluster_tags in clusters.items():
        if tag in cluster_tags:
            return cluster_name
    return 'other'

        
df = pd.read_json("../data/raw/yc_ai_assistant_tags.jsonl", lines=True)

print("n unique tags", len(df))
print("n total tags", df['count'].sum())

all_tags = df['tag'].unique().tolist()

# Valid means 
# (A) a domain application not just underlying tech. For example, 'no-code' or 'data-viz' are excluded. 
# (B) Consumer-facing domain application


valid = ['ai-enhanced-learning', 'automotive', 'collaboration', 'compliance','consumer-finance',  'creator-economy', 'customer-service', 'customer-success', 'customer-support', 'design', 'design-tools', 'digital-health', 'e-commerce', 'education', 'email', 'entertainment', 'finance', 'fintech',  'health-tech', 'healthcare', 'healthcare-it', 'hr-tech', 'insurance', 'legal', 'legaltech', 'market-research', 'marketing', 'marketplace',  'note-taking', 'productivity',  'real-estate', 'recruiting',  'remote-work', 'retail',  'sales', 'sms', 'social-media', 'social-network', 'telehealth', 'travel']

validate_is_subset(set(valid), set(all_tags))

clusters = {
    'hr': ['recruiting', 'hr-tech'],
    'legal': ['legal', 'legaltech', 'compliance'],
    'finance': ['fintech', 'finance', 'consumer-finance', 'insurance'],
    'commerce': ['retail', 'e-commerce', 'sales', 'marketing', 'market-research', 'real-estate','marketplace'],
    'communication': ['email', 'sms', 'social-network', 'collaboration', 'social-media'],
    'healthcare': ['telehealth', 'healthcare', 'healthcare-it', 'health-tech', 'digital-health'],
    'productivity': ['productivity', 'note-taking', 'remote-work'],
    'customer_service': ['customer-support', 'customer-success', 'customer-service'],
    'education': ['ai-enhanced-learning', 'education'],
    'arts_and_entertainment': ['creator-economy', 'entertainment', 'design-tools', 'design'],
    'transportation': [ 'automotive',  'travel']
}

validate_clusters(valid, clusters)

dfv = df[df['tag'].isin(valid)]
dfv['cluster'] = dfv['tag'].apply(assign_cluster)
top8 = dfv.groupby('cluster')['count'].sum().sort_values(ascending=False).head(8).index.tolist()

# Latex table
#############################
grouped = dfv.groupby('cluster')['tag'].apply(lambda x: ', '.join(x)).reset_index()
cluster_counts = dfv.groupby('cluster')['count'].sum().reset_index()
grouped = pd.merge(grouped, cluster_counts, on='cluster')
grouped['cluster'] = grouped['cluster'].apply(lambda x: x.replace('_', ' ').title())
grouped = grouped.sort_values(by='count', ascending=False)
display(grouped)
grouped.to_latex("../tables/clusters.tex", 
                 index=False,
                 header=False,
                 escape=False, 
                 caption="Clusters of tags from YC AI Assistant startups. Count is the total number of startups in each cluster. Tags are separated by commas.",
                 label="tab:clusters_yc")


# check one more time everything good
print("Checking at end of cell")
validate_clusters(valid, clusters)
validate_is_subset(set(valid), set(all_tags))


n unique tags 85
n total tags 431
Subset validated: nothing missing or extra.
All clusters validated: nothing missing or extra.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfv['cluster'] = dfv['tag'].apply(assign_cluster)


Unnamed: 0,cluster,tag,count
1,Commerce,"real-estate, e-commerce, retail, sales, market...",18
3,Customer Service,"customer-support, customer-success, customer-s...",13
5,Finance,"fintech, finance, consumer-finance, insurance",11
9,Productivity,"productivity, remote-work, note-taking",10
2,Communication,"email, sms, collaboration, social-network, soc...",8
8,Legal,"legal, legaltech, compliance",7
6,Healthcare,"healthcare, telehealth, healthcare-it, health-...",7
4,Education,"ai-enhanced-learning, education",5
0,Arts And Entertainment,"design-tools, entertainment, creator-economy, ...",4
7,Hr,"recruiting, hr-tech",4


Checking at end of cell
All clusters validated: nothing missing or extra.
Subset validated: nothing missing or extra.


# Step 2: Read in ONET data

SOC_Structure is from `https://www.onetcenter.org/taxonomy/2019/structure.html`

In [2]:
def clean_col(col):
    """
    Returns clean col name
    """
    col = col.replace(" ", "_")
    col = col.replace("-", "_")
    col = col.replace("*", "")
    col = col.lower()
    return col

def print_major_cats():
    """
    Prints major SOC categories and returns a df with cols like 
    
    major_group, soc_or_onet_soc_2019_title 
    """
    soc = pd.read_csv("SOC_Structure.csv")
    soc.columns = [clean_col(s) for s in soc.columns]
    print(soc.columns)
    majors = soc.dropna(subset=['major_group'])[['major_group', 'soc_or_onet_soc_2019_title']]
    majors = majors.sort_values(by=['soc_or_onet_soc_2019_title'])
    for idx, row in majors.iterrows():
        print(f"{row['major_group']}: {row['soc_or_onet_soc_2019_title']}")
    print()
    majors = majors.sort_values(by=['major_group'])
    for idx, row in majors.iterrows():
        print(f"{row['major_group']}: {row['soc_or_onet_soc_2019_title']}")
    return majors
        
majors = print_major_cats()


Index(['major_group', 'minor_group', 'broad_occupation', 'detailed_occupation',
       'detailed_onet_soc', 'soc_or_onet_soc_2019_title'],
      dtype='object')
17-0000: Architecture and Engineering Occupations
27-0000: Arts, Design, Entertainment, Sports, and Media Occupations
37-0000: Building and Grounds Cleaning and Maintenance Occupations
13-0000: Business and Financial Operations Occupations
21-0000: Community and Social Service Occupations
15-0000: Computer and Mathematical Occupations
47-0000: Construction and Extraction Occupations
25-0000: Educational Instruction and Library Occupations
45-0000: Farming, Fishing, and Forestry Occupations
35-0000: Food Preparation and Serving Related Occupations
29-0000: Healthcare Practitioners and Technical Occupations
31-0000: Healthcare Support Occupations
49-0000: Installation, Maintenance, and Repair Occupations
23-0000: Legal Occupations
19-0000: Life, Physical, and Social Science Occupations
11-0000: Management Occupations
55-0000: Mil

# Step 3: Assign the YC clusters to ONET occupation categories

In [3]:
soc_mapping = {
    'hr': ['13-0000'],  # Business and Financial Operations Occupations

    'legal': ['23-0000'],  # Legal Occupations

    'finance': ['13-0000',  # Business and Financial Operations Occupations
                '11-0000'],  # Management Occupations 

    'commerce': [
        '41-0000',  # Sales and Related Occupations
        '13-0000',  # Business and Financial Operations Occupations 
        '11-0000'  # Management Occupations 
    ],

    'communication': [
        '27-0000',  # Arts, Design, Entertainment, Sports, and Media Occupations, 
        '15-0000' # Computer and Mathematical Occupations

    ],

    'healthcare': [
        '29-0000',  # Healthcare Practitioners and Technical Occupations
        '31-0000'  # Healthcare Support Occupations
    ],

    'productivity': [
        '15-0000',  # Computer and Mathematical Occupations
        '43-0000'  # Office and Administrative Support Occupations
    ],

    'customer_service': [
        '43-0000',  # Office and Administrative Support Occupations
        '41-0000'  # Sales and Related Occupations 
    ],

    'education': ['25-0000'],  # Educational Instruction and Library Occupations

    'arts_and_entertainment': [
        '27-0000'  # Arts, Design, Entertainment, Sports, and Media Occupations
    ],

    'transportation': [
        '53-0000',  # Transportation and Material Moving Occupations
    ]
}

def create_soc_mapping(soc_mapping, soc_codes):
    """
    Create a mapping of cluster to SOC codes.
    
    Args:
    - cluster (str): The name of the cluster.
    - soc_codes (list): List of SOC codes associated with the cluster.
    
    Returns:
    - dict: A dictionary with the cluster as the key and the SOC codes as the value.
    """
    print(soc_mapping)

    data = []
    for cluster, soc_codes in soc_mapping.items():
        for soc_code in soc_codes:
            data.append({
                'cluster': cluster,
                'soc_code': soc_code
            })
    
    soc_df = pd.DataFrame(data)
    merged = pd.merge(soc_df, majors, left_on='soc_code', right_on='major_group', how='left')
    
    for cluster in soc_mapping.keys():
        print(f"{cluster}: {merged[merged['cluster'] == cluster]['soc_or_onet_soc_2019_title'].unique()}")
    return merged

# Result dictionary

soc_df = create_soc_mapping(soc_mapping, majors['soc_or_onet_soc_2019_title'].unique())

    

{'hr': ['13-0000'], 'legal': ['23-0000'], 'finance': ['13-0000', '11-0000'], 'commerce': ['41-0000', '13-0000', '11-0000'], 'communication': ['27-0000', '15-0000'], 'healthcare': ['29-0000', '31-0000'], 'productivity': ['15-0000', '43-0000'], 'customer_service': ['43-0000', '41-0000'], 'education': ['25-0000'], 'arts_and_entertainment': ['27-0000'], 'transportation': ['53-0000']}
hr: ['Business and Financial Operations Occupations']
legal: ['Legal Occupations']
finance: ['Business and Financial Operations Occupations' 'Management Occupations']
commerce: ['Sales and Related Occupations'
 'Business and Financial Operations Occupations' 'Management Occupations']
communication: ['Arts, Design, Entertainment, Sports, and Media Occupations'
 'Computer and Mathematical Occupations']
healthcare: ['Healthcare Practitioners and Technical Occupations'
 'Healthcare Support Occupations']
productivity: ['Computer and Mathematical Occupations'
 'Office and Administrative Support Occupations']
custome

## Step 4: Get the work activities or tasks for each cluster


In [4]:
def get_top_by_major(what='tasks', top_n=5, soc_codes=None):
    """
    Get top tasks or detailed work activities (DWA) by SOC major category.
    
    Args:
    - what (str): 'tasks' or 'dwa' to specify the type of data to retrieve.
    - op_n (int): Number of top items to return.
    - soc_codes (list): List of SOC major codes to filter by. If None, all codes are included.
    """
    import pandas as pd
    
    if what == 'tasks':
        task_ratings = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Task%20Ratings.txt", sep="\t")
        
        # Important filters!
        # There are different scales, we look at task importance---how important task is to job
        # There is a field called "Reccomend Suppress" that ONET website says means it's a low-quality
        # task rating. We want to exclude these.
        task_ratings = task_ratings[task_ratings['Scale ID'] == "RT"]
        task_ratings = task_ratings[task_ratings["Recommend Suppress"] == "N"]
        
        task_statements = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Task%20Statements.txt", sep="\t")
        
        work_data = pd.merge(task_ratings, task_statements, left_on='Task ID', right_on='Task ID', how='left')
        
        work_data['soc_major'] = work_data['O*NET-SOC Code_x'].apply(lambda x: x[:2] + "-0000")
        
        if soc_codes is not None:
            work_data = work_data[work_data['soc_major'].isin(soc_codes)]
        
        byname = work_data.groupby(by=['Task', 'soc_major'])['Data Value'].mean().reset_index()
        
        byname['rank'] = byname.groupby('soc_major')['Data Value'].rank(method='first', ascending=False)
        
        top_items = byname[byname['rank'] <= top_n]
        
        top_items = top_items.rename(columns={'Task': 'item_name'})
        
    elif what == 'dwa':
        work_data = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Work%20Activities.txt", sep="\t")
        
        work_data['soc_major'] = work_data['O*NET-SOC Code'].apply(lambda x: x[:2] + "-0000")
        
        if soc_codes is not None:
            work_data = work_data[work_data['soc_major'].isin(soc_codes)]
        
        byname = work_data.groupby(by=['Element Name', 'soc_major'])['Data Value'].mean().reset_index()
        
        byname['rank'] = byname.groupby('soc_major')['Data Value'].rank(method='first', ascending=False)
        
        top_items = byname[byname['rank'] <= top_n]
        
        top_items = top_items.rename(columns={'Element Name': 'item_name'})
        
    else:
        raise ValueError("Invalid value for 'what'. Choose either 'tasks' or 'dwa'.")
    
    return top_items


def pretty_format_list(items):
    """
    Pretty format a list of items for display.
    """
    for item in items:
        print(f"- {item}")
        print("---"*3)
        

N = 10

# This one is work activities
print("WORK ACTIVITIES")
print("====="*20)
top_dwa = get_top_by_major(what='dwa', top_n=N)
dwa_merge = pd.merge(soc_df, top_dwa, left_on='soc_code', right_on='soc_major', how='left')
for cluster in soc_mapping.keys():
    tdf = dwa_merge[dwa_merge['cluster'] == cluster]
    item_names = tdf['item_name'].unique()
    print("CLUSTER:", cluster)
    for i in item_names:
        print(f"- {i}")
    print("====="*20)

print("\n"*3)
print("*******"*20)

# This one is tasks
print("TASKS")
print("====="*20)
top_tasks = get_top_by_major(what='tasks', top_n=N)
tasks_merge = pd.merge(soc_df, top_tasks, left_on='soc_code', right_on='soc_major', how='left')
for cluster in soc_mapping.keys():
    tdf = tasks_merge[tasks_merge['cluster'] == cluster]
    item_names = tdf['item_name'].unique()
    print("CLUSTER:", cluster)
    for i in item_names:
        print(f"- {i}")
    print("====="*20)



WORK ACTIVITIES
CLUSTER: hr
- Analyzing Data or Information
- Communicating with People Outside the Organization
- Communicating with Supervisors, Peers, or Subordinates
- Establishing and Maintaining Interpersonal Relationships
- Evaluating Information to Determine Compliance with Standards
- Getting Information
- Making Decisions and Solving Problems
- Organizing, Planning, and Prioritizing Work
- Processing Information
- Updating and Using Relevant Knowledge
CLUSTER: legal
- Analyzing Data or Information
- Communicating with Supervisors, Peers, or Subordinates
- Establishing and Maintaining Interpersonal Relationships
- Evaluating Information to Determine Compliance with Standards
- Getting Information
- Identifying Objects, Actions, and Events
- Making Decisions and Solving Problems
- Organizing, Planning, and Prioritizing Work
- Processing Information
- Updating and Using Relevant Knowledge
CLUSTER: finance
- Analyzing Data or Information
- Communicating with People Outside the Or