# YC Analysis

1. Read in the tags from YC startups
2. Filter for tags related to domains (not just the tech)
3. Cluster into high level categories
4. Select top8 clusters by average count

In [207]:
import pandas as pd
import matplotlib.pyplot as plt


def validate_clusters(valid, clusters):
    """
    Ensures:
      1. Every item in `valid` is captured in at least one of `clusters`.
      2. No cluster contains tags that aren't in `valid`.
    """
    valid_set = set(valid)
    cluster_tags = set()

    for cluster_name, tags in clusters.items():
        cluster_tags.update(tags)

        extra_tags = set(tags) - valid_set
        if extra_tags:
            print(f"BAD: Cluster '{cluster_name}' has tags not in 'valid': {extra_tags}")
            raise ValueError

    missing_tags = valid_set - cluster_tags
    if missing_tags:
        print(f"BAD: These 'valid' tags are missing from clusters: {missing_tags}")
        raise ValueError

    if not missing_tags and not any(set(tags) - valid_set for tags in clusters.values()):
        print("All clusters validated: nothing missing or extra.")


def validate_is_subset(small_set, big_set):
    """
    Ensures that `small_set` is a subset of `big_set`.
    """
    if not small_set.issubset(big_set):
        print(f"BAD: {small_set} is not a subset of {big_set}") 
        raise ValueError
    else:
        print("Subset validated: nothing missing or extra.")
        
        

def assign_cluster(tag):
    for cluster_name, cluster_tags in clusters.items():
        if tag in cluster_tags:
            return cluster_name
    return 'other'

        
df = pd.read_json("../data/raw/yc_ai_assistant_tags.jsonl", lines=True)

print("n unique tags", len(df))
print("n total tags", df['count'].sum())

all_tags = df['tag'].unique().tolist()

# Valid means 
# (A) a domain application not just underlying tech. For example, 'no-code' or 'data-viz' are excluded. 
# (B) Consumer-facing domain application


valid = ['ai-enhanced-learning', 'automotive', 'collaboration', 'compliance','consumer-finance',  'creator-economy', 'customer-service', 'customer-success', 'customer-support', 'design', 'design-tools', 'digital-health', 'e-commerce', 'education', 'email', 'entertainment', 'finance', 'fintech',  'health-tech', 'healthcare', 'healthcare-it', 'hr-tech', 'insurance', 'legal', 'legaltech', 'market-research', 'marketing', 'marketplace',  'note-taking', 'productivity',  'real-estate', 'recruiting',  'remote-work', 'retail',  'sales', 'sms', 'social-media', 'social-network', 'telehealth', 'travel']

validate_is_subset(set(valid), set(all_tags))

clusters = {
    'hr': ['recruiting', 'hr-tech'],
    'legal': ['legal', 'legaltech', 'compliance'],
    'finance': ['fintech', 'finance', 'consumer-finance', 'insurance'],
    'commerce': ['retail', 'e-commerce', 'sales', 'marketing', 'market-research', 'real-estate','marketplace'],
    'communication': ['email', 'sms', 'social-network', 'collaboration', 'social-media'],
    'healthcare': ['telehealth', 'healthcare', 'healthcare-it', 'health-tech', 'digital-health'],
    'productivity': ['productivity', 'note-taking', 'remote-work'],
    'customer_service': ['customer-support', 'customer-success', 'customer-service'],
    'education': ['ai-enhanced-learning', 'education'],
    'arts_and_entertainment': ['creator-economy', 'entertainment', 'design-tools', 'design'],
    'transportation': [ 'automotive',  'travel']
}

validate_clusters(valid, clusters)

dfv = df[df['tag'].isin(valid)]
dfv['cluster'] = dfv['tag'].apply(assign_cluster)
top8 = dfv.groupby('cluster')['count'].sum().sort_values(ascending=False).head(8).index.tolist()

# Latex table
#############################
grouped = dfv.groupby('cluster')['tag'].apply(lambda x: ', '.join(x)).reset_index()
cluster_counts = dfv.groupby('cluster')['count'].sum().reset_index()
grouped = pd.merge(grouped, cluster_counts, on='cluster')
grouped['cluster'] = grouped['cluster'].apply(lambda x: x.replace('_', ' ').title())
grouped = grouped.sort_values(by='count', ascending=False)
display(grouped)
grouped.to_latex("../tables/clusters.tex", 
                 index=False,
                 header=False,
                 escape=False, 
                 caption="Clusters of tags from YC AI Assistant startups. Count is the total number of startups in each cluster. Tags are separated by commas.",
                 label="tab:clusters_yc")


# check one more time everything good
print("Checking at end of cell")
validate_clusters(valid, clusters)
validate_is_subset(set(valid), set(all_tags))


n unique tags 85
n total tags 431
Subset validated: nothing missing or extra.
All clusters validated: nothing missing or extra.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfv['cluster'] = dfv['tag'].apply(assign_cluster)


Unnamed: 0,cluster,tag,count
1,Commerce,"real-estate, retail, e-commerce, sales, market...",18
3,Customer Service,"customer-support, customer-success, customer-s...",13
5,Finance,"fintech, finance, consumer-finance, insurance",11
9,Productivity,"productivity, remote-work, note-taking",10
2,Communication,"email, sms, collaboration, social-network, soc...",8
8,Legal,"legal, legaltech, compliance",7
6,Healthcare,"telehealth, healthcare, healthcare-it, health-...",7
4,Education,"ai-enhanced-learning, education",5
0,Arts And Entertainment,"design-tools, creator-economy, entertainment, ...",4
7,Hr,"recruiting, hr-tech",4


Checking at end of cell
All clusters validated: nothing missing or extra.
Subset validated: nothing missing or extra.


# # Read ONET data

SOC_Structure is from `https://www.onetcenter.org/taxonomy/2019/structure.html`

In [208]:
def clean_col(col):
    col = col.replace(" ", "_")
    col = col.replace("-", "_")
    col = col.replace("*", "")
    col = col.lower()
    return col


soc = pd.read_csv("SOC_Structure.csv")
soc.columns = [clean_col(s) for s in soc.columns]
print(soc.columns)
majors = soc.dropna(subset=['major_group'])[['major_group', 'soc_or_onet_soc_2019_title']]
majors = majors.sort_values(by=['soc_or_onet_soc_2019_title'])
for idx, row in majors.iterrows():
    print(f"{row['major_group']}: {row['soc_or_onet_soc_2019_title']}")
print()
majors = majors.sort_values(by=['major_group'])
for idx, row in majors.iterrows():
    print(f"{row['major_group']}: {row['soc_or_onet_soc_2019_title']}")


Index(['major_group', 'minor_group', 'broad_occupation', 'detailed_occupation',
       'detailed_onet_soc', 'soc_or_onet_soc_2019_title'],
      dtype='object')
17-0000: Architecture and Engineering Occupations
27-0000: Arts, Design, Entertainment, Sports, and Media Occupations
37-0000: Building and Grounds Cleaning and Maintenance Occupations
13-0000: Business and Financial Operations Occupations
21-0000: Community and Social Service Occupations
15-0000: Computer and Mathematical Occupations
47-0000: Construction and Extraction Occupations
25-0000: Educational Instruction and Library Occupations
45-0000: Farming, Fishing, and Forestry Occupations
35-0000: Food Preparation and Serving Related Occupations
29-0000: Healthcare Practitioners and Technical Occupations
31-0000: Healthcare Support Occupations
49-0000: Installation, Maintenance, and Repair Occupations
23-0000: Legal Occupations
19-0000: Life, Physical, and Social Science Occupations
11-0000: Management Occupations
55-0000: Mil

## Merge with ONET data: Assign clusters to SOC categories

In [198]:
soc_mapping = {
    'customer_service': ['43-0000', '41-0000'],
    'productivity': ['43-0000', '11-0000'],
    'finance': ['13-0000', '11-0000'],
    'commerce': ['13-0000', '41-0000', '11-0000'],
    'education': ['25-0000'],
    'legal': ['23-0000'],
    'hr': ['13-0000', '11-0000'],
    'healthcare': ['29-0000', '31-0000'],
    'arts_entertainment': ['27-0000'],
    'industrial': ['17-0000', '47-0000', '53-0000', '51-0000']
}


data = []
for cluster, soc_codes in soc_mapping.items():
    for soc_code in soc_codes:
        data.append({
            'cluster': cluster,
            'soc_code': soc_code
        })
        
soc_df = pd.DataFrame(data)
merged = pd.merge(soc_df, majors, left_on='soc_code', right_on='major_group', how='left')

for cluster in soc_mapping.keys():
    print(f"{cluster}: {merged[merged['cluster'] == cluster]['soc_or_onet_soc_2019_title'].unique()}")
    

customer_service: ['Office and Administrative Support Occupations'
 'Sales and Related Occupations']
productivity: ['Office and Administrative Support Occupations' 'Management Occupations']
finance: ['Business and Financial Operations Occupations' 'Management Occupations']
commerce: ['Business and Financial Operations Occupations'
 'Sales and Related Occupations' 'Management Occupations']
education: ['Educational Instruction and Library Occupations']
legal: ['Legal Occupations']
hr: ['Business and Financial Operations Occupations' 'Management Occupations']
healthcare: ['Healthcare Practitioners and Technical Occupations'
 'Healthcare Support Occupations']
arts_entertainment: ['Arts, Design, Entertainment, Sports, and Media Occupations']
industrial: ['Architecture and Engineering Occupations'
 'Construction and Extraction Occupations'
 'Transportation and Material Moving Occupations' 'Production Occupations']


In [135]:
## Get the top5 work activities for each cluster

## Get the top5 work activities for each cluster


In [152]:
cluster_socs = soc_df['soc_code'].unique()
work_act = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Work%20Activities.txt", sep="\t")
work_act['soc_major'] = work_act['O*NET-SOC Code'].apply(lambda x: x[:2] + "-0000")
work_act = work_act[work_act['soc_major'].isin(cluster_socs)]
byname = work_act.groupby(by=['Element Name', 'soc_major'])['Data Value'].mean().reset_index()
byname['rank'] = byname.groupby('soc_major')['Data Value'].rank(method='first', ascending=False)
top5 = byname[byname['rank'] <= 5]
print(top5['Element Name'].value_counts())

soc_acts_merged = pd.merge(soc_df, top5, left_on='soc_code', right_on='soc_major', how='left')



Element Name
Getting Information                                              9
Communicating with Supervisors, Peers, or Subordinates           8
Updating and Using Relevant Knowledge                            6
Establishing and Maintaining Interpersonal Relationships         6
Making Decisions and Solving Problems                            6
Organizing, Planning, and Prioritizing Work                      6
Monitoring Processes, Materials, or Surroundings                 3
Processing Information                                           3
Identifying Objects, Actions, and Events                         3
Handling and Moving Objects                                      3
Assisting and Caring for Others                                  2
Thinking Creatively                                              2
Controlling Machines and Processes                               2
Inspecting Equipment, Structures, or Materials                   2
Evaluating Information to Determine Compliance wi

In [154]:
for cluster in soc_mapping.keys():
    print(f"{cluster}: {soc_acts_merged[soc_acts_merged['cluster'] == cluster]['Element Name'].unique()}")

customer_service: ['Communicating with Supervisors, Peers, or Subordinates'
 'Establishing and Maintaining Interpersonal Relationships'
 'Getting Information' 'Organizing, Planning, and Prioritizing Work'
 'Processing Information' 'Selling or Influencing Others']
productivity: ['Communicating with Supervisors, Peers, or Subordinates'
 'Establishing and Maintaining Interpersonal Relationships'
 'Getting Information' 'Organizing, Planning, and Prioritizing Work'
 'Processing Information' 'Making Decisions and Solving Problems']
finance: ['Communicating with Supervisors, Peers, or Subordinates'
 'Establishing and Maintaining Interpersonal Relationships'
 'Getting Information' 'Making Decisions and Solving Problems'
 'Organizing, Planning, and Prioritizing Work']
commerce: ['Communicating with Supervisors, Peers, or Subordinates'
 'Establishing and Maintaining Interpersonal Relationships'
 'Getting Information' 'Making Decisions and Solving Problems'
 'Organizing, Planning, and Prioritizin

In [149]:
task_ratings = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Task%20Ratings.txt", sep="\t")

task_statements = pd.read_csv("https://www.onetcenter.org/dl_files/database/db_29_2_text/Task%20Statements.txt", sep="\t")

work_act = pd.merge(task_ratings, task_statements, left_on='Task ID', right_on='Task ID', how='left')


work_act['soc_major'] = work_act['O*NET-SOC Code_x'].apply(lambda x: x[:2] + "-0000")
byname = work_act.groupby(by=['Task', 'soc_major'])['Data Value'].mean().reset_index()
byname['rank'] = byname.groupby('soc_major')['Data Value'].rank(method='first', ascending=False)
top5 = byname[byname['rank'] <= 5]
print(top5['Task'].value_counts())



Task
Act as an intermediary in negotiations between buyers and sellers, generally representing one or the other.                                                                                           1
Adapt instructional techniques to the age and skill levels of students.                                                                                                                               1
Adjust apertures, shutter speeds, and camera focus according to a combination of factors, such as lighting, field depth, subject motion, film type, and film speed.                                   1
Advocate for clients or patients to resolve crises.                                                                                                                                                   1
Analyze and interpret statistical data to identify significant differences in relationships among sources of information.                                                                          

In [148]:
work_act.columns

Index(['O*NET-SOC Code_x', 'Task ID', 'Scale ID', 'Category', 'Data Value',
       'N', 'Standard Error', 'Lower CI Bound', 'Upper CI Bound',
       'Recommend Suppress', 'Date_x', 'Domain Source_x', 'O*NET-SOC Code_y',
       'Task', 'Task Type', 'Incumbents Responding', 'Date_y',
       'Domain Source_y'],
      dtype='object')