In [112]:
import pandas as pd
import re
import numpy as np

In [113]:
# Load the three finalized source CSVs. Paths are relative to the notebook's
# working directory; the files are read in the order the names are listed.
df_levels, df_internships, df_simplify = map(
    pd.read_csv,
    (
        "../FinalizedCSVs/companies.csv",
        "../FinalizedCSVs/internships.csv",
        "../FinalizedCSVs/simplify_companies.csv",
    ),
)

In [114]:
# Matches text that *starts* with a degree-level label. Matching is
# case-insensitive, and the trailing \b requires the label to end at a word
# boundary (so "Undergraduate" does not match).
degree_pattern = re.compile(r'^(Undergrad|Masters|PhD)\b', re.I)

In [None]:
def split_perks(cell):
    """Separate a leading degree label from the rest of a perks cell.

    The cell is stringified, stripped of surrounding whitespace and double
    quotes, and split once on the first line break (either a real newline or
    the two-character sequence backslash + "n").

    Returns a two-field ``pd.Series`` (``degree_requirement``, ``perks_clean``):

    * missing cell                      -> (NaN, NaN)
    * first line matches degree_pattern -> (first line, remainder), or
      (whole text, '') when there is no line break at all
    * anything else                     -> (NaN, cleaned text unchanged)
    """
    def _row(degree, perks):
        # Key order fixes the column order of the frame that apply() builds.
        return pd.Series({'degree_requirement': degree, 'perks_clean': perks})

    if pd.isna(cell):
        return _row(np.nan, np.nan)

    text = str(cell).strip().strip('"')
    head, *rest = re.split(r'(?:\\n|\n)', text, maxsplit=1)

    # No degree label at the start: the whole cell is perks text.
    if not degree_pattern.match(head):
        return _row(np.nan, text)

    # Degree label followed by a line break and the actual perks.
    if rest:
        return _row(head.strip(), rest[0].strip())

    # Degree label only (no line break, so head is the whole text).
    return _row(text.strip(), '')


In [116]:
def normalize_slug(slug):
    """Collapse a company slug into a lowercase alphanumeric join key.

    Parameters
    ----------
    slug : str or scalar
        Raw slug value. Missing values (``None`` / NaN / ``pd.NA``) are
        accepted. Non-string scalars (e.g. a purely numeric slug that the CSV
        parser turned into a number) are stringified instead of raising
        ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The slug lower-cased with every character outside ``[a-z0-9]``
        removed; ``""`` when the input is missing.
    """
    if pd.isna(slug):
        return ""
    # An integer-valued float (numeric column widened to float by missing
    # values) should read "8451", not "84510" after the "." is stripped.
    if isinstance(slug, float) and slug.is_integer():
        slug = int(slug)
    return re.sub(r"[^a-z0-9]", "", str(slug).lower())

In [117]:
# Add the shared join key to every source frame. Each source names its slug
# column differently, so pair each frame with the column to normalize.
for _frame, _slug_column in (
    (df_levels, 'company_slug'),
    (df_internships, 'company_slug'),
    (df_simplify, 'company_simplify_slug'),
):
    _frame['normalized_slug'] = _frame[_slug_column].apply(normalize_slug)

In [118]:
# split_perks returns one two-field Series per row, so apply() yields a
# two-column frame that is assigned to the two new columns.
df_internships[['degree_requirement', 'perks_clean']] = (
    df_internships['perks'].apply(split_perks)
)

In [119]:
# Full outer join of the two company sources on the normalized slug, so a
# company present in only one source is still kept. Columns that exist in both
# frames get the '_levels' / '_simplify' suffix.
companies = df_levels.merge(
    df_simplify,
    how='outer',
    on='normalized_slug',
    suffixes=('_levels', '_simplify'),
)

# Keep only the columns destined for the companies export, one row per slug
# (first occurrence wins).
companies_table = (
    companies.loc[:, [
        'normalized_slug',
        'company_name', 'description', 'overview',
        'website', 'twitter', 'linkedin',
        'year_founded', 'founded_year',
        'num_employees', 'company_size',
        'headquarters', 'simplify_headquarters',
        'company_stage', 'total_funding',
        'simplify_url', 'simplify_take',
        'believer_points', 'critic_points', 'what_makes_unique',
        'benefits', 'industries',
    ]]
    .drop_duplicates(subset=['normalized_slug'])
    .reset_index(drop=True)
)

# 1-based surrogate key derived from the freshly reset row position.
companies_table['company_id'] = companies_table.index + 1

In [120]:
# Attach each internship to its company via the normalized slug. A left join
# keeps internships whose company is absent from companies_table.
internships_table = pd.merge(
    df_internships,
    companies_table[['normalized_slug', 'company_id']],
    on='normalized_slug',
    how='left',
)

internships_table = internships_table[[
    'company_id', 'title', 'location', 'hourly_rate', 'monthly_pay', 'degree_requirement', 'perks_clean', 'apply_link'
]].drop_duplicates().reset_index(drop=True)

# Unmatched internships get NaN from the left join, which upcasts company_id
# to float64 and makes the CSV export write keys as "12.0". The nullable
# integer dtype keeps matched ids integral and writes missing ones as empty.
internships_table['company_id'] = internships_table['company_id'].astype('Int64')

# 1-based surrogate key derived from the freshly reset row position.
internships_table['internship_id'] = internships_table.index + 1

In [121]:
# Build the industries lookup table and the company <-> industry junction
# table from the comma-separated `industries` column.
industry_set = set()
company_industries = []

# Only two columns are needed, so zip them instead of materialising every row
# with iterrows().
for slug, raw in zip(companies['normalized_slug'], companies['industries']):
    if pd.isna(raw):
        continue
    # str() guards against a non-string cell; blank fragments (from stray or
    # trailing commas) are skipped so no empty-named industry is created.
    for ind in str(raw).split(','):
        ind = ind.strip()
        if not ind:
            continue
        industry_set.add(ind)
        company_industries.append((slug, ind))

# Sorted so industry_id assignment is reproducible between runs.
industries_table = pd.DataFrame({'name': sorted(industry_set)})
industries_table['industry_id'] = industries_table.index + 1

company_industries_table = pd.DataFrame(company_industries, columns=['normalized_slug', 'industry_name'])
company_industries_table = (
    company_industries_table
    .merge(companies_table[['normalized_slug', 'company_id']], on='normalized_slug', how='left')
    .merge(industries_table, left_on='industry_name', right_on='name', how='left')
    [['company_id', 'industry_id']]
    # `companies` is not de-duplicated by slug (only companies_table is), and a
    # cell may list the same industry twice; either would repeat a pair here.
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Build the locations lookup table and the internship <-> location junction
# table from the free-text `location` column.
location_set = set()
internship_locations = []

for internship_id, loc in zip(internships_table['internship_id'], internships_table['location']):
    # Missing values are float NaN, so the string check also filters them out.
    if not isinstance(loc, str):
        continue

    # Drop everything after the first " - " (e.g. "Chicago, IL - Summer / 2025").
    loc_clean = loc.split(" - ")[0].strip()
    parts = loc_clean.split(", ")
    if len(parts) == 2:
        city, state = parts
        country = "United States"  # two-part locations carry no country
    elif len(parts) == 3:
        city, state, country = parts
    else:
        # Any other shape cannot be mapped onto (city, state, country).
        continue
    location_set.add((city, state, country))
    internship_locations.append((internship_id, city, state, country))

# Iteration order of a set of strings changes between interpreter runs (hash
# randomisation), which made location_id assignment non-reproducible. Sorting
# pins the ids, matching how the industries table is built.
locations_table = pd.DataFrame(sorted(location_set), columns=['city', 'state', 'country'])
locations_table['location_id'] = locations_table.index + 1

internship_locations_table = pd.DataFrame(internship_locations, columns=['internship_id', 'city', 'state', 'country'])
internship_locations_table = internship_locations_table.merge(
    locations_table, on=['city', 'state', 'country'], how='left'
)[['internship_id', 'location_id']]
# Every row kept above parsed as a physical location, so none is flagged remote.
internship_locations_table['is_remote'] = False

In [123]:
# Preview the first 20 rows of the de-duplicated companies table.
# NOTE(review): the saved output below only shows rows 0-9 — it may be stale
# or truncated; re-run the cell to confirm.
companies_table.head(20)

Unnamed: 0,normalized_slug,company_name,description,overview,website,twitter,linkedin,year_founded,founded_year,num_employees,...,company_stage,total_funding,simplify_url,simplify_take,believer_points,critic_points,what_makes_unique,benefits,industries,company_id
0,1fort,1Fort,,,,,,,,,...,,,https://simplify.jobs/c/1Fort,,,,,,,1
1,1password,1Password,"A password manager, digital vault, form filler...",1Password provides a password management and s...,https://1password.com/?ref=levels.fyi&utm_sour...,https://twitter.com/1Password,https://www.linkedin.com/company/1password,2005.0,2005.0,420.0,...,Series C,$920M,https://simplify.jobs/c/1Password,What believers are saying Growing demand for s...,Growing demand for secure access solutions due...,Emerging competitors offer similar features at...,1Password integrates seamlessly with IAM syste...,👶 Maternity and parental leave top up programs...,"Enterprise Software, Cybersecurity",2
2,7shifts,7shifts,Restaurant Employee Scheduling Software. Make ...,7shifts is a workforce management platform tai...,https://www.7shifts.com/?ref=levels.fyi&utm_so...,https://twitter.com/7shifts,https://www.linkedin.com/company/7shifts,2014.0,2014.0,290.0,...,Series C,$131M,https://simplify.jobs/c/7shifts,What believers are saying 7shifts ranked 382 o...,7shifts ranked 382 on the 2024 Deloitte Techno...,19% staff reduction may impact 7shifts' servic...,7shifts offers a comprehensive platform tailor...,Health Insurance\nCompany Equity\nFlexible Wor...,"Consulting, Enterprise Software",3
3,8451degrees,84.51 Degrees,,,,,,,,,...,,,https://simplify.jobs/c/84.51-Degrees,,,,,,,4
4,acadianassetmanagement,Acadian Asset Management,,Acadian Asset Management specializes in invest...,,,,,1986.0,,...,,,https://simplify.jobs/c/Acadian-Asset-Management,What believers are saying Increased interest i...,Increased interest in ESG investing aligns wit...,Competition from firms like Two Sigma may erod...,Acadian uses sophisticated analytical models f...,Hybrid Work Options\nProfessional Development ...,"Quantitative Finance, Financial Services",5
5,acarasolutions,Acara Solutions,Acara Solutions is a leading provider of recru...,,https://www.acarasolutions.com/?ref=levels.fyi...,https://twitter.com/Acara_Solutions,https://www.linkedin.com/company/acarasolutions,1957.0,,480.0,...,,,https://simplify.jobs/c/Acara-Solutions,,,,,,,6
6,accenture,Accenture,"Accenture plc, stylised as accenture, is an Ir...",,https://www.accenture.com/?ref=levels.fyi&utm_...,https://twitter.com/Accenture,https://www.linkedin.com/company/accentureindia,1989.0,,492185.0,...,,,https://simplify.jobs/c/Accenture,,,,,,,7
7,actian,Actian,,Actian transforms large volumes of data into a...,,,,,2005.0,,...,Series E,$84.5M,https://simplify.jobs/c/Actian,What believers are saying Growing demand for h...,Growing demand for hybrid cloud solutions alig...,Decentralized data management challenges Actia...,"Actian offers a hybrid cloud data warehouse, A...",Health Insurance\nParental Leave\nPaid Vacatio...,"Data & Analytics, Enterprise Software",8
8,activecampaign,ActiveCampaign,ActiveCampaign's category-defining Customer Ex...,ActiveCampaign provides a platform designed to...,https://www.activecampaign.com/?ref=levels.fyi...,https://twitter.com/ActiveCampaign,https://www.linkedin.com/company/activecampaig...,2003.0,2003.0,960.0,...,Series C,$360M,https://simplify.jobs/c/ActiveCampaign,What believers are saying Acquisition of Hilos...,Acquisition of Hilos expands WhatsApp automati...,Emerging AI-driven platforms may erode ActiveC...,ActiveCampaign offers 800+ pre-built automatio...,Health Insurance\nUnlimited Paid Time Off\n401...,"Data & Analytics, Enterprise Software",9
9,activision,Activision,"Activision Publishing, Inc. is an American vid...",,https://www.activision.com/?ref=levels.fyi&utm...,https://twitter.com/Activision,https://www.linkedin.com/company/activision,1979.0,,9250.0,...,,,https://simplify.jobs/c/Activision,,,,,,,10


In [124]:
# Preview the first rows of df_internships, including the normalized_slug,
# degree_requirement and perks_clean columns added in earlier cells.
df_internships.head()

Unnamed: 0,company_slug,company_name,title,location,hourly_rate,monthly_pay,perks,apply_link,normalized_slug,degree_requirement,perks_clean
0,radix-trading,Radix Trading,Quantitative Technologist,"Chicago, IL - Summer / 2025",$166.67,"$28,890","Corporate housing, $25,000 sign-on bonus",https://boards.greenhouse.io/radixuniversity/j...,radixtrading,,"Corporate housing, $25,000 sign-on bonus"
1,arrowstreet-capital,Arrowstreet Capital,Quantitative Researcher,"Boston, MA - Summer / 2025",$152.88,"$26,499","Undergrad (Junior)\n$10,000 Housing, company p...",,arrowstreetcapital,Undergrad (Junior),"$10,000 Housing, company provided transportati..."
2,five-rings,Five Rings,Quantitative Researcher,"New York City, NY - Summer / 2025",$143.75,"$24,917","Corporate housing, Sign-on bonus",https://job-boards.greenhouse.io/fiveringsllc/...,fiverings,,"Corporate housing, Sign-on bonus"
3,sig,SIG,Quantitative Trader,"Bala Cynwyd, PA - Summer / 2025",$137.50,"$23,833","Undergrad (Junior)\nCorporate housing, $20,000...",,sig,Undergrad (Junior),"Corporate housing, $20,000 sign-on"
4,d-e-shaw,D.E. Shaw,,"New York City, NY - Summer / 2025",$126.92,"$22,000","Undergrad (Senior)\n$10,000 Housing, company p...",,deshaw,Undergrad (Senior),"$10,000 Housing, company provided relocation, ..."


In [None]:
# Write every output table to its CSV (no index column), in the same order as
# the tables were built. Paths are relative to the working directory.
for _frame, _csv_path in (
    (companies_table, "export_companies.csv"),
    (internships_table, "export_internships.csv"),
    (locations_table, "export_locations.csv"),
    (internship_locations_table, "export_internship_locations.csv"),
    (industries_table, "export_industries.csv"),
    (company_industries_table, "export_company_industries.csv"),
):
    _frame.to_csv(_csv_path, index=False)