In [108]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import openai
print(openai.__version__)
import json

1.76.0


In [None]:
# The API key is deliberately NOT written in the notebook, so it can never be
# committed to version control. When no api_key argument is passed,
# openai.OpenAI() reads the OPENAI_API_KEY environment variable itself and
# raises at construction time if that variable is not set.
# Set it before starting the kernel, e.g. `export OPENAI_API_KEY=...`.
client = openai.OpenAI()

In [109]:
def _parse_flag_json(raw):
    """Return the JSON object contained in a model reply, or {} if there is none."""
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # The reply is not bare JSON. The model sometimes wraps it in a Markdown
        # fence (```json ... ```) or adds prose around it; in both cases the
        # outermost {...} span is the payload. Slicing on the braces also drops
        # the "json" language tag that a plain split on the fence would keep.
        start, end = raw.find("{"), raw.rfind("}")
        if not (0 <= start < end):
            return {}
        try:
            parsed = json.loads(raw[start:end + 1])
        except json.JSONDecodeError:
            return {}
    # Callers sum the values of the result, so only a JSON object is usable.
    return parsed if isinstance(parsed, dict) else {}


def detect_red_flags_and_weights(text):
    """Ask the chat model for red-flag phrases in ``text`` and their severity.

    Parameters
    ----------
    text : str or missing value
        Free-form text to analyse (one cell of a text column).

    Returns
    -------
    dict
        Mapping of red-flag phrase -> severity score, as parsed from the
        model's JSON reply. An empty dict is returned when ``text`` is
        missing, not a string, or blank, and when the reply does not contain
        a parseable JSON object.

    Notes
    -----
    Uses the module-level ``client``. Exceptions raised by the API request
    itself are not caught here.
    """
    # NaN / None / non-string cells carry no text to analyse; skip the request.
    if not isinstance(text, str) or not text.strip():
        return {}

    prompt = (
        "Identify any red‐flag concerns for job candidates in the following text. "
        "For each one, give a severity score from 1 (minor) to 5 (major). "
        "Return *only* a JSON object where keys are the red‐flag phrase and values are the score.\n\n"
        f"Text: {text}"
    )

    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",  "content": "You are a concise assistant."},
            {"role": "user",    "content": prompt}
        ]
    )

    # The message content can be None; treat that like an empty reply.
    raw = (resp.choices[0].message.content or "").strip()
    return _parse_flag_json(raw)

In [111]:
# Read the exported companies table (the path is relative to the notebook's
# working directory) and preview its first five rows.
df_companies = pd.read_csv(filepath_or_buffer="../test normalization/export_companies_copy.csv")
df_companies.head(n=5)

Unnamed: 0,normalized_slug,company_name,description,overview,website,twitter,linkedin,year_founded,founded_year,num_employees,...,company_stage,total_funding,simplify_url,simplify_take,believer_points,critic_points,what_makes_unique,benefits,industries,company_id
0,1password,1Password,"A password manager, digital vault, form filler...",1Password provides a password management and s...,https://1password.com/?ref=levels.fyi&utm_sour...,https://twitter.com/1Password,https://www.linkedin.com/company/1password,2005.0,2005,420.0,...,Series C,$920M,https://simplify.jobs/c/1Password,What believers are saying Growing demand for s...,Growing demand for secure access solutions due...,Emerging competitors offer similar features at...,1Password integrates seamlessly with IAM syste...,👶 Maternity and parental leave top up programs...,"Enterprise Software, Cybersecurity",2
1,7shifts,7shifts,Restaurant Employee Scheduling Software. Make ...,7shifts is a workforce management platform tai...,https://www.7shifts.com/?ref=levels.fyi&utm_so...,https://twitter.com/7shifts,https://www.linkedin.com/company/7shifts,2014.0,2014,290.0,...,Series C,$131M,https://simplify.jobs/c/7shifts,What believers are saying 7shifts ranked 382 o...,7shifts ranked 382 on the 2024 Deloitte Techno...,19% staff reduction may impact 7shifts' servic...,7shifts offers a comprehensive platform tailor...,Health Insurance\nCompany Equity\nFlexible Wor...,"Consulting, Enterprise Software",3
2,acadianassetmanagement,Acadian Asset Management,,Acadian Asset Management specializes in invest...,,,,,1986,,...,,,https://simplify.jobs/c/Acadian-Asset-Management,What believers are saying Increased interest i...,Increased interest in ESG investing aligns wit...,Competition from firms like Two Sigma may erod...,Acadian uses sophisticated analytical models f...,Hybrid Work Options\nProfessional Development ...,"Quantitative Finance, Financial Services",5
3,appian,Appian,The company sells a Platform as a Service for ...,Company Historically Provides H1B Sponsorship,https://www.appian.com/?ref=levels.fyi&utm_sou...,https://twitter.com/Appian,https://www.linkedin.com/company/appian-corpor...,1999.0,1999,2000.0,...,IPO,$123M,https://simplify.jobs/c/Appian,What believers are saying Growing demand for l...,Growing demand for low-code platforms boosts A...,Unfavorable legal rulings could harm Appian's ...,Appian's low-code platform enables rapid appli...,Private Health Insurance\nComprehensive Global...,"Consulting, Enterprise Software",57
4,apple,Apple,Apple Inc. is an American multinational techno...,"Apple Inc. designs, manufactures, and sells a ...",https://www.apple.com/?ref=levels.fyi&utm_sour...,https://twitter.com/Apple,https://www.linkedin.com/company/apple,1976.0,1976,147000.0,...,IPO,$6250.9M,https://simplify.jobs/c/Apple,What believers are saying Apple's AR glasses d...,Apple's AR glasses development could capture m...,Developing AR glasses may lead to high R&D cos...,Apple's integrated hardware and software creat...,Health Insurance\nDental Insurance\n401(k) Ret...,"VR & AR, Consumer Software, Fintech, Consumer ...",59


In [112]:
# Free-text columns of the companies table that are scanned for red flags.
columns_to_check = [
    "description",
    "overview",
    "believer_points",
    "critic_points",
    "what_makes_unique",
]

In [118]:
# Names of the result columns that hold the phrase -> severity dicts, one per
# scanned text column; these are the columns summed into the overall score.
flag_cols = [
    f"{source}_red_flags_with_weights"
    for source in (
        "description",
        "overview",
        "believer_points",
        "critic_points",
        "what_makes_unique",
    )
]

In [121]:
def _coerce_score(value):
    """Return ``value`` as a number, or 0 when it cannot be read as one.

    Real numbers pass through unchanged. Numeric strings such as "4" or "2.5"
    are converted; anything else (None, lists, words, "nan", "inf") counts as 0.
    """
    if isinstance(value, (int, float)):
        return value
    if isinstance(value, str):
        try:
            number = float(value)
        except ValueError:
            return 0
        # Reject "nan"/"inf" so one odd reply cannot poison the whole total.
        if number != number or number in (float("inf"), float("-inf")):
            return 0
        return int(number) if number.is_integer() else number
    return 0


def compute_red_flag_score(row):
    """Sum every severity score found in the row's red-flag columns.

    Parameters
    ----------
    row : mapping-like with ``.get`` (e.g. a DataFrame row)
        Expected to hold, under each name in the module-level ``flag_cols``,
        a dict of red-flag phrase -> severity score.

    Returns
    -------
    int or float
        Total of all usable scores across ``flag_cols``; 0 when there are none.
        Columns that are missing or do not hold a dict are ignored.

    Notes
    -----
    The scores come from model output, so a value may arrive as a string or
    null. Such values are coerced (or counted as 0) instead of raising a
    TypeError that would abort the whole ``DataFrame.apply``.
    """
    total = 0
    for col in flag_cols:
        flags = row.get(col, {})
        if not isinstance(flags, dict):
            continue
        total += sum(_coerce_score(score) for score in flags.values())
    return total

In [122]:
# Run red-flag detection over each text column and store the resulting dicts in
# a sibling "<column>_red_flags_with_weights" column of the same frame.
for text_col in columns_to_check:
    df_companies[text_col + "_red_flags_with_weights"] = df_companies[text_col].apply(
        detect_red_flags_and_weights
    )

In [123]:
# Row-wise apply: collapse each company's per-column flag dicts into one total.
df_companies["red_flag_score"] = df_companies.apply(compute_red_flag_score, axis="columns")

In [117]:
# Lift the column-width limit so the flag dicts are displayed in full, then
# preview the description flags for the first five companies.
pd.set_option("display.max_colwidth", None)
df_companies.loc[:, ["company_name", "description_red_flags_with_weights"]].head()

Unnamed: 0,company_name,description_red_flags_with_weights
0,1Password,"{'password manager': 1, 'secure digital wallet': 1}"
1,7shifts,"{'labor compliance easy': 3, 'Get started for free': 2}"
2,Acadian Asset Management,{}
3,Appian,{'lack of specificity': 2}
4,Apple,"{'American multinational technology company': 1, 'specializes in consumer electronics, computer software and online services': 2}"


In [125]:
# Columns kept in the final report, in display order.
# NOTE(review): the four "*_red_flags" columns (without "_with_weights") are not
# created by any cell visible in this notebook - confirm they still exist after
# a fresh Restart & Run All, otherwise selecting them will raise a KeyError.
cols_to_show = (
    # company identifiers
    ["normalized_slug", "company_name"]
    # plain red-flag lists
    + [
        "overview_red_flags",
        "believer_points_red_flags",
        "critic_points_red_flags",
        "what_makes_unique_red_flags",
    ]
    # phrase -> severity dicts
    + [
        "description_red_flags_with_weights",
        "overview_red_flags_with_weights",
        "believer_points_red_flags_with_weights",
        "critic_points_red_flags_with_weights",
        "what_makes_unique_red_flags_with_weights",
    ]
    # overall total
    + ["red_flag_score"]
)

In [127]:
# Rank companies from the highest to the lowest total red-flag score, keep only
# the report columns, and preview the five highest-scoring rows.
top_red_flags = df_companies.sort_values(by="red_flag_score", ascending=False).loc[:, cols_to_show]

top_red_flags.head()

Unnamed: 0,normalized_slug,company_name,overview_red_flags,believer_points_red_flags,critic_points_red_flags,what_makes_unique_red_flags,description_red_flags_with_weights,overview_red_flags_with_weights,believer_points_red_flags_with_weights,critic_points_red_flags_with_weights,what_makes_unique_red_flags_with_weights,red_flag_score
0,1password,1Password,[],[],"[prici, g pressure, security vul, erabilities, data breaches]",[],"{'password manager': 1, 'remembers all your passwords for you': 3, 'keep account information safe': 2}","{'subscription model': 2, 'seamless employee provisioning': 3, 'enhances security without disrupting productivity': 4}","{'acquisition of Trelica': 3, 'regulatory focus on data privacy': 4}","{'lower prices': 3, 'security vulnerabilities': 4, 'data breaches': 5}",{'subscription model': 1},35
4,apple,Apple,[],[],"[R&D costs, high risk i, vestme, t, geopolitical risks, supply chai, challe, ges]",[],{'American multinational technology company': 1},"{'trade-in program': 3, 'device upgrades': 2}","{'development could capture market share': 3, 'sustainability goals': 1}","{'high R&D costs without guaranteed success': 4, 'geopolitical risks': 5, 'supply chain challenges': 3}","{'enhances customer loyalty': 2, 'incentivizes device upgrades': 3}",27
2,acadianassetmanagement,Acadian Asset Management,[],[],"[Competitio, from Two Sigma, Regulatory scruti, y, Market volatility, Geopolitical te, sio, s.]",[],{},"{'focuses on equities and various asset classes in both developed and emerging markets': 2, 'revenue through performance fees linked to investment success': 3, 'emphasizes corporate social responsibility': 1}",{'Emerging markets expansion': 3},"{'Competition from firms like Two Sigma may erode Acadian's market share': 3, 'Regulatory scrutiny on quantitative trading could impact Acadian's operations': 4, 'Market volatility from geopolitical tensions may affect Acadian's performance': 2}","{'proprietary database': 3, 'sophisticated analytical models': 4}",25
5,applovin,AppLovin,[],[],"[lawsuits, securities fraud, layoffs, leadership cha, ges, i, ter, al restructuri, g]",[],{'grow the app ecosystem': 2},{},"{'privacy-compliant advertising': 4, 'transparent ad models': 3}","{'allegedly violating app store rules': 4, 'class action lawsuits alleging securities fraud': 5, 'layoffs and leadership changes': 3}","{'user acquisition': 2, 'ad monetization': 2}",25
3,appian,Appian,[],[],"[legal ruli, gs, fi, a, cial stability, strategic shifts, clie, t trust, public sector i, efficie, cies]",[],{'None': 1},{'H1B Sponsorship': 5},"{'Growing demand for low-code platforms': 1, 'Partnerships in specialized industries': 1, 'Favorable legal outcomes': 1}","{'Unfavorable legal rulings': 4, 'New CFO': 2, 'UK public sector inefficiencies': 3}",{'subscription-based model': 4},22


In [128]:
# Write the ranked table next to the notebook; the row index is left out.
top_red_flags.to_csv(path_or_buf="top_red_flags.csv", index=False)